From 75d05e0294e171b7e46a7a9774da304be1f12ec9 Mon Sep 17 00:00:00 2001
From: AshAnand34
Date: Wed, 4 Jun 2025 14:56:48 -0700
Subject: [PATCH 1/3] Update XLM-RoBERTa model documentation with enhanced usage examples and improved layout

---
 docs/source/en/model_doc/xlm-roberta.md | 108 +++++++++++++++++-------
 1 file changed, 79 insertions(+), 29 deletions(-)

diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md
index 2bc890257a69..30258da1c4ce 100644
--- a/docs/source/en/model_doc/xlm-roberta.md
+++ b/docs/source/en/model_doc/xlm-roberta.md
@@ -16,43 +16,93 @@ rendered properly in your Markdown viewer.
 
 # XLM-RoBERTa
 
-
-PyTorch
-TensorFlow
-Flax
-SDPA
+
+
+    PyTorch
+    TensorFlow
+    Flax
+    SDPA
+
-## Overview
+[XLM-RoBERTa](https://arxiv.org/abs/1911.02116) is a large multilingual language model trained on 2.5TB of filtered CommonCrawl data across 100 languages. What makes XLM-RoBERTa unique is its ability to achieve strong performance on both high and low-resource languages without sacrificing per-language performance, outperforming previous multilingual models like mBERT by significant margins.
 
-The XLM-RoBERTa model was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume
-Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's
-RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl
-data.
+You can find all the original XLM-RoBERTa checkpoints under the [XLM-RoBERTa](https://huggingface.co/models?search=xlm-roberta) collection.
 
-The abstract from the paper is the following:
+> [!TIP]
+> Click on the XLM-RoBERTa models in the right sidebar for more examples of how to apply XLM-RoBERTa to different cross-lingual tasks like classification, translation, and question answering.
 
-*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a
-wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
-languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
-outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on
-XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
-low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We
-also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
-trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
-languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
-per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
-will make XLM-R code, data, and models publicly available.*
+The example below demonstrates how to use XLM-RoBERTa for masked language modeling with [`Pipeline`] or the [`AutoModel`] class.
 
-This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
+
+
 
-## Usage tips
+```python
+from transformers import pipeline
 
-- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
-  not require `lang` tensors to understand which language is used, and should be able to determine the correct
-  language from the input ids.
-- Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses masked language modeling on sentences coming from one language.
+# Initialize the pipeline with XLM-RoBERTa
+unmasker = pipeline("fill-mask", model="xlm-roberta-base")
+
+# Example in English
+result = unmasker("Hello, I'm a [MASK] model.")
+print(result)
+
+# Example in French
+result = unmasker("Bonjour, je suis un modèle [MASK].")
+print(result)
+```
+
+
+
+```python
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+import torch
+
+# Load model and tokenizer
+model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
+tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+
+# Prepare input
+text = "Hello, I'm a [MASK] model."
+inputs = tokenizer(text, return_tensors="pt")
+
+# Get prediction
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits.argmax(dim=-1)
+
+# Decode prediction
+predicted_token = tokenizer.decode(predictions[0][inputs["input_ids"][0] == tokenizer.mask_token_id])
+print(f"Predicted token: {predicted_token}")
+```
+
+
+
+## Model Details
+
+XLM-RoBERTa was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. The model achieves state-of-the-art results on several cross-lingual benchmarks:
+
+- +13.8% average accuracy on XNLI
+- +12.3% average F1 score on MLQA
+- +2.1% average F1 score on NER
+- Particularly strong on low-resource languages (e.g., +11.8% for Swahili, +9.2% for Urdu)
+
+### Key Features
+
+- Trained on 100 languages using 2.5TB of filtered CommonCrawl data
+- Based on RoBERTa architecture with multilingual adaptations
+- No language token required - automatically detects language from input
+- Uses only masked language modeling (no translation objective)
+- Competitive with strong monolingual models on GLUE and XNLI
+
+## Usage Tips
+
+- Unlike some XLM models, XLM-RoBERTa doesn't require `lang` tensors - it automatically determines the language from input
+- Uses RoBERTa's training approach but adapted for multilingual data
+- Particularly effective for low-resource languages
+- Supports all standard RoBERTa tasks (classification, token classification, QA, etc.)
 
 ## Resources

From 10cd505f3eb585429b36b0f125a3c148c8bac2de Mon Sep 17 00:00:00 2001
From: AshAnand34
Date: Thu, 5 Jun 2025 16:56:19 -0700
Subject: [PATCH 2/3] Added CLI command example and quantization example for XLM RoBERTa model card.

---
 docs/source/en/model_doc/xlm-roberta.md | 105 +++++++++++++++---------
 1 file changed, 67 insertions(+), 38 deletions(-)

diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md
index 30258da1c4ce..4040e5c89d0c 100644
--- a/docs/source/en/model_doc/xlm-roberta.md
+++ b/docs/source/en/model_doc/xlm-roberta.md
@@ -14,8 +14,6 @@ rendered properly in your Markdown viewer.
 -->
 
-# XLM-RoBERTa
-
PyTorch @@ -25,31 +23,33 @@ rendered properly in your Markdown viewer.
-[XLM-RoBERTa](https://arxiv.org/abs/1911.02116) is a large multilingual language model trained on 2.5TB of filtered CommonCrawl data across 100 languages. What makes XLM-RoBERTa unique is its ability to achieve strong performance on both high and low-resource languages without sacrificing per-language performance, outperforming previous multilingual models like mBERT by significant margins.
+# XLM-RoBERTa
+
+[XLM-RoBERTa](https://huggingface.co/papers/1911.02116) is a large multilingual masked language model trained on 2.5TB of filtered CommonCrawl data across 100 languages. It shows that scaling the model provides strong performance gains on high-resource and low-resource languages. The model uses the [RoBERTa](./roberta) pretraining objectives on the [XLM](./xlm) model.
 
-You can find all the original XLM-RoBERTa checkpoints under the [XLM-RoBERTa](https://huggingface.co/models?search=xlm-roberta) collection.
+You can find all the original XLM-RoBERTa checkpoints under the [Facebook AI community](https://huggingface.co/FacebookAI) organization.
 
 > [!TIP]
 > Click on the XLM-RoBERTa models in the right sidebar for more examples of how to apply XLM-RoBERTa to different cross-lingual tasks like classification, translation, and question answering.
 
-The example below demonstrates how to use XLM-RoBERTa for masked language modeling with [`Pipeline`] or the [`AutoModel`] class.
+The example below demonstrates how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.
 
 
 
 ```python
+import torch
 from transformers import pipeline
 
 # Initialize the pipeline with XLM-RoBERTa
-unmasker = pipeline("fill-mask", model="xlm-roberta-base")
-
-# Example in English
-result = unmasker("Hello, I'm a [MASK] model.")
-print(result)
-
+pipeline = pipeline(
+    task="fill-mask",
+    model="facebook/xlm-roberta-base",
+    torch_dtype=torch.float16,
+    device=0
+)
 # Example in French
-result = unmasker("Bonjour, je suis un modèle [MASK].")
-print(result)
+pipeline("Bonjour, je suis un modèle [MASK].")
 ```
 
@@ -60,46 +60,75 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 import torch
 
 # Load model and tokenizer
-model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
-tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+tokenizer = AutoTokenizer.from_pretrained(
+    "facebook/xlm-roberta-base"
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "facebook/xlm-roberta-base",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
 
 # Prepare input
-text = "Hello, I'm a [MASK] model."
-inputs = tokenizer(text, return_tensors="pt")
+inputs = tokenizer("Bonjour, je suis un modèle [MASK].", return_tensors="pt").to("cuda")
 
-# Get prediction
 with torch.no_grad():
     outputs = model(**inputs)
-    predictions = outputs.logits.argmax(dim=-1)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
 
-# Decode prediction
-predicted_token = tokenizer.decode(predictions[0][inputs["input_ids"][0] == tokenizer.mask_token_id])
-print(f"Predicted token: {predicted_token}")
+print(f"The predicted token is: {predicted_token}")
 ```
 
+
+
+```bash
+transformers-cli run fill-mask \
+    --model facebook/xlm-roberta-base \
+    --text "Bonjour, je suis un modèle [MASK]." \
+    --device 0
+# Output: The predicted token is: modèle
+```
+
+
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [quantization guide](../quantization) overview for more available quantization backends.
+
+The example below uses bitsandbytes to quantize the weights to 4-bits.
+
+```python
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer, BitsAndBytesConfig
+# Load model and tokenizer with bitsandbytes quantization
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4", # or "fp4" for float 4-bit quantization
+    bnb_4bit_use_double_quant=True, # use double quantization for better performance
+)
+tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-base")
+model = AutoModelForMaskedLM.from_pretrained(
+    "facebook/xlm-roberta-base",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="flash_attention_2",
+    quantization_config=quantization_config
+)
+
+inputs = tokenizer("Bonjour, je suis un modèle [MASK].", return_tensors="pt").to("cuda")
+
+# Fill-mask models predict the masked position from the logits; they do not support generate()
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+predicted_token = tokenizer.decode(logits[0, masked_index].argmax(dim=-1))
+print(f"The predicted token is: {predicted_token}")
+```
 
-## Usage Tips
+## Notes
 
-- Unlike some XLM models, XLM-RoBERTa doesn't require `lang` tensors - it automatically determines the language from input
+- Unlike some XLM models, XLM-RoBERTa doesn't require `lang` tensors to understand what language is being used. It automatically determines the language from the input IDs
 - Uses RoBERTa's training approach but adapted for multilingual data
 - Particularly effective for low-resource languages
 - Supports all standard RoBERTa tasks (classification, token classification, QA, etc.)
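Editorial aside, not part of the patch series: the comments in the 4-bit example above mention `"nf4"` versus `"fp4"` and double quantization. A minimal sketch of how the memory savings of those options could be compared, assuming `bitsandbytes` and a CUDA GPU are available; the checkpoint name simply follows the example above:

```python
import torch
from transformers import AutoModelForMaskedLM, BitsAndBytesConfig

def footprint_mb(quantization_config=None):
    # Load the checkpoint with the given quantization settings and report its size in MB
    model = AutoModelForMaskedLM.from_pretrained(
        "facebook/xlm-roberta-base",
        torch_dtype=torch.float16,
        device_map="auto",
        quantization_config=quantization_config,
    )
    return model.get_memory_footprint() / 1024**2

nf4 = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
fp4 = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="fp4")

print(f"fp16 baseline: {footprint_mb():.0f} MB")
print(f"4-bit nf4 + double quantization: {footprint_mb(nf4):.0f} MB")
print(f"4-bit fp4: {footprint_mb(fp4):.0f} MB")
```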
From 4590094c0ff6926845505274697ad5ed31c86808 Mon Sep 17 00:00:00 2001
From: AshAnand34
Date: Fri, 6 Jun 2025 15:45:46 -0700
Subject: [PATCH 3/3] Minor change to transformers CLI and quantization example for XLM roberta model card

---
 docs/source/en/model_doc/xlm-roberta.md | 35 +++++++++----------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md
index 4040e5c89d0c..80465da245ef 100644
--- a/docs/source/en/model_doc/xlm-roberta.md
+++ b/docs/source/en/model_doc/xlm-roberta.md
@@ -32,7 +32,7 @@ You can find all the original XLM-RoBERTa checkpoints under the [Facebook AI com
 > [!TIP]
 > Click on the XLM-RoBERTa models in the right sidebar for more examples of how to apply XLM-RoBERTa to different cross-lingual tasks like classification, translation, and question answering.
 
-The example below demonstrates how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.
+The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
 
@@ -41,16 +41,14 @@ The example below demonstrates how to predict the `<mask>` token with [`Pipeline`],
 import torch
 from transformers import pipeline
 
-# Initialize the pipeline with XLM-RoBERTa
 pipeline = pipeline(
     task="fill-mask",
-    model="facebook/xlm-roberta-base",
+    model="FacebookAI/xlm-roberta-base",
     torch_dtype=torch.float16,
     device=0
 )
 # Example in French
-pipeline("Bonjour, je suis un modèle [MASK].")
+pipeline("Bonjour, je suis un modèle <mask>.")
 ```
 
@@ -59,19 +57,18 @@ pipeline("Bonjour, je suis un modèle <mask>.")
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 import torch
 
-# Load model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained(
-    "facebook/xlm-roberta-base"
+    "FacebookAI/xlm-roberta-base"
 )
 model = AutoModelForMaskedLM.from_pretrained(
-    "facebook/xlm-roberta-base",
+    "FacebookAI/xlm-roberta-base",
     torch_dtype=torch.float16,
     device_map="auto",
     attn_implementation="sdpa"
 )
 
 # Prepare input
-inputs = tokenizer("Bonjour, je suis un modèle [MASK].", return_tensors="pt").to("cuda")
+inputs = tokenizer("Bonjour, je suis un modèle <mask>.", return_tensors="pt").to("cuda")
 
 with torch.no_grad():
     outputs = model(**inputs)
@@ -85,26 +82,21 @@ print(f"The predicted token is: {predicted_token}")
 ```
 
-
+
 
 ```bash
-transformers-cli run fill-mask \
-    --model facebook/xlm-roberta-base \
-    --text "Bonjour, je suis un modèle [MASK]." \
-    --device 0
-# Output: The predicted token is: modèle
+echo -e "Plants create <mask> through a process known as photosynthesis." | transformers-cli run --task fill-mask --model FacebookAI/xlm-roberta-base --device 0
 ```
 
 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [quantization guide](../quantization) overview for more available quantization backends.
-The example below uses bitsandbytes to quantize the weights to 4-bits.
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits.
 
 ```python
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer, BitsAndBytesConfig
-# Load model and tokenizer with bitsandbytes quantization
 
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16,
@@ -112,16 +104,16 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4", # or "fp4" for float 4-bit quantization
     bnb_4bit_use_double_quant=True, # use double quantization for better performance
 )
-tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-base")
+tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-large")
 model = AutoModelForMaskedLM.from_pretrained(
-    "facebook/xlm-roberta-base",
+    "facebook/xlm-roberta-large",
     torch_dtype=torch.float16,
     device_map="auto",
     attn_implementation="flash_attention_2",
     quantization_config=quantization_config
 )
 
-inputs = tokenizer("Bonjour, je suis un modèle [MASK].", return_tensors="pt").to("cuda")
+inputs = tokenizer("Bonjour, je suis un modèle <mask>.", return_tensors="pt").to("cuda")
 
 # Fill-mask models predict the masked position from the logits; they do not support generate()
 with torch.no_grad():
     logits = model(**inputs).logits
 
 masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
 predicted_token = tokenizer.decode(logits[0, masked_index].argmax(dim=-1))
 print(f"The predicted token is: {predicted_token}")
 ```
 
 ## Notes
 
 - Unlike some XLM models, XLM-RoBERTa doesn't require `lang` tensors to understand what language is being used. It automatically determines the language from the input IDs
-- Uses RoBERTa's training approach but adapted for multilingual data
-- Particularly effective for low-resource languages
-- Supports all standard RoBERTa tasks (classification, token classification, QA, etc.)
 
 ## Resources
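Editorial addition, not part of the patches: the CLI example above has a direct [`Pipeline`] equivalent, which may be the quickest way to try the final model card's fill-mask examples. The sentences and `top_k=3` are illustrative choices:

```python
from transformers import pipeline

# Python equivalent of the CLI example above; returns the top candidates for the <mask> position
fill_mask = pipeline(task="fill-mask", model="FacebookAI/xlm-roberta-base")

for text in [
    "Plants create <mask> through a process known as photosynthesis.",
    "Bonjour, je suis un modèle <mask>.",
]:
    print(text)
    for candidate in fill_mask(text, top_k=3):
        print(f"  {candidate['token_str']!r} (score={candidate['score']:.3f})")
```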