From db146d7ec89a926fe5772bf8514f29b18eb35523 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 23 Jun 2025 14:59:05 +0200 Subject: [PATCH 1/3] fix --- tests/models/rag/test_modeling_rag.py | 85 +++++++++++++++++++++------ 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index 0189129e8c37..ee274012a661 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -16,6 +16,7 @@ import json import os import shutil +import subprocess import tempfile import unittest from unittest.mock import patch @@ -49,7 +50,7 @@ if is_torch_available() and is_datasets_available() and is_faiss_available(): import faiss import torch - from datasets import Dataset + from datasets import Dataset, load_dataset from transformers import ( AutoConfig, @@ -679,6 +680,27 @@ def config_and_inputs(self): @require_tokenizers @require_torch_non_multi_accelerator class RagModelIntegrationTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.temp_dir = tempfile.TemporaryDirectory() + cls.dataset_path = cls.temp_dir.name + cls.index_path = os.path.join(cls.temp_dir.name, "index") + + ds = load_dataset("hf-internal-testing/wiki_dpr_dummy")["train"] + ds.save_to_disk(cls.dataset_path) + subprocess.run( + [ + "wget", + "-O", + f"{cls.index_path}", + "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index", + ] + ) + + @classmethod + def tearDownClass(cls): + cls.temp_dir.cleanup() + def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch @@ -722,8 +744,9 @@ def get_rag_config(self): max_combined_length=300, dataset="wiki_dpr", dataset_split="train", - index_name="exact", - index_path=None, + index_name="custom", + passages_path=self.dataset_path, + index_path=self.index_path, use_dummy_dataset=True, retrieval_vector_size=768, retrieval_batch_size=8, @@ -841,8 +864,8 @@ def 
test_rag_token_generate_beam(self): output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True) # Expected outputs as given by model at integration time. - EXPECTED_OUTPUT_TEXT_1 = "\"She's My Kind of Girl" - EXPECTED_OUTPUT_TEXT_2 = "\"She's My Kind of Love" + EXPECTED_OUTPUT_TEXT_1 = '"She\'s My Kind of Girl" was released through Epic Records in Japan in March 1972. The song was a Top 10 hit in the country. It was the first single to be released by ABBA in the UK. The single was followed by "En Carousel" and "Love Has Its Uses"' + EXPECTED_OUTPUT_TEXT_2 = '"She\'s My Kind of Girl" was released through Epic Records in Japan in March 1972. The song was a Top 10 hit in the country. It was the first single to be released by ABBA in the UK. The single was followed by "En Carousel" and "Love Has Its Ways"' self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1) self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2) @@ -903,7 +926,10 @@ def test_data_questions(self): def test_rag_sequence_generate_batch(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" + "facebook/rag-sequence-nq", + index_name="custom", + passages_path=self.dataset_path, + index_path=self.index_path, ) rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to( torch_device @@ -926,12 +952,13 @@ def test_rag_sequence_generate_batch(self): outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + # PR #31938 caused the output to change from `june 22, 2018` to `june 22 , 2018`.
EXPECTED_OUTPUTS = [ " albert einstein", - " june 22, 2018", + " june 22 , 2018", " amplitude modulation", " tim besley ( chairman )", - " june 20, 2018", + " june 20 , 2018", " 1980", " 7.0", " 8", @@ -943,9 +970,9 @@ def test_rag_sequence_generate_batch_from_context_input_ids(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") retriever = RagRetriever.from_pretrained( "facebook/rag-sequence-nq", - index_name="exact", - use_dummy_dataset=True, - dataset_revision="b24a417", + index_name="custom", + passages_path=self.dataset_path, + index_path=self.index_path, ) rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to( torch_device @@ -981,10 +1008,10 @@ def test_rag_sequence_generate_batch_from_context_input_ids(self): EXPECTED_OUTPUTS = [ " albert einstein", - " june 22, 2018", + " june 22 , 2018", " amplitude modulation", " tim besley ( chairman )", - " june 20, 2018", + " june 20 , 2018", " 1980", " 7.0", " 8", @@ -995,7 +1022,7 @@ def test_rag_sequence_generate_batch_from_context_input_ids(self): def test_rag_token_generate_batch(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") retriever = RagRetriever.from_pretrained( - "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" + "facebook/rag-token-nq", index_name="custom", passages_path=self.dataset_path, index_path=self.index_path ) rag_token = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to( torch_device @@ -1023,10 +1050,10 @@ def test_rag_token_generate_batch(self): EXPECTED_OUTPUTS = [ " albert einstein", - " september 22, 2017", + " september 22 , 2017", " amplitude modulation", " stefan persson", - " april 20, 2018", + " april 20 , 2018", " the 1970s", " 7.1. 
2", " 13", @@ -1037,6 +1064,27 @@ def test_rag_token_generate_batch(self): @require_torch @require_retrieval class RagModelSaveLoadTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.temp_dir = tempfile.TemporaryDirectory() + cls.dataset_path = cls.temp_dir.name + cls.index_path = os.path.join(cls.temp_dir.name, "index") + + ds = load_dataset("hf-internal-testing/wiki_dpr_dummy")["train"] + ds.save_to_disk(cls.dataset_path) + subprocess.run( + [ + "wget", + "-O", + f"{cls.index_path}", + "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index", + ] + ) + + @classmethod + def tearDownClass(cls): + cls.temp_dir.cleanup() + def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch @@ -1060,8 +1108,9 @@ def get_rag_config(self): max_combined_length=300, dataset="wiki_dpr", dataset_split="train", - index_name="exact", - index_path=None, + index_name="custom", + passages_path=self.dataset_path, + index_path=self.index_path, use_dummy_dataset=True, retrieval_vector_size=768, retrieval_batch_size=8, From d4e1f370953e899a05df3bfe93374fa3216c2a07 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 23 Jun 2025 17:01:33 +0200 Subject: [PATCH 2/3] fix --- tests/models/rag/test_modeling_rag.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index ee274012a661..27c70c87cd0c 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -684,7 +684,7 @@ class RagModelIntegrationTests(unittest.TestCase): def setUpClass(cls): cls.temp_dir = tempfile.TemporaryDirectory() cls.dataset_path = cls.temp_dir.name - cls.index_path = os.path.join(cls.temp_dir.name, "index") + cls.index_path = os.path.join(cls.temp_dir.name, "index.faiss") ds = load_dataset("hf-internal-testing/wiki_dpr_dummy")["train"] ds.save_to_disk(cls.dataset_path) @@ -693,7 +693,7 @@ def 
setUpClass(cls): "wget", "-O", f"{cls.index_path}", - "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index", + "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index.faiss", ] ) @@ -1068,7 +1068,7 @@ class RagModelSaveLoadTests(unittest.TestCase): def setUpClass(cls): cls.temp_dir = tempfile.TemporaryDirectory() cls.dataset_path = cls.temp_dir.name - cls.index_path = os.path.join(cls.temp_dir.name, "index") + cls.index_path = os.path.join(cls.temp_dir.name, "index.faiss") ds = load_dataset("hf-internal-testing/wiki_dpr_dummy")["train"] ds.save_to_disk(cls.dataset_path) @@ -1077,7 +1077,7 @@ def setUpClass(cls): "wget", "-O", f"{cls.index_path}", - "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index", + "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index.faiss", ] ) From 59756fb1333303cd4531c5682f1a0d8886c0f0c2 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 23 Jun 2025 17:26:10 +0200 Subject: [PATCH 3/3] fix --- tests/models/rag/test_modeling_rag.py | 28 +++++++++++---------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index 27c70c87cd0c..4163cf793d55 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -16,12 +16,12 @@ import json import os import shutil -import subprocess import tempfile import unittest from unittest.mock import patch import numpy as np +import requests from transformers import BartTokenizer, T5Tokenizer from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES @@ -688,14 +688,11 @@ def setUpClass(cls): ds = load_dataset("hf-internal-testing/wiki_dpr_dummy")["train"] ds.save_to_disk(cls.dataset_path) - subprocess.run( - [ - "wget", - "-O", - f"{cls.index_path}", - 
"https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index.faiss", - ] - ) + + url = "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index.faiss" + response = requests.get(url, stream=True) + with open(cls.index_path, "wb") as fp: + fp.write(response.content) @classmethod def tearDownClass(cls): @@ -1072,14 +1069,11 @@ def setUpClass(cls): ds = load_dataset("hf-internal-testing/wiki_dpr_dummy")["train"] ds.save_to_disk(cls.dataset_path) - subprocess.run( - [ - "wget", - "-O", - f"{cls.index_path}", - "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index.faiss", - ] - ) + + url = "https://huggingface.co/datasets/hf-internal-testing/wiki_dpr_dummy/resolve/main/index.faiss" + response = requests.get(url, stream=True) + with open(cls.index_path, "wb") as fp: + fp.write(response.content) @classmethod def tearDownClass(cls):