From 640270857ff90c45978201773df9a0965ce36092 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Tue, 30 Jan 2024 14:26:57 +0000 Subject: [PATCH 01/12] Add return_set_n_documents option for HF datasets --- rib/data.py | 18 +++++ rib/loader.py | 64 ++++++++++----- rib_scripts/rib_build/edges_pythia-14m.yaml | 6 +- tests/test_build_graph.py | 26 +++--- tests/test_data.py | 24 ++++++ tests/test_float_precision.py | 8 +- tests/test_loader.py | 90 ++++++++++++++++++++- tests/utils.py | 5 +- 8 files changed, 198 insertions(+), 43 deletions(-) create mode 100644 tests/test_data.py diff --git a/rib/data.py b/rib/data.py index 4085ac20..366f6f46 100644 --- a/rib/data.py +++ b/rib/data.py @@ -76,11 +76,29 @@ class HFDatasetConfig(DatasetConfig): return_set_portion: Literal["first", "last"] = Field( "first", description="Whether to load the first or last portion of the return_set." ) + return_set_n_documents: Optional[int] = Field( + None, + description="The number of documents to load from the dataset. If None and " + "return_set_n_samples is not None, will load all documents before sampling. If None and " + "return_set_n_samples is None, will load all possible samples in return_set_frac.", + ) n_ctx: Optional[int] = Field( None, description="Dataset will be packed to sequences of this length. Should be <1024 for gpt2." "<2048 for most other models.", ) + seed: Optional[int] = Field(0, description="The random seed value for reproducibility.") + + @model_validator(mode="after") + def verify_return_n_documents(self) -> "HFDatasetConfig": + """Verify return_set_n_documents and related fields are set correctly.""" + if self.return_set_n_documents is not None: + if self.return_set_n_samples is None or self.return_set_frac is not None: + raise ValueError( + "If return_set_n_documents is not None, then return_set_n_samples must be " + "not None and return_set_frac must be None." 
+ ) + return self class ModularArithmeticDatasetConfig(DatasetConfig): diff --git a/rib/loader.py b/rib/loader.py index 4701efe1..c36c7954 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -218,6 +218,8 @@ def tokenize_dataset( dataset: Dataset, tokenizer: AutoTokenizer, n_ctx: int, + n_samples: Optional[int] = None, + seed: Optional[int] = None, ) -> TensorDataset: """Tokenize a dataset using the provided tokenizer. @@ -229,35 +231,44 @@ def tokenize_dataset( dataset to tokenize. Created from `hf_load_dataset`. tokenizer (AutoTokenizer): The tokenizer to use. n_ctx (int): The context length to use. + n_samples (Optional[int]): The number of samples to use. If None, uses all samples. + seed (Optional[int]): The seed to use for sampling. Returns: TensorDataset: The tokenized dataset. """ - # Tokenize all samples and merge them together + # Tokenize all samples and merge them into one long list of tokens all_tokens = [] for example in dataset: # type: ignore tokens = tokenizer(example["text"])["input_ids"] - all_tokens.extend(tokens) + # Add the eos token to the end of each sample as was done in the original training + # https://github.com/EleutherAI/pythia/issues/123#issuecomment-1791136253 + all_tokens.extend(tokens + [tokenizer.eos_token_id]) # Split the merged tokens into chunks that fit the context length - chunks = [all_tokens[i : i + n_ctx] for i in range(0, len(all_tokens), n_ctx)] - - # Convert chunks to input_ids and labels - # we ignore the final chunk, as it contains a token we don't have a label for - # and is also probably too short and we don't want to pad. - input_ids_list = [] - labels_list = [] - for i, chunk in enumerate(chunks[:-1]): - input_id = chunk - label = input_id[1:] + [chunks[i + 1][0]] # with first token from next chunk + raw_chunks = [all_tokens[i : i + n_ctx] for i in range(0, len(all_tokens), n_ctx)] + + # Note that we ignore the final raw_chunk, as we get the label for the final token in a chunk + # from the subsequent chunk. 
+ if n_samples is not None: + # Randomly select n_samples chunks + generator = torch.Generator() if seed is None else torch.Generator().manual_seed(seed) + chunk_idxs = torch.randperm(len(raw_chunks) - 1, generator=generator)[:n_samples].tolist() + else: + chunk_idxs = list(range(len(raw_chunks) - 1)) - input_ids_list.append(input_id) - labels_list.append(label) + chunks = [raw_chunks[i] for i in chunk_idxs] - input_ids = torch.tensor(input_ids_list, dtype=torch.long) - labels = torch.tensor(labels_list, dtype=torch.long) + all_labels: list[list[int]] = [] + for i, chunk in enumerate(chunks): + # Get the label for the last token using the next chunk in raw_chunks + final_token_label = raw_chunks[chunk_idxs[i] + 1][0] + labels = chunk[1:] + [final_token_label] + all_labels.append(labels) - return TensorDataset(input_ids, labels) + return TensorDataset( + torch.tensor(chunks, dtype=torch.long), torch.tensor(all_labels, dtype=torch.long) + ) def create_hf_dataset( @@ -300,22 +311,33 @@ def create_hf_dataset( assert dataset_config.return_set in ["train", "test"], "Only train and test sets are supported" if dataset_config.return_set_frac: + # Sample from all documents in return_set_frac% of return_set_portion percent = int(dataset_config.return_set_frac * 100) if dataset_config.return_set_portion == "first": data_split = f"{dataset_config.return_set}[:{percent}%]" elif dataset_config.return_set_portion == "last": data_split = f"{dataset_config.return_set}[-{percent}%:]" - elif dataset_config.return_set_n_samples: + elif dataset_config.return_set_n_documents: + # Only load the first/last n documents from return_set and sample return_set_n_samples. 
if dataset_config.return_set_portion == "first": - data_split = f"{dataset_config.return_set}[:{dataset_config.return_set_n_samples}]" + data_split = f"{dataset_config.return_set}[:{dataset_config.return_set_n_documents}]" elif dataset_config.return_set_portion == "last": - data_split = f"{dataset_config.return_set}[-{dataset_config.return_set_n_samples}:]" + data_split = f"{dataset_config.return_set}[-{dataset_config.return_set_n_documents}:]" + else: + # Sample return_set_n_samples from all documents in return_set + data_split = dataset_config.return_set raw_dataset = hf_load_dataset(dataset_config.name, split=data_split) tokenizer = AutoTokenizer.from_pretrained(dataset_config.tokenizer_name) tokenizer.pad_token = tokenizer.eos_token - return tokenize_dataset(dataset=raw_dataset, tokenizer=tokenizer, n_ctx=n_ctx) + return tokenize_dataset( + dataset=raw_dataset, + tokenizer=tokenizer, + n_ctx=n_ctx, + n_samples=dataset_config.return_set_n_samples, + seed=dataset_config.seed, + ) def create_vision_dataset(dataset_config: VisionDatasetConfig) -> Dataset: diff --git a/rib_scripts/rib_build/edges_pythia-14m.yaml b/rib_scripts/rib_build/edges_pythia-14m.yaml index baf30d7e..8658d9ea 100644 --- a/rib_scripts/rib_build/edges_pythia-14m.yaml +++ b/rib_scripts/rib_build/edges_pythia-14m.yaml @@ -8,9 +8,11 @@ dataset: name: NeelNanda/pile-10k tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations - return_set_frac: null - return_set_n_samples: 20 + return_set_frac: 0.1 + return_set_n_documents: null + return_set_n_samples: null return_set_portion: first + n_ctx: 50 node_layers: - mlp_out.0 - ln2.3 diff --git a/tests/test_build_graph.py b/tests/test_build_graph.py index dd797df1..70c45a96 100644 --- a/tests/test_build_graph.py +++ b/tests/test_build_graph.py @@ -204,26 +204,24 @@ def test_modular_arithmetic_build_graph(basis_formula, edge_formula): @pytest.mark.slow def 
test_pythia_14m_build_graph(): atol = 0 # Works with 1e-7 for float32 and 0 for float64 - config = get_pythia_config() + config = get_pythia_config({"dataset": {"n_ctx": None}}) results = graph_build_test(config=config, atol=atol) get_rib_acts_test(results, atol=0) @pytest.mark.slow -def test_pythia_14m_build_graph_jacobian(): +def test_pythia_14m_build_graph_jacobian_stochastic(): atol = 0 # Works with 0 for batch_size 900 but not 1800 - updates = [ - # Runs in around 30s on a5000 - {"basis_formula": "jacobian"}, - {"dataset": {"return_set_n_samples": 1}}, - {"dataset": {"n_ctx": 2}}, - {"batch_size": 900}, - {"node_layers": ["ln2.1", "mlp_out.5", "unembed"]}, - {"calculate_edges": True}, - {"edge_formula": "stochastic"}, - {"n_stochastic_sources": 1}, - ] - config = get_pythia_config(*updates) + config = get_pythia_config( + { + "basis_formula": "jacobian", + "dataset": {"return_set_n_documents": 10, "return_set_n_samples": 1, "n_ctx": 2}, + "node_layers": ["ln2.1", "mlp_out.5", "unembed"], + "calculate_edges": True, + "edge_formula": "stochastic", + "n_stochastic_sources": 1, + } + ) results = graph_build_test(config=config, atol=atol) get_rib_acts_test(results, atol=0) diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 00000000..03e2d987 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,24 @@ +import pytest + +from rib.data import HFDatasetConfig +from rib.utils import replace_pydantic_model + + +def test_invalid_hf_dataset_config(): + """Test that invalid combinations of attributes in HFDatasetConfig are caught.""" + base_config = HFDatasetConfig( + dataset_type="huggingface", + name="test", + tokenizer_name="test", + return_set="train", + ) + + with pytest.raises(ValueError): + # return_set_frac and return_set_n_samples cannot be used together + replace_pydantic_model(base_config, {"return_set_frac": 0.5, "return_set_n_samples": 10}) + # return_set_n_documents and return_set_frac cannot be used + 
replace_pydantic_model(base_config, {"return_set_frac": 0.5, "return_set_n_documents": 10}) + # If return_set_n_documents is used, return_set_n_samples must be not None + replace_pydantic_model(base_config, {"return_set_n_documents": 10}) + # If return_set_n_documents is used, return_set_frac must be None + replace_pydantic_model(base_config, {"return_set_n_documents": 10, "return_set_frac": 0.5}) diff --git a/tests/test_float_precision.py b/tests/test_float_precision.py index 35ec136a..91efee93 100644 --- a/tests/test_float_precision.py +++ b/tests/test_float_precision.py @@ -32,7 +32,8 @@ def rib_results(self, temp_object) -> dict[str, RibBuildResults]: tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations return_set_frac: null - return_set_n_samples: 10 + return_set_n_documents: 30 + return_set_n_samples: 3 return_set_portion: first node_layers: - mlp_out.0 @@ -140,9 +141,10 @@ def ablation_results(self, temp_object, rib_results) -> dict: dataset_type: huggingface name: NeelNanda/pile-10k tokenizer_name: EleutherAI/pythia-14m - return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations + return_set: train return_set_frac: null - return_set_n_samples: 10 + return_set_n_documents: 30 + return_set_n_samples: 3 return_set_portion: first ablation_node_layers: - mlp_out.0 diff --git a/tests/test_loader.py b/tests/test_loader.py index ef7a09aa..1457cdca 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -2,8 +2,8 @@ import torch from torch.utils.data import Subset, TensorDataset -from rib.loader import load_sequential_transformer -from rib.utils import get_data_subset +from rib.loader import load_sequential_transformer, tokenize_dataset +from rib.utils import get_data_subset, set_seed @pytest.mark.parametrize( @@ -58,3 +58,89 @@ def test_load_transformer(model_str): device="cpu", fold_bias=True, ) + + +class 
MockTokenizer: + def __init__(self): + self.eos_token_id = 0 # Example EOS token ID + self.generator = torch.Generator().manual_seed(0) + + def __call__(self, text: str) -> dict[str, list[int]]: + # Generate between 5 to 10 token IDs as an example + num_tokens = torch.randint(5, 10, (1,), generator=self.generator).item() + token_ids = torch.randint(1, 100, (num_tokens,), generator=self.generator).tolist() + return {"input_ids": token_ids} + + +class TestTokenizeDataset: + @pytest.fixture(autouse=True) + def setup_class(self): + self.sample_texts = ["This is a test.", "Another test sentence."] + self.sample_dataset = [{"text": text} for text in self.sample_texts] + # Create a dummy tokenizer that spits out random tokens + set_seed(0) + self.tokenizer = MockTokenizer() + + def test_outputs_are_all_n_ctx_length(self): + n_ctx = 5 + tokenized_dataset = tokenize_dataset(self.sample_dataset, self.tokenizer, n_ctx) + for input_ids, labels in tokenized_dataset: + assert len(input_ids) == n_ctx + assert len(labels) == n_ctx + + def test_dataset_has_expected_size(self): + n_ctx = 5 + n_samples = 3 + tokenized_dataset = tokenize_dataset(self.sample_dataset, self.tokenizer, n_ctx, n_samples) + assert len(tokenized_dataset) == n_samples + + def test_seed_reproducibility(self): + n_ctx = 5 + n_samples = 2 + seed = 0 + dataset1 = tokenize_dataset(self.sample_dataset, self.tokenizer, n_ctx, n_samples, seed) + duplicate_tokenizer = MockTokenizer() + dataset2 = tokenize_dataset( + self.sample_dataset, duplicate_tokenizer, n_ctx, n_samples, seed + ) + assert torch.equal(dataset1.tensors[0], dataset2.tensors[0]) and torch.equal( + dataset1.tensors[1], dataset2.tensors[1] + ) + + def test_different_seeds(self): + n_ctx = 5 + n_samples = 2 + dataset1 = tokenize_dataset(self.sample_dataset, self.tokenizer, n_ctx, n_samples, 42) + duplicate_tokenizer = MockTokenizer() + dataset2 = tokenize_dataset(self.sample_dataset, duplicate_tokenizer, n_ctx, n_samples, 43) + assert not 
torch.equal(dataset1.tensors[0], dataset2.tensors[0]) or not torch.equal( + dataset1.tensors[1], dataset2.tensors[1] + ) + + def test_input_ids_equal_labels_no_sampling(self): + """If not sampling (i.e. n_samples is None), input_ids and labels differ by one token. + + Moreover, the final label of one chunk is the input_id of the first token in the next chunk. + So we can flatten the input_ids and labels and check that they are equal (offset by one). + """ + n_ctx = 5 + tokenized_dataset = tokenize_dataset(self.sample_dataset, self.tokenizer, n_ctx) + flattened_input_ids = [ + token_id for input_ids, _ in tokenized_dataset for token_id in input_ids + ] + flattened_labels = [token_id for _, labels in tokenized_dataset for token_id in labels] + assert len(flattened_input_ids) == len(flattened_labels) + assert flattened_input_ids[1:] == flattened_labels[:-1] + + def test_input_ids_equal_labels_sampling(self): + """Check that the labels match the input_ids except for the final token when sampling. + + When (randomly) sampling, the chunks will not be ordered, so we can't check that the final + token label is the input_id of the first token in the next chunk. 
+ """ + n_ctx = 5 + n_samples = 3 + tokenized_dataset = tokenize_dataset(self.sample_dataset, self.tokenizer, n_ctx, n_samples) + for input_ids, labels in tokenized_dataset: + assert len(input_ids) == len(labels) + assert torch.equal(input_ids[1:], labels[:-1]) diff --git a/tests/utils.py b/tests/utils.py index 8ba26007..f7aa437f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,8 +49,11 @@ def get_pythia_config(*updates: dict) -> RibBuildConfig: tokenizer_name: EleutherAI/pythia-14m return_set: train return_set_frac: null - return_set_n_samples: 10 # 10 samples gives 3x2048 tokens + return_set_n_documents: 20 + return_set_n_samples: 3 return_set_portion: first + n_ctx: 128 + seed: 0 node_layers: - ln2.1 - unembed From fc8ff148e196afd3a687366ab2fc3d38ced635e9 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Tue, 30 Jan 2024 14:37:37 +0000 Subject: [PATCH 02/12] Assert there are enough documents for n_samples --- rib/loader.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rib/loader.py b/rib/loader.py index c36c7954..26daaf34 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -253,7 +253,12 @@ def tokenize_dataset( if n_samples is not None: # Randomly select n_samples chunks generator = torch.Generator() if seed is None else torch.Generator().manual_seed(seed) - chunk_idxs = torch.randperm(len(raw_chunks) - 1, generator=generator)[:n_samples].tolist() + raw_chunk_idxs = torch.randperm(len(raw_chunks) - 1, generator=generator) + assert len(raw_chunk_idxs) >= n_samples, ( + f"Cannot sample {n_samples} chunks from dataset with {len(raw_chunks)} chunks of " + f"length {n_ctx}." 
+ ) + chunk_idxs = raw_chunk_idxs[:n_samples].tolist() else: chunk_idxs = list(range(len(raw_chunks) - 1)) From 305f43d3edf2d8361d6c0246561a7a957d33f011 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 16:14:42 +0000 Subject: [PATCH 03/12] Add assert that tokenizer.eos_token_id exists --- rib/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rib/loader.py b/rib/loader.py index 26daaf34..fe8b1f48 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -237,6 +237,7 @@ def tokenize_dataset( Returns: TensorDataset: The tokenized dataset. """ + assert tokenizer.eos_token_id is not None, "Tokenizer must have an eos token id" # Tokenize all samples and merge them into one long list of tokens all_tokens = [] for example in dataset: # type: ignore From e3a295ebdea23f4b49532642d52c66f58657461f Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 16:27:23 +0000 Subject: [PATCH 04/12] Remove 'return_set' from return_set_n_samples and return_set_n_documents --- rib/data.py | 28 +++++++++----------- rib/loader.py | 18 ++++++------- rib/rib_builder.py | 8 +++--- rib_scripts/ablations/orthog_pythia-14m.yaml | 2 +- rib_scripts/ablations/rib_pythia-14m.yaml | 2 +- rib_scripts/rib_build/Cs_pythia-14m.yaml | 2 +- rib_scripts/rib_build/edges_pythia-14m.yaml | 4 +-- rib_scripts/rib_build/tinystories.yaml | 2 +- tests/test_ablations.py | 16 +++++------ tests/test_build_graph.py | 8 +++--- tests/test_data.py | 16 +++++------ tests/test_float_precision.py | 8 +++--- tests/utils.py | 8 +++--- 13 files changed, 60 insertions(+), 62 deletions(-) diff --git a/rib/data.py b/rib/data.py index 366f6f46..f0fa78c7 100644 --- a/rib/data.py +++ b/rib/data.py @@ -30,9 +30,9 @@ class DatasetConfig(BaseModel): return_set_frac: Optional[float] = Field( None, description="The fraction of the returned dataset (train/test/all) to use. 
Cannot be" - "used with return_set_n_samples.", + "used with n_samples.", ) - return_set_n_samples: Optional[int] = Field( + n_samples: Optional[int] = Field( None, description="The number of raw samples to return from the dataset (train/test/all). " "Cannot be used with return_set_frac.", @@ -40,14 +40,12 @@ class DatasetConfig(BaseModel): @model_validator(mode="after") def verify_return_set_frac_and_n_samples(self) -> "DatasetConfig": - """Verify not both return_set_frac and return_set_n_samples are set and check values.""" + """Verify not both return_set_frac and n_samples are set and check values.""" frac = self.return_set_frac if frac is not None: - if self.return_set_n_samples is not None: - raise ValueError( - "Cannot have both return_set_frac and return_set_n_samples be non-None." - ) + if self.n_samples is not None: + raise ValueError("Cannot have both return_set_frac and n_samples be non-None.") if isinstance(self, HFDatasetConfig) and (frac < 0.01 or frac > 1): raise ValueError( f"return_set_frac must be > 0.01 and < 1 since huggingface dataset `split` " @@ -76,11 +74,11 @@ class HFDatasetConfig(DatasetConfig): return_set_portion: Literal["first", "last"] = Field( "first", description="Whether to load the first or last portion of the return_set." ) - return_set_n_documents: Optional[int] = Field( + n_documents: Optional[int] = Field( None, description="The number of documents to load from the dataset. If None and " - "return_set_n_samples is not None, will load all documents before sampling. If None and " - "return_set_n_samples is None, will load all possible samples in return_set_frac.", + "n_samples is not None, will load all documents before sampling. 
If None and " + "n_samples is None, will load all possible samples in return_set_frac.", ) n_ctx: Optional[int] = Field( None, @@ -91,11 +89,11 @@ class HFDatasetConfig(DatasetConfig): @model_validator(mode="after") def verify_return_n_documents(self) -> "HFDatasetConfig": - """Verify return_set_n_documents and related fields are set correctly.""" - if self.return_set_n_documents is not None: - if self.return_set_n_samples is None or self.return_set_frac is not None: + """Verify n_documents and related fields are set correctly.""" + if self.n_documents is not None: + if self.n_samples is None or self.return_set_frac is not None: raise ValueError( - "If return_set_n_documents is not None, then return_set_n_samples must be " + "If n_documents is not None, then n_samples must be " "not None and return_set_frac must be None." ) return self @@ -154,7 +152,7 @@ class VisionDatasetConfig(DatasetConfig): seed: Optional[int] = 0 return_set: Literal["train", "test"] = "train" return_set_frac: Optional[float] = None # Needed for some reason to avoid mypy errors - return_set_n_samples: Optional[int] = None # Needed for some reason to avoid mypy errors + n_samples: Optional[int] = None # Needed for some reason to avoid mypy errors class BlockVectorDatasetConfig(DatasetConfig): diff --git a/rib/loader.py b/rib/loader.py index fe8b1f48..14c17c17 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -208,7 +208,7 @@ def create_modular_arithmetic_dataset( dataset_subset = get_data_subset( dataset, frac=dataset_config.return_set_frac, - n_samples=dataset_config.return_set_n_samples, + n_samples=dataset_config.n_samples, seed=seed, ) return dataset_subset @@ -323,14 +323,14 @@ def create_hf_dataset( data_split = f"{dataset_config.return_set}[:{percent}%]" elif dataset_config.return_set_portion == "last": data_split = f"{dataset_config.return_set}[-{percent}%:]" - elif dataset_config.return_set_n_documents: - # Only load the first/last n documents from return_set and sample 
return_set_n_samples. + elif dataset_config.n_documents: + # Only load the first/last n documents from return_set and sample n_samples. if dataset_config.return_set_portion == "first": - data_split = f"{dataset_config.return_set}[:{dataset_config.return_set_n_documents}]" + data_split = f"{dataset_config.return_set}[:{dataset_config.n_documents}]" elif dataset_config.return_set_portion == "last": - data_split = f"{dataset_config.return_set}[-{dataset_config.return_set_n_documents}:]" + data_split = f"{dataset_config.return_set}[-{dataset_config.n_documents}:]" else: - # Sample return_set_n_samples from all documents in return_set + # Sample n_samples from all documents in return_set data_split = dataset_config.return_set raw_dataset = hf_load_dataset(dataset_config.name, split=data_split) @@ -341,7 +341,7 @@ def create_hf_dataset( dataset=raw_dataset, tokenizer=tokenizer, n_ctx=n_ctx, - n_samples=dataset_config.return_set_n_samples, + n_samples=dataset_config.n_samples, seed=dataset_config.seed, ) @@ -359,7 +359,7 @@ def create_vision_dataset(dataset_config: VisionDatasetConfig) -> Dataset: dataset = get_data_subset( raw_dataset, frac=dataset_config.return_set_frac, - n_samples=dataset_config.return_set_n_samples, + n_samples=dataset_config.n_samples, seed=dataset_config.seed, ) return dataset @@ -371,7 +371,7 @@ def create_block_vector_dataset(dataset_config: BlockVectorDatasetConfig) -> Dat dataset = get_data_subset( raw_dataset, frac=dataset_config.return_set_frac, - n_samples=dataset_config.return_set_n_samples, + n_samples=dataset_config.n_samples, seed=dataset_config.seed, ) return dataset diff --git a/rib/rib_builder.py b/rib/rib_builder.py index 780dd3d7..f38b425a 100644 --- a/rib/rib_builder.py +++ b/rib/rib_builder.py @@ -299,11 +299,11 @@ def _verify_compatible_configs(config: RibBuildConfig, loaded_config: RibBuildCo assert ( config.dataset.return_set_frac <= loaded_config.dataset.return_set_frac ), "Cannot use a larger return_set_frac for edges than to 
calculate the Cs" - elif config.dataset.return_set_n_samples is not None: - assert loaded_config.dataset.return_set_n_samples is not None + elif config.dataset.n_samples is not None: + assert loaded_config.dataset.n_samples is not None assert ( - config.dataset.return_set_n_samples <= loaded_config.dataset.return_set_n_samples - ), "Cannot use a larger return_set_n_samples for edges than to calculate the Cs" + config.dataset.n_samples <= loaded_config.dataset.n_samples + ), "Cannot use a larger n_samples for edges than to calculate the Cs" def load_interaction_rotations( diff --git a/rib_scripts/ablations/orthog_pythia-14m.yaml b/rib_scripts/ablations/orthog_pythia-14m.yaml index 07a6acdb..12cc55eb 100644 --- a/rib_scripts/ablations/orthog_pythia-14m.yaml +++ b/rib_scripts/ablations/orthog_pythia-14m.yaml @@ -11,7 +11,7 @@ dataset: tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations return_set_frac: 0.01 - return_set_n_samples: null + n_samples: null return_set_portion: last ablation_node_layers: - mlp_out.0 diff --git a/rib_scripts/ablations/rib_pythia-14m.yaml b/rib_scripts/ablations/rib_pythia-14m.yaml index a08e3dca..1b5a30d1 100644 --- a/rib_scripts/ablations/rib_pythia-14m.yaml +++ b/rib_scripts/ablations/rib_pythia-14m.yaml @@ -12,7 +12,7 @@ dataset: tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations return_set_frac: 0.01 - return_set_n_samples: null + n_samples: null return_set_portion: first ablation_node_layers: - mlp_out.0 diff --git a/rib_scripts/rib_build/Cs_pythia-14m.yaml b/rib_scripts/rib_build/Cs_pythia-14m.yaml index 270eb881..ea55ae33 100644 --- a/rib_scripts/rib_build/Cs_pythia-14m.yaml +++ b/rib_scripts/rib_build/Cs_pythia-14m.yaml @@ -9,7 +9,7 @@ dataset: tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the 
first 90% for building and last 10% for ablations return_set_frac: 0.9 - return_set_n_samples: null + n_samples: null return_set_portion: first node_layers: - mlp_out.0 diff --git a/rib_scripts/rib_build/edges_pythia-14m.yaml b/rib_scripts/rib_build/edges_pythia-14m.yaml index 8658d9ea..363040a3 100644 --- a/rib_scripts/rib_build/edges_pythia-14m.yaml +++ b/rib_scripts/rib_build/edges_pythia-14m.yaml @@ -9,8 +9,8 @@ dataset: tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations return_set_frac: 0.1 - return_set_n_documents: null - return_set_n_samples: null + n_documents: null + n_samples: null return_set_portion: first n_ctx: 50 node_layers: diff --git a/rib_scripts/rib_build/tinystories.yaml b/rib_scripts/rib_build/tinystories.yaml index 739135df..822531bb 100644 --- a/rib_scripts/rib_build/tinystories.yaml +++ b/rib_scripts/rib_build/tinystories.yaml @@ -8,7 +8,7 @@ dataset: tokenizer_name: EleutherAI/gpt-neo-125M return_set: train return_set_frac: null - return_set_n_samples: 5000 # avg ~235 toks / story + n_samples: 5000 # avg ~235 toks / story return_set_portion: first n_ctx: 256 # needs to be <= 511 for the model to behave reasonably node_layers: diff --git a/tests/test_ablations.py b/tests/test_ablations.py index 8637dac1..f4ebd5ea 100644 --- a/tests/test_ablations.py +++ b/tests/test_ablations.py @@ -112,7 +112,7 @@ def test_run_mnist_ablations(ablation_type, tmp_path): "node_layers": ["layers.1", "layers.2", "output"], "batch_size": 100, "dtype": "float32", - "dataset": {"return_set_n_samples": 100, "return_set_frac": None}, + "dataset": {"n_samples": 100, "return_set_frac": None}, } ) results = rib_build(build_config) @@ -135,7 +135,7 @@ def test_run_mnist_ablations(ablation_type, tmp_path): dataset: dataset_type: torchvision name: MNIST - return_set_n_samples: 100 + n_samples: 100 batch_size: 64 # 2 batches seed: 0 out_dir: null @@ -154,7 +154,7 @@ def 
test_run_modular_arithmetic_rib_ablations(ablation_type, tmp_path): { "node_layers": ["ln1.0", "ln2.0", "mlp_out.0", "unembed", "output"], "batch_size": 100, - "dataset": {"return_set_n_samples": 100}, + "dataset": {"n_samples": 100}, } ) results = rib_build(build_config) @@ -174,7 +174,7 @@ def test_run_modular_arithmetic_rib_ablations(ablation_type, tmp_path): dataset: dataset_type: modular_arithmetic return_set: train - return_set_n_samples: 100 + n_samples: 100 ablation_node_layers: - ln1.0 - ln2.0 @@ -199,7 +199,7 @@ def test_run_mnist_ablations_bisect(ablation_type, tmp_path): "node_layers": ["layers.1", "layers.2", "output"], "batch_size": 100, "dtype": "float32", - "dataset": {"return_set_n_samples": 100, "return_set_frac": None}, + "dataset": {"n_samples": 100, "return_set_frac": None}, } ) results = rib_build(build_config) @@ -220,7 +220,7 @@ def test_run_mnist_ablations_bisect(ablation_type, tmp_path): dataset: dataset_type: torchvision name: MNIST - return_set_n_samples: 100 + n_samples: 100 batch_size: 64 # two batches seed: 0 out_dir: null @@ -239,7 +239,7 @@ def test_run_modular_arithmetic_rib_ablations_bisect(ablation_type, tmp_path): { "node_layers": ["ln1.0", "ln2.0", "mlp_out.0", "unembed", "output"], "batch_size": 100, - "dataset": {"return_set_n_samples": 100}, + "dataset": {"n_samples": 100}, } ) results = rib_build(build_config) @@ -256,7 +256,7 @@ def test_run_modular_arithmetic_rib_ablations_bisect(ablation_type, tmp_path): dataset: dataset_type: modular_arithmetic return_set: train - return_set_n_samples: 100 + n_samples: 100 ablation_node_layers: - ln1.0 - ln2.0 diff --git a/tests/test_build_graph.py b/tests/test_build_graph.py index b1c0a1b8..38499bcf 100644 --- a/tests/test_build_graph.py +++ b/tests/test_build_graph.py @@ -217,7 +217,7 @@ def test_pythia_14m_build_graph_jacobian_stochastic(): config = get_pythia_config( { "basis_formula": "jacobian", - "dataset": {"return_set_n_documents": 10, "return_set_n_samples": 1, "n_ctx": 2}, + 
"dataset": {"n_documents": 10, "n_samples": 1, "n_ctx": 2}, "node_layers": ["ln2.1", "mlp_out.5", "unembed"], "calculate_edges": True, "edge_formula": "stochastic", @@ -635,13 +635,13 @@ def test_stochastic_source_modadd_convergence(): NOTE: This is quite a weak test, but the runs a slow so we're taking a hit on the test quality. """ node_layers = ["mlp_in.0", "mlp_out.0"] - return_set_n_samples = 3 + n_samples = 3 batch_size = 3 # Calc squared edges config_squared = get_modular_arithmetic_config( { - "dataset": {"return_set_n_samples": return_set_n_samples}, + "dataset": {"n_samples": n_samples}, "batch_size": batch_size, "edge_formula": "squared", "node_layers": node_layers, @@ -658,7 +658,7 @@ def test_stochastic_source_modadd_convergence(): for n_stochastic_sources_edges in [1, 3, 7]: config_stochastic = get_modular_arithmetic_config( { - "dataset": {"return_set_n_samples": return_set_n_samples}, + "dataset": {"n_samples": n_samples}, "batch_size": batch_size, "edge_formula": "squared", "node_layers": node_layers, diff --git a/tests/test_data.py b/tests/test_data.py index 03e2d987..17021c4d 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -14,11 +14,11 @@ def test_invalid_hf_dataset_config(): ) with pytest.raises(ValueError): - # return_set_frac and return_set_n_samples cannot be used together - replace_pydantic_model(base_config, {"return_set_frac": 0.5, "return_set_n_samples": 10}) - # return_set_n_documents and return_set_frac cannot be used - replace_pydantic_model(base_config, {"return_set_frac": 0.5, "return_set_n_documents": 10}) - # If return_set_n_documents is used, return_set_n_samples must be not None - replace_pydantic_model(base_config, {"return_set_n_documents": 10}) - # If return_set_n_documents is used, return_set_frac must be None - replace_pydantic_model(base_config, {"return_set_n_documents": 10, "return_set_frac": 0.5}) + # return_set_frac and n_samples cannot be used together + replace_pydantic_model(base_config, 
{"return_set_frac": 0.5, "n_samples": 10}) + # n_documents and return_set_frac cannot be used + replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_documents": 10}) + # If n_documents is used, n_samples must be not None + replace_pydantic_model(base_config, {"n_documents": 10}) + # If n_documents is used, return_set_frac must be None + replace_pydantic_model(base_config, {"n_documents": 10, "return_set_frac": 0.5}) diff --git a/tests/test_float_precision.py b/tests/test_float_precision.py index 91efee93..6ab314e8 100644 --- a/tests/test_float_precision.py +++ b/tests/test_float_precision.py @@ -32,8 +32,8 @@ def rib_results(self, temp_object) -> dict[str, RibBuildResults]: tokenizer_name: EleutherAI/pythia-14m return_set: train # pile-10k only has train, so we take the first 90% for building and last 10% for ablations return_set_frac: null - return_set_n_documents: 30 - return_set_n_samples: 3 + n_documents: 30 + n_samples: 3 return_set_portion: first node_layers: - mlp_out.0 @@ -143,8 +143,8 @@ def ablation_results(self, temp_object, rib_results) -> dict: tokenizer_name: EleutherAI/pythia-14m return_set: train return_set_frac: null - return_set_n_documents: 30 - return_set_n_samples: 3 + n_documents: 30 + n_samples: 3 return_set_portion: first ablation_node_layers: - mlp_out.0 diff --git a/tests/utils.py b/tests/utils.py index 9b7eea7d..4790a636 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,7 +19,7 @@ def get_modular_arithmetic_config(*updates: dict) -> RibBuildConfig: dataset: dataset_type: modular_arithmetic return_set: train - return_set_n_samples: 10 + n_samples: 10 node_layers: - ln1.0 - mlp_in.0 @@ -53,8 +53,8 @@ def get_pythia_config(*updates: dict) -> RibBuildConfig: tokenizer_name: EleutherAI/pythia-14m return_set: train return_set_frac: null - return_set_n_documents: 20 - return_set_n_samples: 3 + n_documents: 20 + n_samples: 3 return_set_portion: first n_ctx: 128 seed: 0 @@ -88,7 +88,7 @@ def get_tinystories_config(*updates: dict) 
-> RibBuildConfig: tokenizer_name: EleutherAI/gpt-neo-125M return_set: train return_set_frac: null - return_set_n_samples: 1 # avg ~235 toks / story + n_samples: 1 # avg ~235 toks / story return_set_portion: first n_ctx: 10 # needs to be <= 511 for the model to behave reasonably node_layers: From a0ada4fa34fc5e42f4eee9846b664e396b88d07c Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 16:44:06 +0000 Subject: [PATCH 05/12] Ensure tokenizers have eos_token_id --- rib/loader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rib/loader.py b/rib/loader.py index 14c17c17..9a34bd24 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -246,6 +246,12 @@ def tokenize_dataset( # https://github.com/EleutherAI/pythia/issues/123#issuecomment-1791136253 all_tokens.extend(tokens + [tokenizer.eos_token_id]) + # There shouldn't be any padding tokens, so ensure that there are len(dataset) eos tokens + len_dataset = len(dataset) # type: ignore + assert all_tokens.count(tokenizer.eos_token_id) == len_dataset, ( + f"Number of eos tokens ({all_tokens.count(tokenizer.eos_token_id)}) does not match " + f"number of samples ({len_dataset})." 
+ ) # Split the merged tokens into chunks that fit the context length raw_chunks = [all_tokens[i : i + n_ctx] for i in range(0, len(all_tokens), n_ctx)] @@ -337,13 +343,14 @@ def create_hf_dataset( tokenizer = AutoTokenizer.from_pretrained(dataset_config.tokenizer_name) tokenizer.pad_token = tokenizer.eos_token - return tokenize_dataset( + tokenized_dataset = tokenize_dataset( dataset=raw_dataset, tokenizer=tokenizer, n_ctx=n_ctx, n_samples=dataset_config.n_samples, seed=dataset_config.seed, ) + return tokenized_dataset def create_vision_dataset(dataset_config: VisionDatasetConfig) -> Dataset: From 588807bf565c33477858cdc4cc82f2def7694549 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 17:13:20 +0000 Subject: [PATCH 06/12] Support validation return_set (for tinystories) --- rib/data.py | 4 ++-- rib/loader.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/rib/data.py b/rib/data.py index f0fa78c7..56dbbc23 100644 --- a/rib/data.py +++ b/rib/data.py @@ -22,7 +22,7 @@ class DatasetConfig(BaseModel): """Base class for dataset configs.""" model_config = ConfigDict(extra="forbid", frozen=True) - return_set: Literal["train", "test", "all"] = Field( + return_set: Literal["train", "test", "validation", "all"] = Field( "train", description="The dataset split to return. If 'all', returns the combined train and test " "datasets.", @@ -68,7 +68,7 @@ class HFDatasetConfig(DatasetConfig): description="The HuggingFace name for the tokenizer. Please check whether the tokenizer is " "compatible with the model you are using.", ) - return_set: Literal["train", "test"] = Field( + return_set: Literal["train", "test", "validation"] = Field( ..., description="The dataset split to return from HuggingFace." ) return_set_portion: Literal["first", "last"] = Field( diff --git a/rib/loader.py b/rib/loader.py index 9a34bd24..ef158a84 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -320,7 +320,11 @@ def create_hf_dataset( f"({model_n_ctx})." 
) - assert dataset_config.return_set in ["train", "test"], "Only train and test sets are supported" + assert dataset_config.return_set in [ + "train", + "test", + "validation", + ], f"Invalid return_set: {dataset_config.return_set}. Must be one of train, test, validation." if dataset_config.return_set_frac: # Sample from all documents in return_set_frac% of return_set_portion From f6458e778a296022034be434f25b88b6cbbfc0c2 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 17:17:41 +0000 Subject: [PATCH 07/12] Fix broken test that used edge_formula=stochastic --- rib/data.py | 2 +- tests/test_build_graph.py | 2 +- tests/utils.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rib/data.py b/rib/data.py index 56dbbc23..15131745 100644 --- a/rib/data.py +++ b/rib/data.py @@ -1,4 +1,4 @@ -"""Define custom datasets.""" +"""Defines the dataset configs and datasets used in RIB.""" from typing import Literal, Optional diff --git a/tests/test_build_graph.py b/tests/test_build_graph.py index 38499bcf..68234dcc 100644 --- a/tests/test_build_graph.py +++ b/tests/test_build_graph.py @@ -220,7 +220,7 @@ def test_pythia_14m_build_graph_jacobian_stochastic(): "dataset": {"n_documents": 10, "n_samples": 1, "n_ctx": 2}, "node_layers": ["ln2.1", "mlp_out.5", "unembed"], "calculate_edges": True, - "edge_formula": "stochastic", + "edge_formula": "squared", "n_stochastic_sources_edges": 1, } ) diff --git a/tests/utils.py b/tests/utils.py index 4790a636..02250689 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -67,6 +67,7 @@ def get_pythia_config(*updates: dict) -> RibBuildConfig: n_intervals: 0 dtype: float64 calculate_edges: false + edge_formula: squared eval_type: ce_loss out_dir: null basis_formula: (1-0)*alpha From c2ba2183b25d7f021c09fb58c067adbe6e2aab6a Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 19:24:29 +0000 Subject: [PATCH 08/12] Hacky temp fix for test_stochastic_basis_tinystories --- tests/test_build_graph.py | 3 +-- 
tests/utils.py | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_build_graph.py b/tests/test_build_graph.py index 68234dcc..88527a20 100644 --- a/tests/test_build_graph.py +++ b/tests/test_build_graph.py @@ -687,8 +687,7 @@ def no_stoc_result(): @pytest.mark.parametrize( ["pos_sources", "hidden_sources", "error"], [ - [None, 10, 0.2], - [None, 40, 0.07], + [None, 40, 0.1], [2, None, 0.07], [2, 40, 0.1], ], diff --git a/tests/utils.py b/tests/utils.py index 02250689..948ad243 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -89,7 +89,8 @@ def get_tinystories_config(*updates: dict) -> RibBuildConfig: tokenizer_name: EleutherAI/gpt-neo-125M return_set: train return_set_frac: null - n_samples: 1 # avg ~235 toks / story + n_documents: 1 # avg ~235 toks / story + n_samples: 15 return_set_portion: first n_ctx: 10 # needs to be <= 511 for the model to behave reasonably node_layers: @@ -217,9 +218,7 @@ def _assignment_permutations(sim: torch.Tensor) -> tuple[list[int], list[int]]: def assert_basis_similarity( - ir_A: InteractionRotation, - ir_B: InteractionRotation, - error: Optional[float] = 0.02, + ir_A: InteractionRotation, ir_B: InteractionRotation, error: Optional[float] = 0.02 ): """ Compare two InteractionRotations and assert similarity, allowing for permutations. 
From f895ad2d4f24b2d23681cef50b215448197ae943 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Wed, 31 Jan 2024 19:25:09 +0000 Subject: [PATCH 09/12] Simplify validation for return_set_frac, n_documents, n_samples --- .vscode/launch.json | 9 +++++ rib/data.py | 54 +++++++++++++------------- rib/loader.py | 8 +++- rib_scripts/rib_build/tinystories.yaml | 3 +- 4 files changed, 44 insertions(+), 30 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index a6527357..5a81778d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -85,6 +85,15 @@ "console": "integratedTerminal", "justMyCode": true }, + { + "name": "build tiny stories", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/rib_scripts/rib_build/run_rib_build.py", + "args": "${workspaceFolder}/rib_scripts/rib_build/tinystories.yaml", + "console": "integratedTerminal", + "justMyCode": true + }, { "name": "build edges_pythia-14m", "type": "python", diff --git a/rib/data.py b/rib/data.py index 15131745..d1aab7c8 100644 --- a/rib/data.py +++ b/rib/data.py @@ -29,30 +29,26 @@ class DatasetConfig(BaseModel): ) return_set_frac: Optional[float] = Field( None, - description="The fraction of the returned dataset (train/test/all) to use. Cannot be" - "used with n_samples.", + description="The fraction of the returned dataset (train/test/validation/all) to load. " + "This will be sampled from using n_samples if n_samples is not None.", ) n_samples: Optional[int] = Field( None, - description="The number of raw samples to return from the dataset (train/test/all). " - "Cannot be used with return_set_frac.", + description="The number of n_ctx length tokenized samples to load from the dataset. This " + "will be sampled from either return_set_frac or n_documents if they are not None, or the " + "entire dataset if they are None. 
If n_samples is None, will load all samples in " "return_set_frac (or n_documents if provided in a child class).", ) @model_validator(mode="after") - def verify_return_set_frac_and_n_samples(self) -> "DatasetConfig": - """Verify not both return_set_frac and n_samples are set and check values.""" + def verify_return_set_options(self) -> "DatasetConfig": + """Can't have both return_set_frac and n_samples be non-None for datasets without n_documents.""" frac = self.return_set_frac + if not hasattr(self, "n_documents") and (frac is not None and self.n_samples is not None): + raise ValueError( + "Cannot have both return_set_frac and n_samples be non-None for this dataset." + ) - if frac is not None: - if self.n_samples is not None: - raise ValueError("Cannot have both return_set_frac and n_samples be non-None.") - if isinstance(self, HFDatasetConfig) and (frac < 0.01 or frac > 1): - raise ValueError( - f"return_set_frac must be > 0.01 and < 1 since huggingface dataset `split` " - f"method does not correctly convert other values to perecentages." - ) - if frac <= 0 or frac > 1: - raise ValueError(f"return_set_frac must be > 0 and <= 1.") + ) return self @@ -76,9 +72,9 @@ class HFDatasetConfig(DatasetConfig): ) n_documents: Optional[int] = Field( None, - description="The number of documents to load from the dataset. If None and " - "n_samples is not None, will load all documents before sampling. If None and " - "n_samples is None, will load all possible samples in return_set_frac.", + description="The number of documents to load from the dataset before (optional) sampling " "with n_samples. 
If None, will load all documents in return_set_frac (or all documents if " "return_set_frac is None).", ) n_ctx: Optional[int] = Field( None, @@ -88,14 +84,18 @@ class HFDatasetConfig(DatasetConfig): seed: Optional[int] = Field(0, description="The random seed value for reproducibility.") @model_validator(mode="after") - def verify_return_n_documents(self) -> "HFDatasetConfig": - """Verify n_documents and related fields are set correctly.""" - if self.n_documents is not None: - if self.n_samples is None or self.return_set_frac is not None: - raise ValueError( - "If n_documents is not None, then n_samples must be " - "not None and return_set_frac must be None." - ) + def verify_return_set_options(self) -> "HFDatasetConfig": + frac = self.return_set_frac + # Can't have both return_set_frac and n_documents be non-None + if frac is not None and self.n_documents is not None: + raise ValueError( + "Cannot have both return_set_frac and n_documents be non-None for HF datasets." + ) + if frac is not None and (frac < 0.01 or frac > 1): + raise ValueError( + f"return_set_frac must be >= 0.01 and <= 1 since huggingface dataset `split` " + f"method does not correctly convert other values to percentages." + ) return self diff --git a/rib/loader.py b/rib/loader.py index ef158a84..9161b217 100644 --- a/rib/loader.py +++ b/rib/loader.py @@ -226,6 +226,9 @@ def tokenize_dataset( Tokenizes the dataset and splits it into chunks that fit the context length. The labels are the input_ids shifted by one position. + The final chunk is not included in the dataset as it does not have a label for its final token. + Excluding it also means that we don't have to worry about padding. + Args: raw_dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): The raw dataset to tokenize. Created from `hf_load_dataset`. @@ -257,17 +260,18 @@ def tokenize_dataset( # Note that we ignore the final raw_chunk, as we get the label for the final token in a chunk # from the subsequent chunk. 
+ n_raw_chunks = len(raw_chunks) - 1 if n_samples is not None: # Randomly select n_samples chunks generator = torch.Generator() if seed is None else torch.Generator().manual_seed(seed) - raw_chunk_idxs = torch.randperm(len(raw_chunks) - 1, generator=generator) + raw_chunk_idxs = torch.randperm(n_raw_chunks, generator=generator) assert len(raw_chunk_idxs) >= n_samples, ( f"Cannot sample {n_samples} chunks from dataset with {len(raw_chunks)} chunks of " f"length {n_ctx}." ) chunk_idxs = raw_chunk_idxs[:n_samples].tolist() else: - chunk_idxs = list(range(len(raw_chunks) - 1)) + chunk_idxs = list(range(n_raw_chunks)) chunks = [raw_chunks[i] for i in chunk_idxs] diff --git a/rib_scripts/rib_build/tinystories.yaml b/rib_scripts/rib_build/tinystories.yaml index 822531bb..32cdd605 100644 --- a/rib_scripts/rib_build/tinystories.yaml +++ b/rib_scripts/rib_build/tinystories.yaml @@ -8,7 +8,8 @@ dataset: tokenizer_name: EleutherAI/gpt-neo-125M return_set: train return_set_frac: null - n_samples: 5000 # avg ~235 toks / story + n_documents: 5000 # avg ~235 toks / story + n_samples: 3000 return_set_portion: first n_ctx: 256 # needs to be <= 511 for the model to behave reasonably node_layers: From 00414ac81454d924f7430cad94bf47ebb977ac12 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Thu, 1 Feb 2024 10:08:30 +0000 Subject: [PATCH 10/12] Fix dataset config validation tests --- tests/test_data.py | 61 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 17021c4d..c8c1104c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,11 +1,20 @@ import pytest -from rib.data import HFDatasetConfig +from rib.data import ( + BlockVectorDatasetConfig, + HFDatasetConfig, + ModularArithmeticDatasetConfig, + VisionDatasetConfig, +) from rib.utils import replace_pydantic_model -def test_invalid_hf_dataset_config(): - """Test that invalid combinations of attributes in HFDatasetConfig are 
caught.""" +def test_hf_dataset_config_validation(): + """Test the validation of the HFDatasetConfig model. + + For HF datasets, we can't have both return_set_frac and n_documents be non-None, but we can + have all other combinations. + """ base_config = HFDatasetConfig( dataset_type="huggingface", name="test", @@ -13,12 +22,42 @@ def test_invalid_hf_dataset_config(): return_set="train", ) + # valid combinations + replace_pydantic_model( + base_config, {"return_set_frac": 0.5, "n_samples": 10, "n_documents": None} + ) + replace_pydantic_model( + base_config, {"return_set_frac": None, "n_samples": 10, "n_documents": 10} + ) + replace_pydantic_model( + base_config, {"return_set_frac": None, "n_samples": None, "n_documents": 10} + ) + replace_pydantic_model( + base_config, {"return_set_frac": 0.5, "n_samples": None, "n_documents": None} + ) + + # invalid combination with pytest.raises(ValueError): - # return_set_frac and n_samples cannot be used together - replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_samples": 10}) - # n_documents and return_set_frac cannot be used - replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_documents": 10}) - # If n_documents is used, n_samples must be not None - replace_pydantic_model(base_config, {"n_documents": 10}) - # If n_documents is used, return_set_frac must be None - replace_pydantic_model(base_config, {"n_documents": 10, "return_set_frac": 0.5}) + replace_pydantic_model( + base_config, {"return_set_frac": 0.5, "n_samples": 10, "n_documents": 10} + ) + + +def test_non_hf_dataset_config_validation(): + """Test the validation of dataset configs that are not HFDatasetConfig. + + We can't have both return_set_frac and n_samples be non-None. 
+ """ + for base_config in [ + BlockVectorDatasetConfig(dataset_type="block_vector"), + VisionDatasetConfig(dataset_type="torchvision"), + ModularArithmeticDatasetConfig(dataset_type="modular_arithmetic"), + ]: + # valid combinations + replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_samples": None}) + replace_pydantic_model(base_config, {"return_set_frac": None, "n_samples": 10}) + replace_pydantic_model(base_config, {"return_set_frac": None, "n_samples": None}) + + # invalid combination + with pytest.raises(ValueError): + replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_samples": 10}) From 2ca210ba800b97b33076ba997b486a456d87a8e8 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Thu, 1 Feb 2024 10:10:20 +0000 Subject: [PATCH 11/12] Clean validation tests further --- tests/test_data.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index c8c1104c..8a98a3d3 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -21,23 +21,17 @@ def test_hf_dataset_config_validation(): tokenizer_name="test", return_set="train", ) + valid_combinations = [ + {"return_set_frac": 0.5, "n_samples": None, "n_documents": None}, + {"return_set_frac": None, "n_samples": 10, "n_documents": None}, + {"return_set_frac": None, "n_samples": None, "n_documents": 10}, + {"return_set_frac": 0.5, "n_samples": None, "n_documents": 10}, + ] + for combination in valid_combinations: + replace_pydantic_model(base_config, combination) - # valid combinations - replace_pydantic_model( - base_config, {"return_set_frac": 0.5, "n_samples": 10, "n_documents": None} - ) - replace_pydantic_model( - base_config, {"return_set_frac": None, "n_samples": 10, "n_documents": 10} - ) - replace_pydantic_model( - base_config, {"return_set_frac": None, "n_samples": None, "n_documents": 10} - ) - replace_pydantic_model( - base_config, {"return_set_frac": 0.5, "n_samples": None, "n_documents": 
None} - ) - - # invalid combination with pytest.raises(ValueError): + # invalid combination replace_pydantic_model( base_config, {"return_set_frac": 0.5, "n_samples": 10, "n_documents": 10} ) @@ -53,11 +47,14 @@ def test_non_hf_dataset_config_validation(): VisionDatasetConfig(dataset_type="torchvision"), ModularArithmeticDatasetConfig(dataset_type="modular_arithmetic"), ]: - # valid combinations - replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_samples": None}) - replace_pydantic_model(base_config, {"return_set_frac": None, "n_samples": 10}) - replace_pydantic_model(base_config, {"return_set_frac": None, "n_samples": None}) + valid_combinations = [ + {"return_set_frac": 0.5, "n_samples": None}, + {"return_set_frac": None, "n_samples": 10}, + {"return_set_frac": None, "n_samples": None}, + ] + for combination in valid_combinations: + replace_pydantic_model(base_config, combination) - # invalid combination with pytest.raises(ValueError): + # invalid combination replace_pydantic_model(base_config, {"return_set_frac": 0.5, "n_samples": 10}) From 680c28f24e9e86bd0d83d26a3a97d761a4a9b8e8 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Thu, 1 Feb 2024 10:34:50 +0000 Subject: [PATCH 12/12] Fix failing dataset config validation tests --- tests/test_data.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 8a98a3d3..016a7ca2 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -25,16 +25,23 @@ def test_hf_dataset_config_validation(): {"return_set_frac": 0.5, "n_samples": None, "n_documents": None}, {"return_set_frac": None, "n_samples": 10, "n_documents": None}, {"return_set_frac": None, "n_samples": None, "n_documents": 10}, - {"return_set_frac": 0.5, "n_samples": None, "n_documents": 10}, + {"return_set_frac": 0.1, "n_samples": 10, "n_documents": None}, + {"return_set_frac": None, "n_samples": 10, "n_documents": 10}, ] for combination in valid_combinations: 
replace_pydantic_model(base_config, combination) with pytest.raises(ValueError): - # invalid combination + # Can't have both return_set_frac and n_documents be non-None replace_pydantic_model( base_config, {"return_set_frac": 0.5, "n_samples": 10, "n_documents": 10} ) + with pytest.raises(ValueError): + # Frac is < 0.01 + replace_pydantic_model(base_config, {"return_set_frac": 0.001, "n_samples": None}) + with pytest.raises(ValueError): + # Frac is > 1 + replace_pydantic_model(base_config, {"return_set_frac": 1.1, "n_samples": None}) def test_non_hf_dataset_config_validation():