diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d75a2809..f35e7b07 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
 
 # Standard hooks
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.2.0
+  rev: v6.0.0
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
@@ -35,7 +35,7 @@ repos:
     exclude: ^pii_processing/
 
 - repo: https://github.com/asottile/pyupgrade
-  rev: v2.32.1
+  rev: v3.21.2
   hooks:
   - id: pyupgrade
     exclude: ^pii_processing/
@@ -46,21 +46,21 @@ repos:
 #   - id: isort
 
 # Black, the code formatter, natively supports pre-commit
-- repo: https://github.com/psf/black
-  rev: 22.3.0 # Keep in sync with blacken-docs
+- repo: https://github.com/psf/black-pre-commit-mirror
+  rev: 26.1.0 # Keep in sync with blacken-docs
   hooks:
   - id: black
     exclude: ^pii_processing/
 
 # Changes tabs to spaces
 - repo: https://github.com/Lucas-C/pre-commit-hooks
-  rev: v1.1.14
+  rev: v1.5.6
   hooks:
   - id: remove-tabs
     exclude: ^(pii_processing|.*Makefile)
 
 - repo: https://github.com/shellcheck-py/shellcheck-py
-  rev: v0.8.0.4
+  rev: v0.11.0.1
   hooks:
   - id: shellcheck
     exclude: ^(pii_processing/|cc_pseudo_crawl)
diff --git a/ac_dc/anonymization.py b/ac_dc/anonymization.py
index af61230f..e5fed667 100644
--- a/ac_dc/anonymization.py
+++ b/ac_dc/anonymization.py
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
         tag_type=tag_type,
     )
     if anonymize_condition:
-        for (ent, start, end, tag) in ner:
+        for ent, start, end, tag in ner:
             # we need to actually walk through and replace by start, end span.
             sentence = sentence.replace(ent, f" <{tag}> ")
     return sentence, ner
diff --git a/ac_dc/deduplicate/self_deduplicate.py b/ac_dc/deduplicate/self_deduplicate.py
index 74cf88b2..be8008e2 100644
--- a/ac_dc/deduplicate/self_deduplicate.py
+++ b/ac_dc/deduplicate/self_deduplicate.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # @Date    : 2022-01-08 22:39:29
 # @Author  : Chenghao Mou (mouchenghao@gmail.com)
 # @Description: Self-deduplication with `datasets`
@@ -28,7 +27,7 @@
 
 
 def main(conf: str) -> None:
-    with open(conf, "r") as f:
+    with open(conf) as f:
         conf = yaml.safe_load(f.read())
 
     if conf["load_from_disk"]["path"]:
diff --git a/ac_dc/languages_id.py b/ac_dc/languages_id.py
index 6220d013..7f027ab3 100644
--- a/ac_dc/languages_id.py
+++ b/ac_dc/languages_id.py
@@ -1,6 +1,5 @@
 import pandas as pd
 
-
 langs_id = [
     {
         "lang": "Arabic",
diff --git a/ac_dc/normalization.py b/ac_dc/normalization.py
index 652e810f..6e18e3c0 100644
--- a/ac_dc/normalization.py
+++ b/ac_dc/normalization.py
@@ -1,7 +1,6 @@
 import re
 from typing import Dict
 
-
 non_printing_characters_re = re.compile(
     f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
 )
diff --git a/ac_dc/parameters_filtering.py b/ac_dc/parameters_filtering.py
index f4930347..2341c79a 100644
--- a/ac_dc/parameters_filtering.py
+++ b/ac_dc/parameters_filtering.py
@@ -1,7 +1,6 @@
 import string
 import emoji
 
-
 main_special_characters = string.punctuation + string.digits + string.whitespace
 other_special_characters = (
     "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
diff --git a/ac_dc/visualization/get_data_for_visualization.py b/ac_dc/visualization/get_data_for_visualization.py
index 55c241f7..f529db16 100644
--- a/ac_dc/visualization/get_data_for_visualization.py
+++ b/ac_dc/visualization/get_data_for_visualization.py
@@ -90,9 +90,9 @@ def compute_stats(self):
                 )
                 for n in range(2, 16)
             }
-            stats_document[
-                "character_repetition_ratio"
-            ] = character_repetition_ratios
stats_document["character_repetition_ratio"] = ( + character_repetition_ratios + ) word_repetition_ratios = { n: round( diff --git a/ac_dc/visualization/visualization.py b/ac_dc/visualization/visualization.py index 3e532aa4..1900e297 100644 --- a/ac_dc/visualization/visualization.py +++ b/ac_dc/visualization/visualization.py @@ -290,16 +290,16 @@ def get_cond(key, cutoff, max_cutoff): "stopwords_ratio" ] for i in range(len(self.docs["stopwords_ratio"])): - self.docs["stopwords_ratio"].iloc[ - i - ] = Filtering.compute_stopwords_ratio( - self.docs["text"].iloc[i], - self.sentencepiece_model_tok, - self.param["strip_characters"], - self.param["cond_words_augmentation"], - self.param["words_augmentation_group_sizes"], - self.param["words_augmentation_join_char"], - new_stopwords, + self.docs["stopwords_ratio"].iloc[i] = ( + Filtering.compute_stopwords_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_stopwords, + ) ) cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed." cutoff_stopwords_ratio = st.slider( @@ -326,16 +326,16 @@ def get_cond(key, cutoff, max_cutoff): "flagged_words_ratio" ] for i in range(len(self.docs["flagged_words_ratio"])): - self.docs["flagged_words_ratio"].iloc[ - i - ] = Filtering.compute_flagged_words_ratio( - self.docs["text"].iloc[i], - self.sentencepiece_model_tok, - self.param["strip_characters"], - self.param["cond_words_augmentation"], - self.param["words_augmentation_group_sizes"], - self.param["words_augmentation_join_char"], - new_flagged_words, + self.docs["flagged_words_ratio"].iloc[i] = ( + Filtering.compute_flagged_words_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_flagged_words, + ) ) cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed." max_fwr = np.max(self.docs["flagged_words_ratio"]) diff --git a/bertin/evaluation/run_glue.py b/bertin/evaluation/run_glue.py index a08cba25..a28775f2 100644 --- a/bertin/evaluation/run_glue.py +++ b/bertin/evaluation/run_glue.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" + # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import logging @@ -384,9 +384,11 @@ def main(): # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
     config = AutoConfig.from_pretrained(
-        model_args.config_name
-        if model_args.config_name
-        else model_args.model_name_or_path,
+        (
+            model_args.config_name
+            if model_args.config_name
+            else model_args.model_name_or_path
+        ),
         num_labels=num_labels,
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
@@ -394,9 +396,11 @@ def main():
         use_auth_token=True if model_args.use_auth_token else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name
-        if model_args.tokenizer_name
-        else model_args.model_name_or_path,
+        (
+            model_args.tokenizer_name
+            if model_args.tokenizer_name
+            else model_args.model_name_or_path
+        ),
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
diff --git a/bertin/evaluation/run_ner.py b/bertin/evaluation/run_ner.py
index dbd9cd9a..b19d0024 100644
--- a/bertin/evaluation/run_ner.py
+++ b/bertin/evaluation/run_ner.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +15,7 @@
 """
 Fine-tuning the library models for token classification.
 """
+
 # You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
 # comments.
 
@@ -364,9 +364,11 @@ def get_label_list(labels):
     # The .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
     config = AutoConfig.from_pretrained(
-        model_args.config_name
-        if model_args.config_name
-        else model_args.model_name_or_path,
+        (
+            model_args.config_name
+            if model_args.config_name
+            else model_args.model_name_or_path
+        ),
         num_labels=num_labels,
         label2id=label_to_id,
         id2label={i: l for l, i in label_to_id.items()},
@@ -636,9 +638,9 @@ def compute_metrics(p):
         kwargs["dataset_tags"] = data_args.dataset_name
         if data_args.dataset_config_name is not None:
             kwargs["dataset_args"] = data_args.dataset_config_name
-            kwargs[
-                "dataset"
-            ] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            kwargs["dataset"] = (
+                f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            )
         else:
             kwargs["dataset"] = data_args.dataset_name
diff --git a/bertin/mc4/mc4.py b/bertin/mc4/mc4.py
index 923e5e20..e02cb932 100644
--- a/bertin/mc4/mc4.py
+++ b/bertin/mc4/mc4.py
@@ -1,6 +1,5 @@
 """Perplexity Sampled mC4 dataset based on Common Crawl."""
 
-
 import gzip
 import json
 
@@ -404,7 +403,7 @@ def _generate_examples(self, filepaths):
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
             if filepath.endswith("jsonl"):
-                with open(filepath, "r", encoding="utf-8") as f:
+                with open(filepath, encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
diff --git a/bertin/run_mlm_flax.py b/bertin/run_mlm_flax.py
index 54251b94..68747cd8 100644
--- a/bertin/run_mlm_flax.py
+++ b/bertin/run_mlm_flax.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +19,7 @@
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=masked-lm
 """
+
 import logging
 import os
 import sys
diff --git a/bertin/run_mlm_flax_stream.py b/bertin/run_mlm_flax_stream.py
index a33eaae1..3087aecb 100644
--- a/bertin/run_mlm_flax_stream.py
+++ b/bertin/run_mlm_flax_stream.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +19,7 @@
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=masked-lm
 """
+
 import json
 import logging
 import os
@@ -446,7 +446,7 @@ def restore_checkpoint(save_dir, state):
     args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
     data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))
 
-    with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+    with open(os.path.join(save_dir, "training_state.json")) as f:
         training_state = json.load(f)
     step = training_state["step"]
diff --git a/bertin/utils/dataset_perplexity.py b/bertin/utils/dataset_perplexity.py
index 2ca470c8..ecf02308 100644
--- a/bertin/utils/dataset_perplexity.py
+++ b/bertin/utils/dataset_perplexity.py
@@ -17,7 +17,7 @@ def get_perplexity(doc):
 
 
 with open("mc4-es-train-50M-stats.csv", "w") as csv:
-    with open("mc4-es-train-50M-steps.jsonl", "r") as data:
+    with open("mc4-es-train-50M-steps.jsonl") as data:
         for line in tqdm(data):
             text = json.loads(line)["text"]
             csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
diff --git a/cc_pseudo_crawl/python_scripts/deeper.py b/cc_pseudo_crawl/python_scripts/deeper.py
index ac630f9b..86070c00 100644
--- a/cc_pseudo_crawl/python_scripts/deeper.py
+++ b/cc_pseudo_crawl/python_scripts/deeper.py
@@ -1,6 +1,7 @@
 """
 Generate list of urls to query for next depth. We then need to use Athena to make a fancy query.
""" + import csv import re import subprocess diff --git a/cc_pseudo_crawl/python_scripts/download_warc.py b/cc_pseudo_crawl/python_scripts/download_warc.py index a5699c76..0ba1bde1 100644 --- a/cc_pseudo_crawl/python_scripts/download_warc.py +++ b/cc_pseudo_crawl/python_scripts/download_warc.py @@ -143,9 +143,9 @@ def get_warcs(batch): existing_compressed_warcs, ) - batch["compressed_warc"], batch["download_exception"] = [ + batch["compressed_warc"], batch["download_exception"] = ( list(l) for l in zip(*warcs_or_exceptions) - ] + ) return batch diff --git a/cc_pseudo_crawl/python_scripts/exact_deduplicates.py b/cc_pseudo_crawl/python_scripts/exact_deduplicates.py index 15eca712..b6703217 100644 --- a/cc_pseudo_crawl/python_scripts/exact_deduplicates.py +++ b/cc_pseudo_crawl/python_scripts/exact_deduplicates.py @@ -1,4 +1,5 @@ """Taken from Teven and Leandro""" + import gzip import os import shutil @@ -10,7 +11,6 @@ from datasets import load_dataset, Features from datasets.utils.logging import set_verbosity_info - set_verbosity_info() logger = logging.getLogger(__name__) diff --git a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py index 1753acf8..f605e7b9 100644 --- a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py +++ b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py @@ -21,7 +21,7 @@ def main(): seed_ids = [] for seed_path in args.seed_paths: - with open(seed_path, "r") as fi: + with open(seed_path) as fi: data = csv.reader(fi) # First line is all the headers that we remove. seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0] diff --git a/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py b/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py index f2cea5de..c672c889 100644 --- a/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py +++ b/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py @@ -126,7 +126,7 @@ def process_batch(batch, skip_set): # looks at up to the first 10K pages for a seed and # records lines that appear in at least 1% of the unique pages def get_lines_to_skip(dset, n_records, pourcentage_threshold, min_repetition_threshold): - line_counts = defaultdict(lambda: 0) + line_counts = defaultdict(int) seen_pages = set() seed = SeedSequence(42) diff --git a/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py b/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py index 8c1d0a8e..b3e0d93f 100644 --- a/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py +++ b/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py @@ -1,6 +1,7 @@ """ Deduplicating using `datasets` is much harder, we but we forgot to generate an id when building an index, so we're screwed. """ + import logging import subprocess from argparse import ArgumentParser diff --git a/kenlm_training/cc_net/execution.py b/kenlm_training/cc_net/execution.py index 6ab09a56..f88d6c93 100644 --- a/kenlm_training/cc_net/execution.py +++ b/kenlm_training/cc_net/execution.py @@ -19,8 +19,7 @@ class Executor(Protocol): - def __call__(self, function: Callable[..., str], *args: Iterable) -> None: - ... + def __call__(self, function: Callable[..., str], *args: Iterable) -> None: ... 
 
 
 class SubmititRetryOnTimeout(submitit.helpers.Checkpointable):
diff --git a/kenlm_training/cc_net/flat_hash_set.py b/kenlm_training/cc_net/flat_hash_set.py
index f7529fe9..40eae07b 100644
--- a/kenlm_training/cc_net/flat_hash_set.py
+++ b/kenlm_training/cc_net/flat_hash_set.py
@@ -29,23 +29,17 @@ def __repr__(self):
         implementation = type(self).__name__
         return f"[{implementation}, len: {len(self)}"
 
-    def __len__(self) -> int:
-        ...
+    def __len__(self) -> int: ...
 
-    def __contains__(self, values: Sequence[np.uint64]) -> np.ndarray:
-        ...
+    def __contains__(self, values: Sequence[np.uint64]) -> np.ndarray: ...
 
-    def __getitem__(self, values) -> np.ndarray:
-        ...
+    def __getitem__(self, values) -> np.ndarray: ...
 
-    def __setitem__(self, keys, values) -> None:
-        ...
+    def __setitem__(self, keys, values) -> None: ...
 
-    def items(self) -> Iterable[Tuple[np.uint64, np.uint8]]:
-        ...
+    def items(self) -> Iterable[Tuple[np.uint64, np.uint8]]: ...
 
-    def keys(self) -> Iterable[np.uint64]:
-        ...
+    def keys(self) -> Iterable[np.uint64]: ...
 
     def __iter__(self) -> Iterator[np.uint64]:
         return iter(self.keys())
diff --git a/kenlm_training/cc_net/jsonql.py b/kenlm_training/cc_net/jsonql.py
index 0ff57f23..04b71c89 100644
--- a/kenlm_training/cc_net/jsonql.py
+++ b/kenlm_training/cc_net/jsonql.py
@@ -7,6 +7,7 @@
 """
 Manipulate files containing one json per line.
 """
+
 import argparse
 import collections
 import contextlib
@@ -290,7 +291,7 @@ def __getstate__(self) -> Tuple[tuple, dict, bool]:
     def __setstate__(self, state: Tuple[tuple, dict, bool]):
         if self.warn_when_pickling:
             warnings.warn(f"Unpickling transformer: {type(self)}. This can be slow.")
-        (args, kwargs, expect_json) = state
+        args, kwargs, expect_json = state
         # When unpickling `__new__` isn't called so we have to doit ourselves.
         Transformer.__init__(self, state_args=args, state_kwargs=kwargs)
         type(self).__init__(self, *args, **kwargs)
@@ -880,8 +881,7 @@ def describe(source, columns=None, weights=None, **kwargs):
             continue
         if "." in k or k == ALL_DOCUMENTS:
             continue
-        for line in display_stats(stats, k, weights=weights, **kwargs):
-            yield line
+        yield from display_stats(stats, k, weights=weights, **kwargs)
 
 
 def shard(lines):
@@ -902,17 +902,13 @@ def get_or_set(dictionary, key, default):
 
 
 class SimpleIO(Protocol):
     """A subset of methods from TextIO."""
 
-    def close(self) -> None:
-        ...
+    def close(self) -> None: ...
 
-    def write(self, line: str) -> int:
-        ...
+    def write(self, line: str) -> int: ...
 
-    def __enter__(self) -> "SimpleIO":
-        ...
+    def __enter__(self) -> "SimpleIO": ...
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        ...
+    def __exit__(self, exc_type, exc_value, traceback): ...
 
 
 def open_read(filename: ReadableFileLike) -> Iterable[str]:
@@ -961,7 +957,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
         if filename.suffix == ".gz":
             file: TextIO = gzip.open(filename, "rt")  # type: ignore
         else:
-            file = open(filename, "rt")
+            file = open(filename)
 
     return _close_when_exhausted(file)
@@ -1015,7 +1011,7 @@ def open_write(
     if filename.suffix == ".gz":
         return BlockedGzipWriter(Path(filename), mode, block_size="64M")
 
-    return open(filename, "wt")
+    return open(filename, "w")
 
 
 def parse_size(size):
diff --git a/kenlm_training/cc_net/tools/expand_corpus.py b/kenlm_training/cc_net/tools/expand_corpus.py
index 46d16bc4..1c5f9c41 100644
--- a/kenlm_training/cc_net/tools/expand_corpus.py
+++ b/kenlm_training/cc_net/tools/expand_corpus.py
@@ -277,7 +277,7 @@ def mine(
     print("Submited job array:", jobs[0])
 
     for j in submitit.helpers.as_completed(jobs):
-        (i, o) = j.result()
+        i, o = j.result()
         print("Mined sentences from", i, "to", o)
 
     return outputs
diff --git a/kenlm_training/tests/test_jsonql.py b/kenlm_training/tests/test_jsonql.py
index 7d9768e7..ba6ea138 100644
--- a/kenlm_training/tests/test_jsonql.py
+++ b/kenlm_training/tests/test_jsonql.py
@@ -209,7 +209,7 @@ def test_split_file_middle_of_line(tmp_path: Path):
 
 
 def test_split_file_middle_of_char(tmp_path: Path):
     file = tmp_path / "test.txt"
-    content = "Hello\U0001F40D\nWorld\n"
+    content = "Hello\U0001f40d\nWorld\n"
     #        split is here       ^^
     with open(file, "w") as o:
@@ -262,7 +262,7 @@ def do(self, x):
 def acc(values):
     print("acc: started")
     res = 0
-    for (x, _) in values:
+    for x, _ in values:
         res += int(x)
     print("acc: done")
     yield f"acc: result={res}"
diff --git a/perplexity_lenses/perplexity_lenses/data.py b/perplexity_lenses/perplexity_lenses/data.py
index 778749de..1d7781f2 100644
--- a/perplexity_lenses/perplexity_lenses/data.py
+++ b/perplexity_lenses/perplexity_lenses/data.py
@@ -34,9 +34,11 @@ def hub_dataset_to_dataframe(
                 {
                     text_column: sentence,
                     "perplexity": model.get_perplexity(sentence),
-                    "label": x.get("labels", [])[0]
-                    if len(x.get("labels", [])) > 0
-                    else "NONE",  # Special case for registry dataset
+                    "label": (
+                        x.get("labels", [])[0]
+                        if len(x.get("labels", [])) > 0
+                        else "NONE"
+                    ),  # Special case for registry dataset
                 }
                 for sentence in x[text_column].split("\n")
             ]
@@ -46,9 +48,9 @@ def hub_dataset_to_dataframe(
             lambda x: {
                 text_column: x[text_column],
                 "perplexity": model.get_perplexity(x[text_column]),
-                "label": x.get("labels", [])[0]
-                if len(x.get("labels", [])) > 0
-                else "NONE",  # Special case for registry dataset
+                "label": (
+                    x.get("labels", [])[0] if len(x.get("labels", [])) > 0 else "NONE"
+                ),  # Special case for registry dataset
             }
         )
     instances = []
diff --git a/pii-manager/setup.py b/pii-manager/setup.py
index c5b0714b..4c369204 100644
--- a/pii-manager/setup.py
+++ b/pii-manager/setup.py
@@ -27,7 +27,7 @@
 
 def requirements(filename="requirements.txt"):
     """Read the requirements file"""
-    with io.open(filename, "r") as f:
+    with open(filename) as f:
         return [line.strip() for line in f if line and line[0] != "#"]
 
 
@@ -35,7 +35,7 @@ def long_description():
     """
    Take the README and remove markdown hyperlinks
     """
-    with open("README.md", "rt", encoding="utf-8") as f:
+    with open("README.md", encoding="utf-8") as f:
         desc = f.read()
         desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
         return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)
diff --git a/pii-manager/src/pii_manager/api/manager.py b/pii-manager/src/pii_manager/api/manager.py
index cdb3d7dd..3d32c851 100644
--- a/pii-manager/src/pii_manager/api/manager.py
+++ b/pii-manager/src/pii_manager/api/manager.py
@@ -15,7 +15,6 @@
 from ..helper.exception import InvArgException
 from ..lang import LANG_ANY, COUNTRY_ANY
 
-
 DEFAULT_TEMPLATES = {"replace": "<{name}>", "tag": "<{name}:{value}>"}
 
 
@@ -31,13 +30,11 @@ def fetch_all_tasks(
     """
     taskdict = get_taskdict(debug=debug)
     # Language-independent
-    for task in taskdict[LANG_ANY].values():
-        yield task
+    yield from taskdict[LANG_ANY].values()
 
     langdict = taskdict.get(lang, {})
     # Country-independent
-    for task in langdict.get(COUNTRY_ANY, {}).values():
-        yield task
+    yield from langdict.get(COUNTRY_ANY, {}).values()
     # Country-specific
     if country:
         if country[0] in (COUNTRY_ANY, "all"):
@@ -45,8 +42,7 @@ def fetch_all_tasks(
         for c in country:
             if c == COUNTRY_ANY:  # already included above
                 continue
-            for task in langdict.get(c, {}).values():
-                yield task
+            yield from langdict.get(c, {}).values()
 
 
 def fetch_task(
@@ -166,9 +162,7 @@ def __init__(
         self._process = (
             self.process_full
             if self.mode == "full"
-            else self.process_extract
-            if self.mode == "extract"
-            else self.process_subst
+            else self.process_extract if self.mode == "extract" else self.process_subst
         )
 
     def __repr__(self) -> str:
diff --git a/pii-manager/src/pii_manager/helper/base.py b/pii-manager/src/pii_manager/helper/base.py
index 013b4549..93fa55b2 100644
--- a/pii-manager/src/pii_manager/helper/base.py
+++ b/pii-manager/src/pii_manager/helper/base.py
@@ -10,7 +10,6 @@
 from .context import context_spec, context_check, CONTEXT_NORM_OPTIONS
 from .exception import PiiUnimplemented
 
-
 NORM_OPTIONS = dict(whitespace=True, lowercase=True)
diff --git a/pii-manager/src/pii_manager/helper/context.py b/pii-manager/src/pii_manager/helper/context.py
index 4b508055..d82976ad 100644
--- a/pii-manager/src/pii_manager/helper/context.py
+++ b/pii-manager/src/pii_manager/helper/context.py
@@ -9,7 +9,6 @@
 from .exception import InvArgException
 from .normalizer import normalize
 
-
 # Default width around a Pii where context is searched for
 DEFAULT_CONTEXT_WIDTH = 64
diff --git a/pii-manager/src/pii_manager/helper/json.py b/pii-manager/src/pii_manager/helper/json.py
index 54694d67..6f453515 100644
--- a/pii-manager/src/pii_manager/helper/json.py
+++ b/pii-manager/src/pii_manager/helper/json.py
@@ -3,7 +3,6 @@
 in particular PiiEntity objects
 """
 
-
 from collections.abc import Iterator
 import datetime
 import json
diff --git a/pii-manager/src/pii_manager/helper/taskdict.py b/pii-manager/src/pii_manager/helper/taskdict.py
index c733f1ff..fd6db31b 100644
--- a/pii-manager/src/pii_manager/helper/taskdict.py
+++ b/pii-manager/src/pii_manager/helper/taskdict.py
@@ -172,11 +172,11 @@ def build_subdict(task_list: List[Tuple], lang: str, country: str = None) -> Dic
         task_type = (
             "PiiTask"
             if _is_pii_class(src[1])
-            else "callable"
-            if callable(src[1])
-            else "regex"
-            if isinstance(src[1], str)
-            else None
+            else (
+                "callable"
+                if callable(src[1])
+                else "regex" if isinstance(src[1], str) else None
+            )
         )
         # Build the dict
         td = {"pii": src[0], "type": task_type, "task": src[1]}
diff --git a/pii-manager/src/pii_manager/lang/any/credit_card.py b/pii-manager/src/pii_manager/lang/any/credit_card.py
index cc2db7d0..d523e270 100644
--- a/pii-manager/src/pii_manager/lang/any/credit_card.py
+++ b/pii-manager/src/pii_manager/lang/any/credit_card.py
@@ -15,7 +15,6 @@
 from pii_manager import PiiEnum, PiiEntity
 from pii_manager.helper import BasePiiTask
 
-
 # ----------------------------------------------------------------------------
 # base regex to detect candidates to credit card numbers
diff --git a/pii-manager/src/pii_manager/lang/any/email.py b/pii-manager/src/pii_manager/lang/any/email.py
index bd7d8b81..924660a2 100644
--- a/pii-manager/src/pii_manager/lang/any/email.py
+++ b/pii-manager/src/pii_manager/lang/any/email.py
@@ -4,7 +4,6 @@
 
 from pii_manager import PiiEnum
 
-
 _EMAIL_PATTERN = r"[\w\.=-]+ @ [\w\.-]+ \. [\w]{2,3}"
diff --git a/pii-manager/src/pii_manager/lang/any/ip_address.py b/pii-manager/src/pii_manager/lang/any/ip_address.py
index 67089ac9..bbe057b7 100644
--- a/pii-manager/src/pii_manager/lang/any/ip_address.py
+++ b/pii-manager/src/pii_manager/lang/any/ip_address.py
@@ -4,7 +4,6 @@
 
 from pii_manager import PiiEnum
 
-
 _IP_PATTERN = r"""
   \b (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \. ){3}
diff --git a/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py b/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py
index 48c284b3..9d7bbf14 100644
--- a/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py
+++ b/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py
@@ -3,7 +3,6 @@
 prefix and country code)
 """
 
-
 from pii_manager import PiiEnum
 
 PATTERN_INT_PHONE = r"""
diff --git a/pii-manager/src/pii_manager/lang/en/au/abn.py b/pii-manager/src/pii_manager/lang/en/au/abn.py
index c11b9a8f..2c5e077e 100644
--- a/pii-manager/src/pii_manager/lang/en/au/abn.py
+++ b/pii-manager/src/pii_manager/lang/en/au/abn.py
@@ -2,6 +2,7 @@
 Detection and validation of Australian business number (ABN).
 """
+
 import re
 
 from stdnum.au import abn
@@ -10,7 +11,6 @@
 
 from pii_manager import PiiEnum
 
-
 _ABN_PATTERN = r"\b (?: \d{2} \s \d{3} \s \d{3} \s \d{3} | \d{11} ) \b"
 _ABN_REGEX = re.compile(_ABN_PATTERN, flags=re.X)
diff --git a/pii-manager/src/pii_manager/lang/en/au/tfn.py b/pii-manager/src/pii_manager/lang/en/au/tfn.py
index 3f2384dc..ee198ea0 100644
--- a/pii-manager/src/pii_manager/lang/en/au/tfn.py
+++ b/pii-manager/src/pii_manager/lang/en/au/tfn.py
@@ -2,6 +2,7 @@
 Detection and validation of Australian Tax File Number (TFN).
""" + import re from stdnum.au import tfn @@ -10,7 +11,6 @@ from pii_manager import PiiEnum - _TFN_PATTERN = r"\b (?: \d{3} \s \d{3} \s \d{3} | \d{8,9} ) \b" _TFN_REGEX = re.compile(_TFN_PATTERN, flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py b/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py index fa3be47c..31007927 100644 --- a/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py +++ b/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py @@ -12,7 +12,6 @@ from pii_manager import PiiEnum - _SIN_REGEX = re.compile(r"\d{3}[-\ ]\d{3}[-\ ]\d{3}", flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py b/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py index d0569666..733ff1b8 100644 --- a/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py +++ b/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py @@ -12,7 +12,6 @@ from pii_manager import PiiEnum - _AADHAAR_REGEX = re.compile(r"[2-9]\d{3}\ ?\d{4}\ ?\d{4}", flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/en/us/social_security_number.py b/pii-manager/src/pii_manager/lang/en/us/social_security_number.py index 92b3dd33..bc2a6b83 100644 --- a/pii-manager/src/pii_manager/lang/en/us/social_security_number.py +++ b/pii-manager/src/pii_manager/lang/en/us/social_security_number.py @@ -7,7 +7,6 @@ from pii_manager import PiiEnum - _SSN_PATTERN = r"(?!000|666|333)0*(?:[0-6][0-9][0-9]|[0-7][0-6][0-9]|[0-7][0-7][0-2])[-\ ](?!00)[0-9]{2}[-\ ](?!0000)[0-9]{4}" diff --git a/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py b/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py index f82aa441..9a32feea 100644 --- a/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py +++ b/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py @@ -3,13 +3,11 @@ prefix and country code), for ES """ - from pii_manager import PiiEnum # The pattern for the regex is the same as for English from ...en.any.international_phone_number import PATTERN_INT_PHONE - PII_TASKS = [ { "pii": PiiEnum.PHONE_NUMBER, diff --git a/pii-manager/src/pii_manager/lang/es/mx/curp.py b/pii-manager/src/pii_manager/lang/es/mx/curp.py index 96453673..2ab11777 100644 --- a/pii-manager/src/pii_manager/lang/es/mx/curp.py +++ b/pii-manager/src/pii_manager/lang/es/mx/curp.py @@ -12,7 +12,6 @@ from pii_manager import PiiEnum - _CURP_PATTERN = r"[A-Z] [AEIOU] [A-Z]{2} \d{6} [HM] [A-Z]{5} [0-9A-Z] \d" _CURP_REGEX = re.compile(_CURP_PATTERN, flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py b/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py index e3c3d428..faaf4a63 100644 --- a/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py +++ b/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py @@ -1,4 +1,5 @@ """ Reuse the SIN code implemented for en """ + from pii_manager.lang.en.ca.social_insurance_number import PII_TASKS diff --git a/pii-manager/src/pii_manager/lang/pt/br/cpf.py b/pii-manager/src/pii_manager/lang/pt/br/cpf.py index 607de1f1..d82bc55b 100644 --- a/pii-manager/src/pii_manager/lang/pt/br/cpf.py +++ b/pii-manager/src/pii_manager/lang/pt/br/cpf.py @@ -13,7 +13,6 @@ from pii_manager import PiiEnum - _CPF_REGEX = re.compile(r"\d{3} \. \d{3} \. 
\d{3} - \d{2}", flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/pt/pt/govid.py b/pii-manager/src/pii_manager/lang/pt/pt/govid.py index b87195d5..278c6b97 100644 --- a/pii-manager/src/pii_manager/lang/pt/pt/govid.py +++ b/pii-manager/src/pii_manager/lang/pt/pt/govid.py @@ -13,7 +13,6 @@ from pii_manager import PiiEnum, PiiEntity from pii_manager.helper import BasePiiTask - # regex for NIF & CC _NIF_PATTERN = r"(?: PT \x20?)? (?: \d{3} \x20 \d{3} \x20 \d{3} | \d{9} )" _CC_PATTERN = r"\d{8} \x20? \d \x20? [A-Z0-9]{2}\d" diff --git a/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py b/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py index eea1c277..f96d7221 100644 --- a/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py +++ b/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py @@ -11,7 +11,6 @@ from stdnum.cn import ric - # Detect candidates (separately) for RIC and passport-like numbers _GOV_ID_PATTERN = r"(? str: def readfile(name: str) -> str: - with open(name, "rt", encoding="utf-8") as f: + with open(name, encoding="utf-8") as f: return f.read().strip() diff --git a/pii-manager/test/unit/api/test_file_taskfile.py b/pii-manager/test/unit/api/test_file_taskfile.py index 722dfa8f..3371d7b7 100644 --- a/pii-manager/test/unit/api/test_file_taskfile.py +++ b/pii-manager/test/unit/api/test_file_taskfile.py @@ -14,7 +14,7 @@ def datafile(name: str) -> str: def readfile(name: str) -> str: - with open(name, "rt", encoding="utf-8") as f: + with open(name, encoding="utf-8") as f: return f.read().strip() diff --git a/pii-manager/test/unit/api/test_manager.py b/pii-manager/test/unit/api/test_manager.py index 5f74dbf6..a4b342ce 100644 --- a/pii-manager/test/unit/api/test_manager.py +++ b/pii-manager/test/unit/api/test_manager.py @@ -3,7 +3,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = ( "El número de la tarjeta de crédito es 4273 9666 4581 5642", "El número de la tarjeta de crédito es ", @@ -21,7 +20,10 @@ def test20_info(): info = obj.task_info() exp = { - (PiiEnum.CREDIT_CARD, None,): [ + ( + PiiEnum.CREDIT_CARD, + None, + ): [ ( "credit card", "Credit card numbers for most international credit cards (detect & validate)", diff --git a/pii-manager/test/unit/api/test_manager_add.py b/pii-manager/test/unit/api/test_manager_add.py index a61e5eee..a3116c46 100644 --- a/pii-manager/test/unit/api/test_manager_add.py +++ b/pii-manager/test/unit/api/test_manager_add.py @@ -9,7 +9,6 @@ from pii_manager.lang import COUNTRY_ANY from pii_manager.helper.base import BasePiiTask - # --------------------------------------------------------------------- DUMMY_REGEX = { @@ -47,7 +46,7 @@ def test110_call(): obj = PiiManager("en", None, PiiEnum.EMAIL_ADDRESS) obj.add_tasks([DUMMY_REGEX]) - for (doc, exp) in TEST_REGEX: + for doc, exp in TEST_REGEX: got = obj(doc) assert got == exp @@ -86,6 +85,6 @@ def test200_call(): obj = PiiManager("en") obj.add_tasks([DUMMY_CLASS]) - for (doc, exp) in TEST_CLASS: + for doc, exp in TEST_CLASS: got = obj(doc) assert got == exp diff --git a/pii-manager/test/unit/api/test_manager_ctx.py b/pii-manager/test/unit/api/test_manager_ctx.py index f74701fc..c475498e 100644 --- a/pii-manager/test/unit/api/test_manager_ctx.py +++ b/pii-manager/test/unit/api/test_manager_ctx.py @@ -38,7 +38,7 @@ def test10_context_regex(): """ obj = PiiManager("en", mode="extract") obj.add_tasks([DUMMY_REGEX]) - for (text, exp) in TEST: + for text, exp in TEST: got = obj(text) assert list(got) == exp @@ -64,6 +64,6 @@ def test20_context_class(): """ obj = PiiManager("en", 
mode="extract") obj.add_tasks([DUMMY_CLASS]) - for (text, exp) in TEST: + for text, exp in TEST: got = obj(text) assert list(got) == exp diff --git a/pii-manager/test/unit/helper/test_context.py b/pii-manager/test/unit/helper/test_context.py index 6a158864..39946f91 100644 --- a/pii-manager/test/unit/helper/test_context.py +++ b/pii-manager/test/unit/helper/test_context.py @@ -1,6 +1,7 @@ """ Test the context checking function """ + import pytest import pii_manager.helper.context as mod @@ -74,7 +75,7 @@ def test10_context_true(): """ Check valid contexts """ - for (text, context) in TEST_TRUE: + for text, context in TEST_TRUE: spec = mod.context_spec(context) assert mod.context_check(text, spec, 20) is True @@ -83,7 +84,7 @@ def test20_context_false(): """ Check invalid contexts """ - for (text, context) in TEST_FALSE: + for text, context in TEST_FALSE: spec = mod.context_spec(context) assert mod.context_check(text, spec, 20) is False diff --git a/pii-manager/test/unit/helper/test_norm.py b/pii-manager/test/unit/helper/test_norm.py index 0a1b73ba..20d72c50 100644 --- a/pii-manager/test/unit/helper/test_norm.py +++ b/pii-manager/test/unit/helper/test_norm.py @@ -1,6 +1,5 @@ import pii_manager.helper.normalizer as mod - TEST = [("the Social Security\nNumber is 34512", "the social security number is 34512")] @@ -8,5 +7,5 @@ def test10_normalizer(): """ Create base object """ - for (text, exp) in TEST: + for text, exp in TEST: assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp diff --git a/pii-manager/test/unit/lang/any/test_bitcoin_address.py b/pii-manager/test/unit/lang/any/test_bitcoin_address.py index 1f5da9f8..d65bc405 100644 --- a/pii-manager/test/unit/lang/any/test_bitcoin_address.py +++ b/pii-manager/test/unit/lang/any/test_bitcoin_address.py @@ -2,11 +2,9 @@ Test bitcoin addresses """ - from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid bitcoin address ( diff --git a/pii-manager/test/unit/lang/any/test_credit_card.py b/pii-manager/test/unit/lang/any/test_credit_card.py index 35a68465..fee41526 100644 --- a/pii-manager/test/unit/lang/any/test_credit_card.py +++ b/pii-manager/test/unit/lang/any/test_credit_card.py @@ -5,7 +5,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid credit card number ( diff --git a/pii-manager/test/unit/lang/any/test_email.py b/pii-manager/test/unit/lang/any/test_email.py index 835eb819..b4eff41d 100644 --- a/pii-manager/test/unit/lang/any/test_email.py +++ b/pii-manager/test/unit/lang/any/test_email.py @@ -5,7 +5,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid email address ( diff --git a/pii-manager/test/unit/lang/any/test_ip_address.py b/pii-manager/test/unit/lang/any/test_ip_address.py index cd040cf9..a14b355e 100644 --- a/pii-manager/test/unit/lang/any/test_ip_address.py +++ b/pii-manager/test/unit/lang/any/test_ip_address.py @@ -5,7 +5,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid IP address ( diff --git a/requirements.txt b/requirements.txt index 24e425cf..8dd4d35b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ fasttext>=0.9.2 fsspec ftfy indexed_gzip>=1.6.1 -indexed_gzip>=1.6.1 langid>=1.1.6 nltk scikit-learn diff --git a/tokenizer/python_script/dedup_exact_article.py b/tokenizer/python_script/dedup_exact_article.py index e658f838..ef18e130 100644 --- a/tokenizer/python_script/dedup_exact_article.py +++ 
+++ b/tokenizer/python_script/dedup_exact_article.py
@@ -1,4 +1,5 @@
 """Taken from Teven and Leandro"""
+
 import gzip
 import os
 import shutil
@@ -9,7 +10,6 @@
 from datasets import load_from_disk
 from datasets.utils.logging import set_verbosity_info
 
-
 set_verbosity_info()
 
 logger = logging.getLogger(__name__)
diff --git a/tokenizer/python_script/dedup_lines.py b/tokenizer/python_script/dedup_lines.py
index ea3e4a81..f22f7a72 100644
--- a/tokenizer/python_script/dedup_lines.py
+++ b/tokenizer/python_script/dedup_lines.py
@@ -28,6 +28,7 @@
 
 META_COLUMNS = ["meta"]
 
+
 # filter text to remove certain lines (e.g. menu items, copyright notice)
 def filter_lines(article, skip_set, used_lines):
     # TODO discuss the strip
@@ -48,9 +49,11 @@ def filter_lines(article, skip_set, used_lines):
 def filter_lines_by_batch(texts, skip_set, used_lines, preserve_code, metadata=None):
     if preserve_code:
         filtered_lines = [
-            filter_lines(article, skip_set, used_lines)
-            if "lm_code" in eval(metadata_item)["source_dataset"]
-            else (article, "")
+            (
+                filter_lines(article, skip_set, used_lines)
+                if "lm_code" in eval(metadata_item)["source_dataset"]
+                else (article, "")
+            )
             for article, metadata_item in zip(texts, metadata)
         ]
     else:
@@ -86,8 +89,8 @@ def process_batch(batch, skip_set, used_lines, args):
 # looks at up to the first 10K pages for a seed and
 # records lines that appear in at least 1% of the unique pages
 def get_lines_to_skip(dset, n_records, pourcentage_threshold, min_repetition_threshold):
-    line_counts = defaultdict(lambda: 0)
-    seen_pages = defaultdict(lambda: 0)
+    line_counts = defaultdict(int)
+    seen_pages = defaultdict(int)
 
     seed = SeedSequence(42)
     rng = default_rng(seed)
diff --git a/tokenizer/python_script/ram_dedup_lines.py b/tokenizer/python_script/ram_dedup_lines.py
index 5e6f68b5..4690f93d 100644
--- a/tokenizer/python_script/ram_dedup_lines.py
+++ b/tokenizer/python_script/ram_dedup_lines.py
@@ -78,7 +78,7 @@ def main():
         f"Filtered out {number_of_samples_before - number_of_samples_after_filtering_none} / {number_of_samples_before}"
     )
 
-    seen = defaultdict(lambda: 0)
+    seen = defaultdict(int)
 
     def remove_duplicate_lines(examples):
         new_exemples = {"text": [], "meta": []}