diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d75a2809..f35e7b07 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
 
 # Standard hooks
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.2.0
+  rev: v6.0.0
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
@@ -35,7 +35,7 @@ repos:
     exclude: ^pii_processing/
 
 - repo: https://github.com/asottile/pyupgrade
-  rev: v2.32.1
+  rev: v3.21.2
   hooks:
   - id: pyupgrade
     exclude: ^pii_processing/
@@ -46,21 +46,21 @@ repos:
 #   - id: isort
 
 # Black, the code formatter, natively supports pre-commit
-- repo: https://github.com/psf/black
-  rev: 22.3.0 # Keep in sync with blacken-docs
+- repo: https://github.com/psf/black-pre-commit-mirror
+  rev: 26.1.0 # Keep in sync with blacken-docs
   hooks:
   - id: black
     exclude: ^pii_processing/
 
 # Changes tabs to spaces
 - repo: https://github.com/Lucas-C/pre-commit-hooks
-  rev: v1.1.14
+  rev: v1.5.6
   hooks:
   - id: remove-tabs
     exclude: ^(pii_processing|.*Makefile)
 
 - repo: https://github.com/shellcheck-py/shellcheck-py
-  rev: v0.8.0.4
+  rev: v0.11.0.1
   hooks:
   - id: shellcheck
     exclude: ^(pii_processing/|cc_pseudo_crawl)
diff --git a/ac_dc/anonymization.py b/ac_dc/anonymization.py
index af61230f..e5fed667 100644
--- a/ac_dc/anonymization.py
+++ b/ac_dc/anonymization.py
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
         tag_type=tag_type,
     )
     if anonymize_condition:
-        for (ent, start, end, tag) in ner:
+        for ent, start, end, tag in ner:
             # we need to actually walk through and replace by start, end span.
             sentence = sentence.replace(ent, f" <{tag}> ")
     return sentence, ner
diff --git a/ac_dc/deduplicate/self_deduplicate.py b/ac_dc/deduplicate/self_deduplicate.py
index 74cf88b2..be8008e2 100644
--- a/ac_dc/deduplicate/self_deduplicate.py
+++ b/ac_dc/deduplicate/self_deduplicate.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # @Date    : 2022-01-08 22:39:29
 # @Author  : Chenghao Mou (mouchenghao@gmail.com)
 # @Description: Self-deduplication with `datasets`
@@ -28,7 +27,7 @@
 
 
 def main(conf: str) -> None:
-    with open(conf, "r") as f:
+    with open(conf) as f:
         conf = yaml.safe_load(f.read())
 
     if conf["load_from_disk"]["path"]:
diff --git a/ac_dc/languages_id.py b/ac_dc/languages_id.py
index 6220d013..7f027ab3 100644
--- a/ac_dc/languages_id.py
+++ b/ac_dc/languages_id.py
@@ -1,6 +1,5 @@
 import pandas as pd
 
-
 langs_id = [
     {
         "lang": "Arabic",
diff --git a/ac_dc/normalization.py b/ac_dc/normalization.py
index 652e810f..6e18e3c0 100644
--- a/ac_dc/normalization.py
+++ b/ac_dc/normalization.py
@@ -1,7 +1,6 @@
 import re
 from typing import Dict
 
-
 non_printing_characters_re = re.compile(
     f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
 )
diff --git a/ac_dc/parameters_filtering.py b/ac_dc/parameters_filtering.py
index f4930347..2341c79a 100644
--- a/ac_dc/parameters_filtering.py
+++ b/ac_dc/parameters_filtering.py
@@ -1,7 +1,6 @@
 import string
 import emoji
 
-
 main_special_characters = string.punctuation + string.digits + string.whitespace
 other_special_characters = (
     "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
diff --git a/ac_dc/visualization/get_data_for_visualization.py b/ac_dc/visualization/get_data_for_visualization.py
index 55c241f7..f529db16 100644
--- a/ac_dc/visualization/get_data_for_visualization.py
+++ b/ac_dc/visualization/get_data_for_visualization.py
@@ -90,9 +90,9 @@ def compute_stats(self):
                 )
                 for n in range(2, 16)
             }
-            stats_document[
-                "character_repetition_ratio"
-            ] = character_repetition_ratios
stats_document["character_repetition_ratio"] = ( + character_repetition_ratios + ) word_repetition_ratios = { n: round( diff --git a/ac_dc/visualization/visualization.py b/ac_dc/visualization/visualization.py index 3e532aa4..1900e297 100644 --- a/ac_dc/visualization/visualization.py +++ b/ac_dc/visualization/visualization.py @@ -290,16 +290,16 @@ def get_cond(key, cutoff, max_cutoff): "stopwords_ratio" ] for i in range(len(self.docs["stopwords_ratio"])): - self.docs["stopwords_ratio"].iloc[ - i - ] = Filtering.compute_stopwords_ratio( - self.docs["text"].iloc[i], - self.sentencepiece_model_tok, - self.param["strip_characters"], - self.param["cond_words_augmentation"], - self.param["words_augmentation_group_sizes"], - self.param["words_augmentation_join_char"], - new_stopwords, + self.docs["stopwords_ratio"].iloc[i] = ( + Filtering.compute_stopwords_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_stopwords, + ) ) cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed." cutoff_stopwords_ratio = st.slider( @@ -326,16 +326,16 @@ def get_cond(key, cutoff, max_cutoff): "flagged_words_ratio" ] for i in range(len(self.docs["flagged_words_ratio"])): - self.docs["flagged_words_ratio"].iloc[ - i - ] = Filtering.compute_flagged_words_ratio( - self.docs["text"].iloc[i], - self.sentencepiece_model_tok, - self.param["strip_characters"], - self.param["cond_words_augmentation"], - self.param["words_augmentation_group_sizes"], - self.param["words_augmentation_join_char"], - new_flagged_words, + self.docs["flagged_words_ratio"].iloc[i] = ( + Filtering.compute_flagged_words_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_flagged_words, + ) ) cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed." max_fwr = np.max(self.docs["flagged_words_ratio"]) diff --git a/bertin/evaluation/run_glue.py b/bertin/evaluation/run_glue.py index a08cba25..a28775f2 100644 --- a/bertin/evaluation/run_glue.py +++ b/bertin/evaluation/run_glue.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" + # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import logging @@ -384,9 +384,11 @@ def main(): # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
     config = AutoConfig.from_pretrained(
-        model_args.config_name
-        if model_args.config_name
-        else model_args.model_name_or_path,
+        (
+            model_args.config_name
+            if model_args.config_name
+            else model_args.model_name_or_path
+        ),
         num_labels=num_labels,
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
@@ -394,9 +396,11 @@ def main():
         use_auth_token=True if model_args.use_auth_token else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name
-        if model_args.tokenizer_name
-        else model_args.model_name_or_path,
+        (
+            model_args.tokenizer_name
+            if model_args.tokenizer_name
+            else model_args.model_name_or_path
+        ),
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
diff --git a/bertin/evaluation/run_ner.py b/bertin/evaluation/run_ner.py
index dbd9cd9a..b19d0024 100644
--- a/bertin/evaluation/run_ner.py
+++ b/bertin/evaluation/run_ner.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +15,7 @@
 """
 Fine-tuning the library models for token classification.
 """
+
 # You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
 # comments.
 
@@ -364,9 +364,11 @@ def get_label_list(labels):
     # The .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
     config = AutoConfig.from_pretrained(
-        model_args.config_name
-        if model_args.config_name
-        else model_args.model_name_or_path,
+        (
+            model_args.config_name
+            if model_args.config_name
+            else model_args.model_name_or_path
+        ),
         num_labels=num_labels,
         label2id=label_to_id,
         id2label={i: l for l, i in label_to_id.items()},
@@ -636,9 +638,9 @@ def compute_metrics(p):
         kwargs["dataset_tags"] = data_args.dataset_name
         if data_args.dataset_config_name is not None:
             kwargs["dataset_args"] = data_args.dataset_config_name
-            kwargs[
-                "dataset"
-            ] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            kwargs["dataset"] = (
+                f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            )
         else:
             kwargs["dataset"] = data_args.dataset_name
diff --git a/bertin/mc4/mc4.py b/bertin/mc4/mc4.py
index 923e5e20..e02cb932 100644
--- a/bertin/mc4/mc4.py
+++ b/bertin/mc4/mc4.py
@@ -1,6 +1,5 @@
 """Perplexity Sampled mC4 dataset based on Common Crawl."""
 
-
 import gzip
 import json
 
@@ -404,7 +403,7 @@ def _generate_examples(self, filepaths):
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
             if filepath.endswith("jsonl"):
-                with open(filepath, "r", encoding="utf-8") as f:
+                with open(filepath, encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
diff --git a/bertin/run_mlm_flax.py b/bertin/run_mlm_flax.py
index 54251b94..68747cd8 100644
--- a/bertin/run_mlm_flax.py
+++ b/bertin/run_mlm_flax.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +19,7 @@
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=masked-lm
 """
+
 import logging
 import os
 import sys
diff --git a/bertin/run_mlm_flax_stream.py b/bertin/run_mlm_flax_stream.py
index a33eaae1..3087aecb 100644
--- a/bertin/run_mlm_flax_stream.py
+++ b/bertin/run_mlm_flax_stream.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +19,7 @@
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=masked-lm
 """
+
 import json
 import logging
 import os
@@ -446,7 +446,7 @@ def restore_checkpoint(save_dir, state):
     args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
     data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))
 
-    with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+    with open(os.path.join(save_dir, "training_state.json")) as f:
         training_state = json.load(f)
     step = training_state["step"]
diff --git a/bertin/utils/dataset_perplexity.py b/bertin/utils/dataset_perplexity.py
index 2ca470c8..ecf02308 100644
--- a/bertin/utils/dataset_perplexity.py
+++ b/bertin/utils/dataset_perplexity.py
@@ -17,7 +17,7 @@ def get_perplexity(doc):
 
 
 with open("mc4-es-train-50M-stats.csv", "w") as csv:
-    with open("mc4-es-train-50M-steps.jsonl", "r") as data:
+    with open("mc4-es-train-50M-steps.jsonl") as data:
         for line in tqdm(data):
             text = json.loads(line)["text"]
             csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
diff --git a/cc_pseudo_crawl/python_scripts/deeper.py b/cc_pseudo_crawl/python_scripts/deeper.py
index ac630f9b..86070c00 100644
--- a/cc_pseudo_crawl/python_scripts/deeper.py
+++ b/cc_pseudo_crawl/python_scripts/deeper.py
@@ -1,6 +1,7 @@
 """
 Generate list of urls to query for next depth. We then need to use Athena to make a fancy query.
""" + import csv import re import subprocess diff --git a/cc_pseudo_crawl/python_scripts/download_warc.py b/cc_pseudo_crawl/python_scripts/download_warc.py index a5699c76..0ba1bde1 100644 --- a/cc_pseudo_crawl/python_scripts/download_warc.py +++ b/cc_pseudo_crawl/python_scripts/download_warc.py @@ -143,9 +143,9 @@ def get_warcs(batch): existing_compressed_warcs, ) - batch["compressed_warc"], batch["download_exception"] = [ + batch["compressed_warc"], batch["download_exception"] = ( list(l) for l in zip(*warcs_or_exceptions) - ] + ) return batch diff --git a/cc_pseudo_crawl/python_scripts/exact_deduplicates.py b/cc_pseudo_crawl/python_scripts/exact_deduplicates.py index 15eca712..b6703217 100644 --- a/cc_pseudo_crawl/python_scripts/exact_deduplicates.py +++ b/cc_pseudo_crawl/python_scripts/exact_deduplicates.py @@ -1,4 +1,5 @@ """Taken from Teven and Leandro""" + import gzip import os import shutil @@ -10,7 +11,6 @@ from datasets import load_dataset, Features from datasets.utils.logging import set_verbosity_info - set_verbosity_info() logger = logging.getLogger(__name__) diff --git a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py index 1753acf8..f605e7b9 100644 --- a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py +++ b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py @@ -21,7 +21,7 @@ def main(): seed_ids = [] for seed_path in args.seed_paths: - with open(seed_path, "r") as fi: + with open(seed_path) as fi: data = csv.reader(fi) # First line is all the headers that we remove. seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0] diff --git a/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py b/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py index f2cea5de..c672c889 100644 --- a/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py +++ b/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py @@ -126,7 +126,7 @@ def process_batch(batch, skip_set): # looks at up to the first 10K pages for a seed and # records lines that appear in at least 1% of the unique pages def get_lines_to_skip(dset, n_records, pourcentage_threshold, min_repetition_threshold): - line_counts = defaultdict(lambda: 0) + line_counts = defaultdict(int) seen_pages = set() seed = SeedSequence(42) diff --git a/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py b/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py index 8c1d0a8e..b3e0d93f 100644 --- a/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py +++ b/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py @@ -1,6 +1,7 @@ """ Deduplicating using `datasets` is much harder, we but we forgot to generate an id when building an index, so we're screwed. """ + import logging import subprocess from argparse import ArgumentParser diff --git a/kenlm_training/cc_net/execution.py b/kenlm_training/cc_net/execution.py index 6ab09a56..f88d6c93 100644 --- a/kenlm_training/cc_net/execution.py +++ b/kenlm_training/cc_net/execution.py @@ -19,8 +19,7 @@ class Executor(Protocol): - def __call__(self, function: Callable[..., str], *args: Iterable) -> None: - ... + def __call__(self, function: Callable[..., str], *args: Iterable) -> None: ... 
 
 
 class SubmititRetryOnTimeout(submitit.helpers.Checkpointable):
diff --git a/kenlm_training/cc_net/flat_hash_set.py b/kenlm_training/cc_net/flat_hash_set.py
index f7529fe9..40eae07b 100644
--- a/kenlm_training/cc_net/flat_hash_set.py
+++ b/kenlm_training/cc_net/flat_hash_set.py
@@ -29,23 +29,17 @@ def __repr__(self):
         implementation = type(self).__name__
         return f"[{implementation}, len: {len(self)}"
 
-    def __len__(self) -> int:
-        ...
+    def __len__(self) -> int: ...
 
-    def __contains__(self, values: Sequence[np.uint64]) -> np.ndarray:
-        ...
+    def __contains__(self, values: Sequence[np.uint64]) -> np.ndarray: ...
 
-    def __getitem__(self, values) -> np.ndarray:
-        ...
+    def __getitem__(self, values) -> np.ndarray: ...
 
-    def __setitem__(self, keys, values) -> None:
-        ...
+    def __setitem__(self, keys, values) -> None: ...
 
-    def items(self) -> Iterable[Tuple[np.uint64, np.uint8]]:
-        ...
+    def items(self) -> Iterable[Tuple[np.uint64, np.uint8]]: ...
 
-    def keys(self) -> Iterable[np.uint64]:
-        ...
+    def keys(self) -> Iterable[np.uint64]: ...
 
     def __iter__(self) -> Iterator[np.uint64]:
         return iter(self.keys())
diff --git a/kenlm_training/cc_net/jsonql.py b/kenlm_training/cc_net/jsonql.py
index 0ff57f23..04b71c89 100644
--- a/kenlm_training/cc_net/jsonql.py
+++ b/kenlm_training/cc_net/jsonql.py
@@ -7,6 +7,7 @@
 """
 Manipulate files containing one json per line.
 """
+
 import argparse
 import collections
 import contextlib
@@ -290,7 +291,7 @@ def __getstate__(self) -> Tuple[tuple, dict, bool]:
     def __setstate__(self, state: Tuple[tuple, dict, bool]):
         if self.warn_when_pickling:
             warnings.warn(f"Unpickling transformer: {type(self)}. This can be slow.")
-        (args, kwargs, expect_json) = state
+        args, kwargs, expect_json = state
         # When unpickling `__new__` isn't called so we have to doit ourselves.
         Transformer.__init__(self, state_args=args, state_kwargs=kwargs)
         type(self).__init__(self, *args, **kwargs)
@@ -880,8 +881,7 @@ def describe(source, columns=None, weights=None, **kwargs):
             continue
         if "." in k or k == ALL_DOCUMENTS:
             continue
-        for line in display_stats(stats, k, weights=weights, **kwargs):
-            yield line
+        yield from display_stats(stats, k, weights=weights, **kwargs)
 
 
 def shard(lines):
@@ -902,17 +902,13 @@ def get_or_set(dictionary, key, default):
 
 
 class SimpleIO(Protocol):
     """A subset of methods from TextIO."""
 
-    def close(self) -> None:
-        ...
+    def close(self) -> None: ...
 
-    def write(self, line: str) -> int:
-        ...
+    def write(self, line: str) -> int: ...
 
-    def __enter__(self) -> "SimpleIO":
-        ...
+    def __enter__(self) -> "SimpleIO": ...
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        ...
+    def __exit__(self, exc_type, exc_value, traceback): ...
 
 
 def open_read(filename: ReadableFileLike) -> Iterable[str]:
@@ -961,7 +957,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
         if filename.suffix == ".gz":
             file: TextIO = gzip.open(filename, "rt")  # type: ignore
         else:
-            file = open(filename, "rt")
+            file = open(filename)
 
     return _close_when_exhausted(file)
@@ -1015,7 +1011,7 @@ def open_write(
     if filename.suffix == ".gz":
         return BlockedGzipWriter(Path(filename), mode, block_size="64M")
 
-    return open(filename, "wt")
+    return open(filename, "w")
 
 
 def parse_size(size):
diff --git a/kenlm_training/cc_net/tools/expand_corpus.py b/kenlm_training/cc_net/tools/expand_corpus.py
index 46d16bc4..1c5f9c41 100644
--- a/kenlm_training/cc_net/tools/expand_corpus.py
+++ b/kenlm_training/cc_net/tools/expand_corpus.py
@@ -277,7 +277,7 @@ def mine(
     print("Submited job array:", jobs[0])
 
     for j in submitit.helpers.as_completed(jobs):
-        (i, o) = j.result()
+        i, o = j.result()
         print("Mined sentences from", i, "to", o)
 
     return outputs
diff --git a/kenlm_training/tests/test_jsonql.py b/kenlm_training/tests/test_jsonql.py
index 7d9768e7..ba6ea138 100644
--- a/kenlm_training/tests/test_jsonql.py
+++ b/kenlm_training/tests/test_jsonql.py
@@ -209,7 +209,7 @@ def test_split_file_middle_of_line(tmp_path: Path):
 
 
 def test_split_file_middle_of_char(tmp_path: Path):
     file = tmp_path / "test.txt"
-    content = "Hello\U0001F40D\nWorld\n"
+    content = "Hello\U0001f40d\nWorld\n"
     #        split is here       ^^
     with open(file, "w") as o:
@@ -262,7 +262,7 @@ def do(self, x):
 def acc(values):
     print("acc: started")
     res = 0
-    for (x, _) in values:
+    for x, _ in values:
         res += int(x)
     print("acc: done")
     yield f"acc: result={res}"
diff --git a/perplexity_lenses/perplexity_lenses/data.py b/perplexity_lenses/perplexity_lenses/data.py
index 778749de..1d7781f2 100644
--- a/perplexity_lenses/perplexity_lenses/data.py
+++ b/perplexity_lenses/perplexity_lenses/data.py
@@ -34,9 +34,11 @@ def hub_dataset_to_dataframe(
                 {
                     text_column: sentence,
                     "perplexity": model.get_perplexity(sentence),
-                    "label": x.get("labels", [])[0]
-                    if len(x.get("labels", [])) > 0
-                    else "NONE",  # Special case for registry dataset
+                    "label": (
+                        x.get("labels", [])[0]
+                        if len(x.get("labels", [])) > 0
+                        else "NONE"
+                    ),  # Special case for registry dataset
                 }
                 for sentence in x[text_column].split("\n")
             ]
@@ -46,9 +48,9 @@ def hub_dataset_to_dataframe(
             lambda x: {
                 text_column: x[text_column],
                 "perplexity": model.get_perplexity(x[text_column]),
-                "label": x.get("labels", [])[0]
-                if len(x.get("labels", [])) > 0
-                else "NONE",  # Special case for registry dataset
+                "label": (
+                    x.get("labels", [])[0] if len(x.get("labels", [])) > 0 else "NONE"
+                ),  # Special case for registry dataset
             }
         )
     instances = []
diff --git a/pii-manager/setup.py b/pii-manager/setup.py
index c5b0714b..4c369204 100644
--- a/pii-manager/setup.py
+++ b/pii-manager/setup.py
@@ -27,7 +27,7 @@
 
 def requirements(filename="requirements.txt"):
     """Read the requirements file"""
-    with io.open(filename, "r") as f:
+    with open(filename) as f:
         return [line.strip() for line in f if line and line[0] != "#"]
 
 
@@ -35,7 +35,7 @@ def long_description():
     """
    Take the README and remove markdown hyperlinks
     """
-    with open("README.md", "rt", encoding="utf-8") as f:
+    with open("README.md", encoding="utf-8") as f:
         desc = f.read()
         desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
         return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)
diff --git a/pii-manager/src/pii_manager/api/manager.py b/pii-manager/src/pii_manager/api/manager.py
index cdb3d7dd..3d32c851 100644
--- a/pii-manager/src/pii_manager/api/manager.py
+++ b/pii-manager/src/pii_manager/api/manager.py
@@ -15,7 +15,6 @@
 from ..helper.exception import InvArgException
 from ..lang import LANG_ANY, COUNTRY_ANY
 
-
 DEFAULT_TEMPLATES = {"replace": "<{name}>", "tag": "<{name}:{value}>"}
 
 
@@ -31,13 +30,11 @@ def fetch_all_tasks(
     """
     taskdict = get_taskdict(debug=debug)
     # Language-independent
-    for task in taskdict[LANG_ANY].values():
-        yield task
+    yield from taskdict[LANG_ANY].values()
 
     langdict = taskdict.get(lang, {})
     # Country-independent
-    for task in langdict.get(COUNTRY_ANY, {}).values():
-        yield task
+    yield from langdict.get(COUNTRY_ANY, {}).values()
     # Country-specific
     if country:
         if country[0] in (COUNTRY_ANY, "all"):
@@ -45,8 +42,7 @@ def fetch_all_tasks(
         for c in country:
             if c == COUNTRY_ANY:  # already included above
                 continue
-            for task in langdict.get(c, {}).values():
-                yield task
+            yield from langdict.get(c, {}).values()
 
 
 def fetch_task(
@@ -166,9 +162,7 @@ def __init__(
         self._process = (
             self.process_full
             if self.mode == "full"
-            else self.process_extract
-            if self.mode == "extract"
-            else self.process_subst
+            else self.process_extract if self.mode == "extract" else self.process_subst
         )
 
     def __repr__(self) -> str:
diff --git a/pii-manager/src/pii_manager/helper/base.py b/pii-manager/src/pii_manager/helper/base.py
index 013b4549..93fa55b2 100644
--- a/pii-manager/src/pii_manager/helper/base.py
+++ b/pii-manager/src/pii_manager/helper/base.py
@@ -10,7 +10,6 @@
 from .context import context_spec, context_check, CONTEXT_NORM_OPTIONS
 from .exception import PiiUnimplemented
 
-
 NORM_OPTIONS = dict(whitespace=True, lowercase=True)
diff --git a/pii-manager/src/pii_manager/helper/context.py b/pii-manager/src/pii_manager/helper/context.py
index 4b508055..d82976ad 100644
--- a/pii-manager/src/pii_manager/helper/context.py
+++ b/pii-manager/src/pii_manager/helper/context.py
@@ -9,7 +9,6 @@
 from .exception import InvArgException
 from .normalizer import normalize
 
-
 # Default width around a Pii where context is searched for
 DEFAULT_CONTEXT_WIDTH = 64
diff --git a/pii-manager/src/pii_manager/helper/json.py b/pii-manager/src/pii_manager/helper/json.py
index 54694d67..6f453515 100644
--- a/pii-manager/src/pii_manager/helper/json.py
+++ b/pii-manager/src/pii_manager/helper/json.py
@@ -3,7 +3,6 @@
 in particular PiiEntity objects
 """
 
-
 from collections.abc import Iterator
 import datetime
 import json
diff --git a/pii-manager/src/pii_manager/helper/taskdict.py b/pii-manager/src/pii_manager/helper/taskdict.py
index c733f1ff..fd6db31b 100644
--- a/pii-manager/src/pii_manager/helper/taskdict.py
+++ b/pii-manager/src/pii_manager/helper/taskdict.py
@@ -172,11 +172,11 @@ def build_subdict(task_list: List[Tuple], lang: str, country: str = None) -> Dic
         task_type = (
             "PiiTask"
             if _is_pii_class(src[1])
-            else "callable"
-            if callable(src[1])
-            else "regex"
-            if isinstance(src[1], str)
-            else None
+            else (
+                "callable"
+                if callable(src[1])
+                else "regex" if isinstance(src[1], str) else None
+            )
         )
         # Build the dict
         td = {"pii": src[0], "type": task_type, "task": src[1]}
diff --git a/pii-manager/src/pii_manager/lang/any/credit_card.py b/pii-manager/src/pii_manager/lang/any/credit_card.py
index cc2db7d0..d523e270 100644
--- a/pii-manager/src/pii_manager/lang/any/credit_card.py
+++ b/pii-manager/src/pii_manager/lang/any/credit_card.py
@@ -15,7 +15,6 @@
 from pii_manager import PiiEnum, PiiEntity
 from pii_manager.helper import BasePiiTask
 
-
 # ----------------------------------------------------------------------------
 # base regex to detect candidates to credit card numbers
diff --git a/pii-manager/src/pii_manager/lang/any/email.py b/pii-manager/src/pii_manager/lang/any/email.py
index bd7d8b81..924660a2 100644
--- a/pii-manager/src/pii_manager/lang/any/email.py
+++ b/pii-manager/src/pii_manager/lang/any/email.py
@@ -4,7 +4,6 @@
 
 from pii_manager import PiiEnum
 
-
 _EMAIL_PATTERN = r"[\w\.=-]+ @ [\w\.-]+ \. [\w]{2,3}"
diff --git a/pii-manager/src/pii_manager/lang/any/ip_address.py b/pii-manager/src/pii_manager/lang/any/ip_address.py
index 67089ac9..bbe057b7 100644
--- a/pii-manager/src/pii_manager/lang/any/ip_address.py
+++ b/pii-manager/src/pii_manager/lang/any/ip_address.py
@@ -4,7 +4,6 @@
 
 from pii_manager import PiiEnum
 
-
 _IP_PATTERN = r"""
   \b (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \. ){3}
diff --git a/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py b/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py
index 48c284b3..9d7bbf14 100644
--- a/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py
+++ b/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py
@@ -3,7 +3,6 @@
 prefix and country code)
 """
 
-
 from pii_manager import PiiEnum
 
 PATTERN_INT_PHONE = r"""
diff --git a/pii-manager/src/pii_manager/lang/en/au/abn.py b/pii-manager/src/pii_manager/lang/en/au/abn.py
index c11b9a8f..2c5e077e 100644
--- a/pii-manager/src/pii_manager/lang/en/au/abn.py
+++ b/pii-manager/src/pii_manager/lang/en/au/abn.py
@@ -2,6 +2,7 @@
 Detection and validation of Australian business number (ABN).
 """
+
 import re
 
 from stdnum.au import abn
@@ -10,7 +11,6 @@
 
 from pii_manager import PiiEnum
 
-
 _ABN_PATTERN = r"\b (?: \d{2} \s \d{3} \s \d{3} \s \d{3} | \d{11} ) \b"
 _ABN_REGEX = re.compile(_ABN_PATTERN, flags=re.X)
diff --git a/pii-manager/src/pii_manager/lang/en/au/tfn.py b/pii-manager/src/pii_manager/lang/en/au/tfn.py
index 3f2384dc..ee198ea0 100644
--- a/pii-manager/src/pii_manager/lang/en/au/tfn.py
+++ b/pii-manager/src/pii_manager/lang/en/au/tfn.py
@@ -2,6 +2,7 @@
 Detection and validation of Australian Tax File Number (TFN).
""" + import re from stdnum.au import tfn @@ -10,7 +11,6 @@ from pii_manager import PiiEnum - _TFN_PATTERN = r"\b (?: \d{3} \s \d{3} \s \d{3} | \d{8,9} ) \b" _TFN_REGEX = re.compile(_TFN_PATTERN, flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py b/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py index fa3be47c..31007927 100644 --- a/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py +++ b/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py @@ -12,7 +12,6 @@ from pii_manager import PiiEnum - _SIN_REGEX = re.compile(r"\d{3}[-\ ]\d{3}[-\ ]\d{3}", flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py b/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py index d0569666..733ff1b8 100644 --- a/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py +++ b/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py @@ -12,7 +12,6 @@ from pii_manager import PiiEnum - _AADHAAR_REGEX = re.compile(r"[2-9]\d{3}\ ?\d{4}\ ?\d{4}", flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/en/us/social_security_number.py b/pii-manager/src/pii_manager/lang/en/us/social_security_number.py index 92b3dd33..bc2a6b83 100644 --- a/pii-manager/src/pii_manager/lang/en/us/social_security_number.py +++ b/pii-manager/src/pii_manager/lang/en/us/social_security_number.py @@ -7,7 +7,6 @@ from pii_manager import PiiEnum - _SSN_PATTERN = r"(?!000|666|333)0*(?:[0-6][0-9][0-9]|[0-7][0-6][0-9]|[0-7][0-7][0-2])[-\ ](?!00)[0-9]{2}[-\ ](?!0000)[0-9]{4}" diff --git a/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py b/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py index f82aa441..9a32feea 100644 --- a/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py +++ b/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py @@ -3,13 +3,11 @@ prefix and country code), for ES """ - from pii_manager import PiiEnum # The pattern for the regex is the same as for English from ...en.any.international_phone_number import PATTERN_INT_PHONE - PII_TASKS = [ { "pii": PiiEnum.PHONE_NUMBER, diff --git a/pii-manager/src/pii_manager/lang/es/mx/curp.py b/pii-manager/src/pii_manager/lang/es/mx/curp.py index 96453673..2ab11777 100644 --- a/pii-manager/src/pii_manager/lang/es/mx/curp.py +++ b/pii-manager/src/pii_manager/lang/es/mx/curp.py @@ -12,7 +12,6 @@ from pii_manager import PiiEnum - _CURP_PATTERN = r"[A-Z] [AEIOU] [A-Z]{2} \d{6} [HM] [A-Z]{5} [0-9A-Z] \d" _CURP_REGEX = re.compile(_CURP_PATTERN, flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py b/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py index e3c3d428..faaf4a63 100644 --- a/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py +++ b/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py @@ -1,4 +1,5 @@ """ Reuse the SIN code implemented for en """ + from pii_manager.lang.en.ca.social_insurance_number import PII_TASKS diff --git a/pii-manager/src/pii_manager/lang/pt/br/cpf.py b/pii-manager/src/pii_manager/lang/pt/br/cpf.py index 607de1f1..d82bc55b 100644 --- a/pii-manager/src/pii_manager/lang/pt/br/cpf.py +++ b/pii-manager/src/pii_manager/lang/pt/br/cpf.py @@ -13,7 +13,6 @@ from pii_manager import PiiEnum - _CPF_REGEX = re.compile(r"\d{3} \. \d{3} \. 
\d{3} - \d{2}", flags=re.X) diff --git a/pii-manager/src/pii_manager/lang/pt/pt/govid.py b/pii-manager/src/pii_manager/lang/pt/pt/govid.py index b87195d5..278c6b97 100644 --- a/pii-manager/src/pii_manager/lang/pt/pt/govid.py +++ b/pii-manager/src/pii_manager/lang/pt/pt/govid.py @@ -13,7 +13,6 @@ from pii_manager import PiiEnum, PiiEntity from pii_manager.helper import BasePiiTask - # regex for NIF & CC _NIF_PATTERN = r"(?: PT \x20?)? (?: \d{3} \x20 \d{3} \x20 \d{3} | \d{9} )" _CC_PATTERN = r"\d{8} \x20? \d \x20? [A-Z0-9]{2}\d" diff --git a/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py b/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py index eea1c277..f96d7221 100644 --- a/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py +++ b/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py @@ -11,7 +11,6 @@ from stdnum.cn import ric - # Detect candidates (separately) for RIC and passport-like numbers _GOV_ID_PATTERN = r"(? str: def readfile(name: str) -> str: - with open(name, "rt", encoding="utf-8") as f: + with open(name, encoding="utf-8") as f: return f.read().strip() diff --git a/pii-manager/test/unit/api/test_file_taskfile.py b/pii-manager/test/unit/api/test_file_taskfile.py index 722dfa8f..3371d7b7 100644 --- a/pii-manager/test/unit/api/test_file_taskfile.py +++ b/pii-manager/test/unit/api/test_file_taskfile.py @@ -14,7 +14,7 @@ def datafile(name: str) -> str: def readfile(name: str) -> str: - with open(name, "rt", encoding="utf-8") as f: + with open(name, encoding="utf-8") as f: return f.read().strip() diff --git a/pii-manager/test/unit/api/test_manager.py b/pii-manager/test/unit/api/test_manager.py index 5f74dbf6..a4b342ce 100644 --- a/pii-manager/test/unit/api/test_manager.py +++ b/pii-manager/test/unit/api/test_manager.py @@ -3,7 +3,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = ( "El número de la tarjeta de crédito es 4273 9666 4581 5642", "El número de la tarjeta de crédito es ", @@ -21,7 +20,10 @@ def test20_info(): info = obj.task_info() exp = { - (PiiEnum.CREDIT_CARD, None,): [ + ( + PiiEnum.CREDIT_CARD, + None, + ): [ ( "credit card", "Credit card numbers for most international credit cards (detect & validate)", diff --git a/pii-manager/test/unit/api/test_manager_add.py b/pii-manager/test/unit/api/test_manager_add.py index a61e5eee..a3116c46 100644 --- a/pii-manager/test/unit/api/test_manager_add.py +++ b/pii-manager/test/unit/api/test_manager_add.py @@ -9,7 +9,6 @@ from pii_manager.lang import COUNTRY_ANY from pii_manager.helper.base import BasePiiTask - # --------------------------------------------------------------------- DUMMY_REGEX = { @@ -47,7 +46,7 @@ def test110_call(): obj = PiiManager("en", None, PiiEnum.EMAIL_ADDRESS) obj.add_tasks([DUMMY_REGEX]) - for (doc, exp) in TEST_REGEX: + for doc, exp in TEST_REGEX: got = obj(doc) assert got == exp @@ -86,6 +85,6 @@ def test200_call(): obj = PiiManager("en") obj.add_tasks([DUMMY_CLASS]) - for (doc, exp) in TEST_CLASS: + for doc, exp in TEST_CLASS: got = obj(doc) assert got == exp diff --git a/pii-manager/test/unit/api/test_manager_ctx.py b/pii-manager/test/unit/api/test_manager_ctx.py index f74701fc..c475498e 100644 --- a/pii-manager/test/unit/api/test_manager_ctx.py +++ b/pii-manager/test/unit/api/test_manager_ctx.py @@ -38,7 +38,7 @@ def test10_context_regex(): """ obj = PiiManager("en", mode="extract") obj.add_tasks([DUMMY_REGEX]) - for (text, exp) in TEST: + for text, exp in TEST: got = obj(text) assert list(got) == exp @@ -64,6 +64,6 @@ def test20_context_class(): """ obj = PiiManager("en", 
mode="extract") obj.add_tasks([DUMMY_CLASS]) - for (text, exp) in TEST: + for text, exp in TEST: got = obj(text) assert list(got) == exp diff --git a/pii-manager/test/unit/helper/test_context.py b/pii-manager/test/unit/helper/test_context.py index 6a158864..39946f91 100644 --- a/pii-manager/test/unit/helper/test_context.py +++ b/pii-manager/test/unit/helper/test_context.py @@ -1,6 +1,7 @@ """ Test the context checking function """ + import pytest import pii_manager.helper.context as mod @@ -74,7 +75,7 @@ def test10_context_true(): """ Check valid contexts """ - for (text, context) in TEST_TRUE: + for text, context in TEST_TRUE: spec = mod.context_spec(context) assert mod.context_check(text, spec, 20) is True @@ -83,7 +84,7 @@ def test20_context_false(): """ Check invalid contexts """ - for (text, context) in TEST_FALSE: + for text, context in TEST_FALSE: spec = mod.context_spec(context) assert mod.context_check(text, spec, 20) is False diff --git a/pii-manager/test/unit/helper/test_norm.py b/pii-manager/test/unit/helper/test_norm.py index 0a1b73ba..20d72c50 100644 --- a/pii-manager/test/unit/helper/test_norm.py +++ b/pii-manager/test/unit/helper/test_norm.py @@ -1,6 +1,5 @@ import pii_manager.helper.normalizer as mod - TEST = [("the Social Security\nNumber is 34512", "the social security number is 34512")] @@ -8,5 +7,5 @@ def test10_normalizer(): """ Create base object """ - for (text, exp) in TEST: + for text, exp in TEST: assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp diff --git a/pii-manager/test/unit/lang/any/test_bitcoin_address.py b/pii-manager/test/unit/lang/any/test_bitcoin_address.py index 1f5da9f8..d65bc405 100644 --- a/pii-manager/test/unit/lang/any/test_bitcoin_address.py +++ b/pii-manager/test/unit/lang/any/test_bitcoin_address.py @@ -2,11 +2,9 @@ Test bitcoin addresses """ - from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid bitcoin address ( diff --git a/pii-manager/test/unit/lang/any/test_credit_card.py b/pii-manager/test/unit/lang/any/test_credit_card.py index 35a68465..fee41526 100644 --- a/pii-manager/test/unit/lang/any/test_credit_card.py +++ b/pii-manager/test/unit/lang/any/test_credit_card.py @@ -5,7 +5,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid credit card number ( diff --git a/pii-manager/test/unit/lang/any/test_email.py b/pii-manager/test/unit/lang/any/test_email.py index 835eb819..b4eff41d 100644 --- a/pii-manager/test/unit/lang/any/test_email.py +++ b/pii-manager/test/unit/lang/any/test_email.py @@ -5,7 +5,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid email address ( diff --git a/pii-manager/test/unit/lang/any/test_ip_address.py b/pii-manager/test/unit/lang/any/test_ip_address.py index cd040cf9..a14b355e 100644 --- a/pii-manager/test/unit/lang/any/test_ip_address.py +++ b/pii-manager/test/unit/lang/any/test_ip_address.py @@ -5,7 +5,6 @@ from pii_manager import PiiEnum from pii_manager.api import PiiManager - TEST = [ # A valid IP address ( diff --git a/requirements.txt b/requirements.txt index 24e425cf..8dd4d35b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ fasttext>=0.9.2 fsspec ftfy indexed_gzip>=1.6.1 -indexed_gzip>=1.6.1 langid>=1.1.6 nltk scikit-learn diff --git a/tokenizer/python_script/dedup_exact_article.py b/tokenizer/python_script/dedup_exact_article.py index e658f838..ef18e130 100644 --- a/tokenizer/python_script/dedup_exact_article.py +++ 
+++ b/tokenizer/python_script/dedup_exact_article.py
@@ -1,4 +1,5 @@
 """Taken from Teven and Leandro"""
+
 import gzip
 import os
 import shutil
@@ -9,7 +10,6 @@
 from datasets import load_from_disk
 from datasets.utils.logging import set_verbosity_info
 
-
 set_verbosity_info()
 
 logger = logging.getLogger(__name__)
diff --git a/tokenizer/python_script/dedup_lines.py b/tokenizer/python_script/dedup_lines.py
index ea3e4a81..f22f7a72 100644
--- a/tokenizer/python_script/dedup_lines.py
+++ b/tokenizer/python_script/dedup_lines.py
@@ -28,6 +28,7 @@
 
 META_COLUMNS = ["meta"]
 
+
 # filter text to remove certain lines (e.g. menu items, copyright notice)
 def filter_lines(article, skip_set, used_lines):
     # TODO discuss the strip
@@ -48,9 +49,11 @@ def filter_lines(article, skip_set, used_lines):
 def filter_lines_by_batch(texts, skip_set, used_lines, preserve_code, metadata=None):
     if preserve_code:
         filtered_lines = [
-            filter_lines(article, skip_set, used_lines)
-            if "lm_code" in eval(metadata_item)["source_dataset"]
-            else (article, "")
+            (
+                filter_lines(article, skip_set, used_lines)
+                if "lm_code" in eval(metadata_item)["source_dataset"]
+                else (article, "")
+            )
             for article, metadata_item in zip(texts, metadata)
         ]
     else:
@@ -86,8 +89,8 @@ def process_batch(batch, skip_set, used_lines, args):
 # looks at up to the first 10K pages for a seed and
 # records lines that appear in at least 1% of the unique pages
 def get_lines_to_skip(dset, n_records, pourcentage_threshold, min_repetition_threshold):
-    line_counts = defaultdict(lambda: 0)
-    seen_pages = defaultdict(lambda: 0)
+    line_counts = defaultdict(int)
+    seen_pages = defaultdict(int)
 
     seed = SeedSequence(42)
     rng = default_rng(seed)
diff --git a/tokenizer/python_script/ram_dedup_lines.py b/tokenizer/python_script/ram_dedup_lines.py
index 5e6f68b5..4690f93d 100644
--- a/tokenizer/python_script/ram_dedup_lines.py
+++ b/tokenizer/python_script/ram_dedup_lines.py
@@ -78,7 +78,7 @@ def main():
         f"Filtered out {number_of_samples_before - number_of_samples_after_filtering_none} / {number_of_samples_before}"
     )
 
-    seen = defaultdict(lambda: 0)
+    seen = defaultdict(int)
 
     def remove_duplicate_lines(examples):
         new_exemples = {"text": [], "meta": []}