From c431412d7880ed8e3d2941fee098238adf73f787 Mon Sep 17 00:00:00 2001
From: Enkidu93
Date: Mon, 23 Jun 2025 16:31:39 -0400
Subject: [PATCH 1/4] Use 'src' and 'trg' mirroring silnlp when src and trg
 lang codes are the same

---
 .../huggingface/hugging_face_nmt_model_trainer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index 1192243e..7df61a90 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -158,7 +158,10 @@ def train(
         if isinstance(self._corpus, Dataset):
             train_dataset = self._corpus
         else:
-            train_dataset = self._corpus.filter_nonempty().to_hf_dataset(src_lang, tgt_lang)
+            if src_lang == tgt_lang:
+                train_dataset = self._corpus.filter_nonempty().to_hf_dataset("src", "trg")
+            else:
+                train_dataset = self._corpus.filter_nonempty().to_hf_dataset(src_lang, tgt_lang)
 
         def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes: List[str]) -> List[str]:
             vocab = tokenizer.get_vocab().keys()

From ab1210bb929d476772392b34c82a440f65f42175 Mon Sep 17 00:00:00 2001
From: Enkidu93
Date: Mon, 23 Jun 2025 17:13:16 -0400
Subject: [PATCH 2/4] Don't use local vars when finding missing characters

---
 .../hugging_face_nmt_model_trainer.py | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index 7df61a90..6a9c57da 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -154,14 +154,14 @@ def train(
         tgt_lang = self._tgt_lang
         if tgt_lang is None:
             tgt_lang = "tgt"
+        if src_lang == tgt_lang:
+            src_lang += "_src"
+            tgt_lang += "_trg"
 
         if isinstance(self._corpus, Dataset):
             train_dataset = self._corpus
         else:
-            if src_lang == tgt_lang:
-                train_dataset = self._corpus.filter_nonempty().to_hf_dataset("src", "trg")
-            else:
-                train_dataset = self._corpus.filter_nonempty().to_hf_dataset(src_lang, tgt_lang)
+            train_dataset = self._corpus.filter_nonempty().to_hf_dataset(src_lang, tgt_lang)
 
         def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes: List[str]) -> List[str]:
             vocab = tokenizer.get_vocab().keys()
@@ -209,13 +209,19 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
                 use_fast=True,
             )
             # using unofficially supported behavior to set the normalizer
+            lang_codes = []
            tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer  # type: ignore
             if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
-                lang_codes = [src_lang, tgt_lang]
+                if self._src_lang is not None:
+                    lang_codes.append(self._src_lang)
+                if self._tgt_lang is not None:
+                    lang_codes.append(self._tgt_lang)
             elif self._add_unk_src_tokens:
-                lang_codes = [src_lang]
+                if self._src_lang is not None:
+                    lang_codes.append(self._src_lang)
             else:
-                lang_codes = [tgt_lang]
+                if self._tgt_lang is not None:
+                    lang_codes.append(self._tgt_lang)
             missing_tokens = find_missing_characters(tokenizer, train_dataset, lang_codes)
             if missing_tokens:
                 tokenizer = add_tokens(tokenizer, missing_tokens)

From 39ca9cffedfef6e92beccaf28ecfa67b87e381aa Mon Sep 17 00:00:00 2001
From: Enkidu93
Date: Mon, 23 Jun 2025 17:24:43 -0400
Subject: [PATCH 3/4] Compress if statements

---
 .../huggingface/hugging_face_nmt_model_trainer.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index 6a9c57da..fcfd805b 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -211,17 +211,10 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
             # using unofficially supported behavior to set the normalizer
             lang_codes = []
             tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer  # type: ignore
-            if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
-                if self._src_lang is not None:
-                    lang_codes.append(self._src_lang)
-                if self._tgt_lang is not None:
-                    lang_codes.append(self._tgt_lang)
-            elif self._add_unk_src_tokens:
-                if self._src_lang is not None:
-                    lang_codes.append(self._src_lang)
-            else:
-                if self._tgt_lang is not None:
-                    lang_codes.append(self._tgt_lang)
+            if self._add_unk_src_tokens and self._src_lang is not None:
+                lang_codes.append(self._src_lang)
+            if self._add_unk_tgt_tokens and self._tgt_lang is not None:
+                lang_codes.append(self._tgt_lang)
             missing_tokens = find_missing_characters(tokenizer, train_dataset, lang_codes)
             if missing_tokens:
                 tokenizer = add_tokens(tokenizer, missing_tokens)

From 84ba0ac5930a286a986eae50ffbc65ab56a40bb9 Mon Sep 17 00:00:00 2001
From: Enkidu93
Date: Tue, 24 Jun 2025 13:07:49 -0400
Subject: [PATCH 4/4] Use src_lang/tgt_lang for missing characters since it
 operates on the same examples

---
 .../huggingface/hugging_face_nmt_model_trainer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index fcfd805b..2d8641c7 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -211,10 +211,10 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
             # using unofficially supported behavior to set the normalizer
             lang_codes = []
             tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer  # type: ignore
-            if self._add_unk_src_tokens and self._src_lang is not None:
-                lang_codes.append(self._src_lang)
-            if self._add_unk_tgt_tokens and self._tgt_lang is not None:
-                lang_codes.append(self._tgt_lang)
+            if self._add_unk_src_tokens:
+                lang_codes.append(src_lang)
+            if self._add_unk_tgt_tokens:
+                lang_codes.append(tgt_lang)
             missing_tokens = find_missing_characters(tokenizer, train_dataset, lang_codes)
             if missing_tokens:
                 tokenizer = add_tokens(tokenizer, missing_tokens)
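Reviewer note: a minimal standalone sketch of the net effect of the four patches. The helper
names (resolve_lang_columns), the simplified find_missing_characters signature, and the toy
vocabulary are illustrative stand-ins, not the library's API; the real trainer operates on a
tokenizer and an HF Dataset.

from typing import Iterable, List, Optional, Set, Tuple


def resolve_lang_columns(src_lang: Optional[str], tgt_lang: Optional[str]) -> Tuple[str, str]:
    # Fall back to "src"/"tgt" when a code is missing, and suffix "_src"/"_trg"
    # when both codes collide so the HF dataset gets two distinct translation
    # columns (PATCH 2/4 moved this disambiguation ahead of dataset construction).
    src = src_lang if src_lang is not None else "src"
    tgt = tgt_lang if tgt_lang is not None else "tgt"
    if src == tgt:
        src += "_src"
        tgt += "_trg"
    return src, tgt


def find_missing_characters(vocab: Set[str], examples: Iterable[dict], lang_codes: List[str]) -> List[str]:
    # Simplified stand-in for the trainer's helper: report characters that occur
    # in the selected translation columns but are absent from the vocabulary.
    seen: Set[str] = set()
    for example in examples:
        for code in lang_codes:
            seen.update(example["translation"][code])
    return sorted(seen - vocab)


# Identical source/target codes get disambiguated...
src_col, tgt_col = resolve_lang_columns("en", "en")
assert (src_col, tgt_col) == ("en_src", "en_trg")

# ...and, per PATCH 4/4, the missing-character scan must use the same resolved
# column names the dataset was built with, not the raw configured codes.
examples = [{"translation": {src_col: "héllo", tgt_col: "wörld"}}]
add_unk_src_tokens = add_unk_tgt_tokens = True
lang_codes: List[str] = []
if add_unk_src_tokens:
    lang_codes.append(src_col)
if add_unk_tgt_tokens:
    lang_codes.append(tgt_col)
print(find_missing_characters(set("helo word"), examples, lang_codes))  # ['é', 'ö']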