From 971547c49c00efdce3555b1547fc1505b90639fd Mon Sep 17 00:00:00 2001 From: Silvan Laube Date: Wed, 28 Feb 2024 17:53:42 +0100 Subject: [PATCH 1/2] add option to omit removing n-grams with space I'd much prefer an option to keep spaces over having to replace my spaces with a special character that is otherwise not used. If you do not want to add this change, please at least consider properly documenting this behaviour of removing n-grams with spaces in them; I just spent way too much time figuring out what was going on. --- polyfuzz/models/_tfidf.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py index e065f11..88ece54 100644 --- a/polyfuzz/models/_tfidf.py +++ b/polyfuzz/models/_tfidf.py @@ -37,12 +37,13 @@ class TFIDF(BaseMatcher): knn uses 1-nearest neighbor to extract the most similar strings it is significantly slower than both methods but requires little memory model_id: The name of the particular instance, used when comparing models + remove_space_ngrams: Remove n-grams that contain a space Usage: ```python from polymatcher.models import TFIDF - model = TFIDF(n_gram_range=(3, 3), clean_string=True, use_knn=False) + model = TFIDF(n_gram_range=(3, 3), clean_string=True) ``` """ def __init__(self, @@ -51,7 +52,8 @@ def __init__(self, min_similarity: float = 0.75, top_n: int = 1, cosine_method: str = "sparse", - model_id: str = None): + model_id: str = None + remove_space_ngrams = True): super().__init__(model_id) self.type = "TF-IDF" self.n_gram_range = n_gram_range @@ -61,6 +63,7 @@ def __init__(self, self.top_n = top_n self.vectorizer = None self.tf_idf_to = None + self.remove_space_ngrams = remove_space_ngrams def match(self, from_list: List[str], @@ -127,7 +130,10 @@ def _create_ngrams(self, string: str) -> List[str]: result = [] for n in range(self.n_gram_range[0], self.n_gram_range[1]+1): ngrams = zip(*[string[i:] for i in range(n)]) - ngrams = [''.join(ngram) for ngram 
in ngrams if ' ' not in ngram] + if self.remove_space_ngrams: + ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram] + else: + ngrams = [''.join(ngram) for ngram in ngrams] result.extend(ngrams) return result From 5e70154486881a80899df2b711d170880ab21532 Mon Sep 17 00:00:00 2001 From: Silvan Laube Date: Wed, 28 Feb 2024 17:57:10 +0100 Subject: [PATCH 2/2] fix typo (,) --- polyfuzz/models/_tfidf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py index 88ece54..a4213b2 100644 --- a/polyfuzz/models/_tfidf.py +++ b/polyfuzz/models/_tfidf.py @@ -52,7 +52,7 @@ def __init__(self, min_similarity: float = 0.75, top_n: int = 1, cosine_method: str = "sparse", - model_id: str = None + model_id: str = None, remove_space_ngrams = True): super().__init__(model_id) self.type = "TF-IDF"