From 971547c49c00efdce3555b1547fc1505b90639fd Mon Sep 17 00:00:00 2001
From: Silvan Laube <silvanlaube@hotmail.com>
Date: Wed, 28 Feb 2024 17:53:42 +0100
Subject: [PATCH 1/2] add option to omit removing n-grams with space

I'd much rather have an option to keep spaces than have to replace my spaces with a special character that is otherwise not used.

If you do not want to add this change, please at least consider properly documenting this behaviour of removing n-grams with spaces in them; I just spent way too much time figuring out what was going on.
---
 polyfuzz/models/_tfidf.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py
index e065f11..88ece54 100644
--- a/polyfuzz/models/_tfidf.py
+++ b/polyfuzz/models/_tfidf.py
@@ -37,12 +37,13 @@ class TFIDF(BaseMatcher):
                         knn uses 1-nearest neighbor to extract the most similar strings
                         it is significantly slower than both methods but requires little memory
         model_id: The name of the particular instance, used when comparing models
+        remove_space_ngrams: Remove n-grams that contain a space
 
     Usage:
 
     ```python
     from polymatcher.models import TFIDF
-    model = TFIDF(n_gram_range=(3, 3), clean_string=True, use_knn=False)
+    model = TFIDF(n_gram_range=(3, 3), clean_string=True)
     ```
     """
     def __init__(self,
@@ -51,7 +52,8 @@ def __init__(self,
                  min_similarity: float = 0.75,
                  top_n: int = 1,
                  cosine_method: str = "sparse",
-                 model_id: str = None):
+                 model_id: str = None
+                 remove_space_ngrams = True):
         super().__init__(model_id)
         self.type = "TF-IDF"
         self.n_gram_range = n_gram_range
@@ -61,6 +63,7 @@ def __init__(self,
         self.top_n = top_n
         self.vectorizer = None
         self.tf_idf_to = None
+        self.remove_space_ngrams = remove_space_ngrams
 
     def match(self,
               from_list: List[str],
@@ -127,7 +130,10 @@ def _create_ngrams(self, string: str) -> List[str]:
         result = []
         for n in range(self.n_gram_range[0], self.n_gram_range[1]+1):
             ngrams = zip(*[string[i:] for i in range(n)])
-            ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram]
+            if self.remove_space_ngrams:
+                ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram]
+            else:
+                ngrams = [''.join(ngram) for ngram in ngrams]
             result.extend(ngrams)
 
         return result

From 5e70154486881a80899df2b711d170880ab21532 Mon Sep 17 00:00:00 2001
From: Silvan Laube <silvanlaube@hotmail.com>
Date: Wed, 28 Feb 2024 17:57:10 +0100
Subject: [PATCH 2/2] fix typo (,)

---
 polyfuzz/models/_tfidf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py
index 88ece54..a4213b2 100644
--- a/polyfuzz/models/_tfidf.py
+++ b/polyfuzz/models/_tfidf.py
@@ -52,7 +52,7 @@ def __init__(self,
                  min_similarity: float = 0.75,
                  top_n: int = 1,
                  cosine_method: str = "sparse",
-                 model_id: str = None
+                 model_id: str = None,
                  remove_space_ngrams = True):
         super().__init__(model_id)
         self.type = "TF-IDF"