Merged
82 changes: 82 additions & 0 deletions docs/_src/api/api/document_store.md
@@ -399,6 +399,24 @@ Delete documents in an index. All documents are deleted if no filters are passed

None

<a name="elasticsearch.ElasticsearchDocumentStore.delete_documents"></a>
#### delete\_documents

```python
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

**Arguments**:

- `index`: Index name to delete the documents from.
- `filters`: Optional filters to narrow down the documents to be deleted.

**Returns**:

None
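For illustration, the filter semantics can be sketched in plain Python. The documents and field names below are made up, and this is only a sketch of the `{"field": ["value1", "value2"]}` filter shape (shown in the Milvus section below), not the store's actual implementation:

```python
from typing import Dict, List

def matches_filters(meta: Dict[str, str], filters: Dict[str, List[str]]) -> bool:
    # A document matches when, for every filter key, its meta value is in
    # the allowed list of values for that key.
    return all(meta.get(key) in allowed for key, allowed in filters.items())

# Hypothetical documents and field names:
docs = [
    {"id": "1", "meta": {"category": "news"}},
    {"id": "2", "meta": {"category": "sports"}},
]

# delete_documents(filters={"category": ["news"]}) would keep only doc "2":
remaining = [d for d in docs if not matches_filters(d["meta"], {"category": ["news"]})]
```

Calling `delete_documents()` without `filters` corresponds to an empty filter dict, which matches every document.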

<a name="elasticsearch.OpenDistroElasticsearchDocumentStore"></a>
## OpenDistroElasticsearchDocumentStore Objects

@@ -614,6 +632,24 @@ Delete documents in an index. All documents are deleted if no filters are passed

None

<a name="memory.InMemoryDocumentStore.delete_documents"></a>
#### delete\_documents

```python
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

**Arguments**:

- `index`: Index name to delete the documents from.
- `filters`: Optional filters to narrow down the documents to be deleted.

**Returns**:

None

<a name="sql"></a>
# Module sql

@@ -803,6 +839,24 @@ Delete documents in an index. All documents are deleted if no filters are passed

None

<a name="sql.SQLDocumentStore.delete_documents"></a>
#### delete\_documents

```python
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

**Arguments**:

- `index`: Index name to delete the documents from.
- `filters`: Optional filters to narrow down the documents to be deleted.

**Returns**:

None

<a name="faiss"></a>
# Module faiss

@@ -967,6 +1021,15 @@ None

Delete all documents from the document store.

<a name="faiss.FAISSDocumentStore.delete_documents"></a>
#### delete\_documents

```python
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete all documents from the document store.

<a name="faiss.FAISSDocumentStore.query_by_embedding"></a>
#### query\_by\_embedding

@@ -1191,6 +1254,25 @@ Delete all documents (from SQL AND Milvus).

None

<a name="milvus.MilvusDocumentStore.delete_documents"></a>
#### delete\_documents

```python
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete all documents (from SQL AND Milvus).

**Arguments**:

- `index`: (SQL) index name for storing the docs and metadata
- `filters`: Optional filters to narrow down the search space.
Example: {"name": ["some", "more"], "category": ["only_one"]}

**Returns**:

None

<a name="milvus.MilvusDocumentStore.get_all_documents_generator"></a>
#### get\_all\_documents\_generator

20 changes: 0 additions & 20 deletions docs/_src/api/api/file_converter.md
@@ -87,26 +87,6 @@ Route files in an Indexing Pipeline to corresponding file converters.
class TextConverter(BaseConverter)
```

<a name="txt.TextConverter.__init__"></a>
#### \_\_init\_\_

```python
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```

**Arguments**:

- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
The tabular structures in documents might be noise for the reader model if it
does not have table-parsing capability for finding answers. However, tables
may also contain long strings that could be candidates for answers.
Rows containing strings are therefore retained when this option is enabled.
- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to add a test for encoding errors. If the extracted text is
not one of the valid languages, it is most likely an encoding error resulting
in garbled text.

<a name="txt.TextConverter.convert"></a>
#### convert

16 changes: 15 additions & 1 deletion docs/_src/api/api/pipelines.md
@@ -91,7 +91,7 @@ be passed.
Here's a sample configuration:

```yaml
| version: '0.7'
| version: '0.8'
|
| components: # define all the building-blocks for Pipeline
| - name: MyReader # custom-name for the component; helpful for visualization & debugging
@@ -127,6 +127,20 @@ Here's a sample configuration:
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
`_` sign must be used to specify nested hierarchical properties.

<a name="pipeline.Pipeline.save_to_yaml"></a>
#### save\_to\_yaml

```python
| save_to_yaml(path: Path, return_defaults: bool = False)
```

Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`.

**Arguments**:

- `path`: path of the output YAML file.
- `return_defaults`: whether to also output parameters that are at their default values.
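For example, saving a pipeline loaded from the sample configuration above with `return_defaults=False` would produce a file of the same shape, containing only explicitly set parameters. The component params below are illustrative, not prescribed output:

```yaml
version: '0.8'

components:
  - name: MyReader            # custom-name, as in the sample configuration above
    type: FARMReader
    params:
      model_name_or_path: deepset/roberta-base-squad2   # illustrative value
```

With `return_defaults=True`, every parameter of every component would be written out, defaults included.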

<a name="pipeline.BaseStandardPipeline"></a>
## BaseStandardPipeline Objects

4 changes: 3 additions & 1 deletion docs/_src/api/api/reader.md
Expand Up @@ -39,7 +39,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
#### \_\_init\_\_

```python
| __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True)
| __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0)
```

**Arguments**:
@@ -78,6 +78,8 @@ and that FARM includes no_answer in the sorted list of predictions.
- `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
- `progress_bar`: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
- `duplicate_filtering`: Answers are filtered based on their position. Both the start and end position of an answer are considered.
The higher the value, the farther apart two answers can be and still be filtered out as duplicates. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
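To make the effect concrete, here is a small stand-alone sketch of this filtering rule (a hypothetical helper, not the actual FARM implementation): two answer spans count as duplicates when both their start and end positions differ by at most `duplicate_filtering`:

```python
from typing import List, Tuple

def filter_duplicates(spans: List[Tuple[int, int]],
                      duplicate_filtering: int) -> List[Tuple[int, int]]:
    """Keep an answer span only if no already-kept span lies within
    `duplicate_filtering` positions at both its start and its end."""
    if duplicate_filtering == -1:  # -1 turns duplicate removal off
        return list(spans)
    kept: List[Tuple[int, int]] = []
    for start, end in spans:
        is_duplicate = any(
            abs(start - s) <= duplicate_filtering and abs(end - e) <= duplicate_filtering
            for s, e in kept
        )
        if not is_duplicate:
            kept.append((start, end))
    return kept

# Hypothetical character offsets of candidate answers:
spans = [(10, 20), (10, 20), (12, 22), (40, 50)]
```

With `duplicate_filtering=0` only the exact repeat of `(10, 20)` is dropped; with `duplicate_filtering=5`, `(12, 22)` is also treated as a duplicate of `(10, 20)`.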

<a name="farm.FARMReader.train"></a>
#### train
8 changes: 7 additions & 1 deletion docs/_src/api/api/retriever.md
@@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
#### train

```python
| train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
| train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
```

train a DensePassageRetrieval model
@@ -370,6 +370,12 @@ train a DensePassageRetrieval model
- `epsilon`: epsilon parameter of optimizer
- `weight_decay`: weight decay parameter of optimizer
- `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done
- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are:
"O0" (FP32)
"O1" (Mixed Precision)
"O2" (Almost FP16)
"O3" (Pure FP16).
For more information, refer to: https://nvidia.github.io/apex/amp.html
- `optimizer_name`: what optimizer to use (default: TransformersAdamW)
- `num_warmup_steps`: number of warmup steps
- `optimizer_correct_bias`: Whether to correct bias in optimizer
2 changes: 1 addition & 1 deletion docs/_src/api/api/translator.md
@@ -5,7 +5,7 @@
## BaseTranslator Objects

```python
class BaseTranslator(ABC)
class BaseTranslator(BaseComponent)
```

Abstract class for a Translator component that translates either a query or a doc from language A to language B.
3 changes: 3 additions & 0 deletions haystack/document_store/memory.py
@@ -81,6 +81,9 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]

for document in documents_objects:
if document.id in self.indexes[index]:
# TODO Make error type consistent across document stores and add user options to deal with duplicate documents (ignore, overwrite, fail)
                raise ValueError(f"Duplicate Documents: write_documents() failed - Document with id '{document.id}' already exists in index '{index}'")
self.indexes[index][document.id] = document

def _create_document_field_map(self):
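The duplicate check added above boils down to a membership test on the in-memory index, which is a dict keyed by document id. A minimal stand-alone sketch (hypothetical helper, not the actual `InMemoryDocumentStore`):

```python
# The in-memory index is a dict keyed by document id, so a second write
# with the same id is rejected.
index = {}

def write_document(doc_id, doc):
    if doc_id in index:
        raise ValueError(
            f"Duplicate Documents: write_documents() failed - "
            f"Document with id '{doc_id}' already exists"
        )
    index[doc_id] = doc

write_document("a1", {"text": "Doc1"})
try:
    write_document("a1", {"text": "Doc1 again"})  # same id -> rejected
except ValueError as err:
    error_message = str(err)
```

As the TODO in the diff notes, other document stores raise different error types for this case; the ValueError here is specific to the in-memory store.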
59 changes: 39 additions & 20 deletions haystack/schema.py
@@ -1,47 +1,66 @@
from typing import Any, Optional, Dict, List
from uuid import uuid4

import mmh3
import numpy as np
from abc import abstractmethod


class Document:
def __init__(self, text: str,
id: Optional[str] = None,
score: Optional[float] = None,
probability: Optional[float] = None,
question: Optional[str] = None,
meta: Dict[str, Any] = None,
embedding: Optional[np.ndarray] = None):
def __init__(
self,
text: str,
id: Optional[str] = None,
score: Optional[float] = None,
probability: Optional[float] = None,
question: Optional[str] = None,
meta: Dict[str, Any] = None,
embedding: Optional[np.ndarray] = None,
id_hash_keys: Optional[List[str]] = None
):
"""
Object used to represent documents / passages in a standardized way within Haystack.
For example, this is what the retriever will return from the DocumentStore,
regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore.
One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
many other places that manipulate or interact with document-level data.

Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
into smaller passages. We'll have one Document per passage in this case.

Each document has a unique ID. This can be supplied by the user or generated automatically.
It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)

Note that there can be multiple Documents originating from one file (e.g. PDF),
if you split the text into smaller passages. We'll have one Document per passage in this case.
        There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`.

:param id: ID used within the DocumentStore
:param text: Text of the document
:param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
:param score: Retriever's query score for a retrieved document
        :param probability: a pseudo probability obtained by scaling the score to the range 0 to 1
:param question: Question text for FAQs.
:param question: Question text (e.g. for FAQs where one document usually consists of one question and one answer text).
:param meta: Meta fields for a document like name, url, or author.
:param embedding: Vector encoding of the text
:param id_hash_keys: Generate the document id from a custom list of strings.
                             If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                             not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]).
"""

self.text = text
# Create a unique ID (either new one, or one from user input)
if id:
self.id = str(id)
else:
self.id = str(uuid4())

self.score = score
self.probability = probability
self.question = question
self.meta = meta or {}
self.embedding = embedding

# Create a unique ID (either new one, or one from user input)
if id:
self.id = str(id)
else:
self.id = self._get_id(id_hash_keys)

def _get_id(self, id_hash_keys):
final_hash_key = ":".join(id_hash_keys) if id_hash_keys else self.text
return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))

def to_dict(self, field_map={}):
inv_field_map = {v:k for k, v in field_map.items()}
_doc: Dict[str, str] = {}
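The id generation in `_get_id()` can be illustrated stand-alone. In this sketch, hashlib's 128-bit MD5 stands in for `mmh3.hash128` (so the digest values differ from Haystack's; only the key-joining and formatting logic is the same):

```python
import hashlib

def get_id(text, id_hash_keys=None):
    # Same joining logic as Document._get_id(); MD5 stands in for mmh3.hash128.
    final_hash_key = ":".join(id_hash_keys) if id_hash_keys else text
    digest = int.from_bytes(hashlib.md5(final_hash_key.encode("utf-8")).digest(), "big")
    return "{:02x}".format(digest)

# Two documents sharing id_hash_keys deliberately collide (the basis of
# duplicate detection), while hashing the text alone keeps them distinct:
id_a = get_id("Doc1", id_hash_keys=["key1"])
id_b = get_id("Doc2", id_hash_keys=["key1"])
```

This collision-by-design is exactly what `test_write_with_duplicate_doc_ids` in the test diff below relies on: both documents use `id_hash_keys=["key1"]`, hash to the same id, and the second write fails.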
3 changes: 2 additions & 1 deletion requirements.txt
@@ -29,4 +29,5 @@ pymilvus
# Optional: For crawling
#selenium
#webdriver-manager
SPARQLWrapper
SPARQLWrapper
mmh3
28 changes: 24 additions & 4 deletions test/test_document_store.py
@@ -31,6 +31,23 @@ def test_init_elastic_client():
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "sql", "milvus"], indirect=True)
def test_write_with_duplicate_doc_ids(document_store):
documents = [
Document(
text="Doc1",
id_hash_keys=["key1"]
),
Document(
text="Doc2",
id_hash_keys=["key1"]
)
]
with pytest.raises(Exception):
document_store.write_documents(documents)


@pytest.mark.elasticsearch
def test_get_all_documents_without_filters(document_store_with_docs):
documents = document_store_with_docs.get_all_documents()
@@ -41,19 +58,22 @@ def test_get_all_documents_without_filters(document_store_with_docs):


@pytest.mark.elasticsearch
def test_get_all_document_filter_duplicate_value(document_store):
def test_get_all_document_filter_duplicate_text_value(document_store):
documents = [
Document(
text="Doc1",
meta={"f1": "0"}
meta={"f1": "0"},
id_hash_keys=["Doc1", "1"]
),
Document(
text="Doc1",
meta={"f1": "1", "meta_id": "0"}
meta={"f1": "1", "meta_id": "0"},
id_hash_keys=["Doc1", "2"]
),
Document(
text="Doc2",
meta={"f3": "0"}
meta={"f3": "0"},
id_hash_keys=["Doc2", "3"]
)
]
document_store.write_documents(documents)