From 8d495137e3178fd6d70181d633182468f6ef48ba Mon Sep 17 00:00:00 2001
From: lalitpagaria <pagaria.lalit@gmail.com>
Date: Mon, 26 Apr 2021 23:30:17 +0200
Subject: [PATCH 1/8] using text hash as id to prevent document duplication.
 Also providing a way to customize it.

---
 haystack/schema.py  | 39 ++++++++++++++++++++++++++-------------
 requirements.txt    |  3 ++-
 test/test_schema.py | 24 ++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 14 deletions(-)
 create mode 100644 test/test_schema.py

diff --git a/haystack/schema.py b/haystack/schema.py
index 6d2dcc0762..1ddf22042f 100644
--- a/haystack/schema.py
+++ b/haystack/schema.py
@@ -1,16 +1,23 @@
 from typing import Any, Optional, Dict, List
 from uuid import uuid4
+
+import mmh3
 import numpy as np
 from abc import abstractmethod
 
+
 class Document:
-    def __init__(self, text: str,
-                 id: Optional[str] = None,
-                 score: Optional[float] = None,
-                 probability: Optional[float] = None,
-                 question: Optional[str] = None,
-                 meta: Dict[str, Any] = None,
-                 embedding: Optional[np.ndarray] = None):
+    def __init__(
+        self,
+        text: str,
+        id: Optional[str] = None,
+        score: Optional[float] = None,
+        probability: Optional[float] = None,
+        question: Optional[str] = None,
+        meta: Dict[str, Any] = None,
+        embedding: Optional[np.ndarray] = None,
+        id_hash_keys: Optional[List[str]] = None
+    ):
         """
         Object used to represent documents / passages in a standardized way within Haystack.
         For example, this is what the retriever will return from the DocumentStore,
@@ -26,20 +33,26 @@ def __init__(self, text: str,
         :param question: Question text for FAQs.
         :param meta: Meta fields for a document like name, url, or author.
         :param embedding: Vector encoding of the text
+        :param id_hash_keys: Hash keys to be used for document id generation
         """
 
         self.text = text
-        # Create a unique ID (either new one, or one from user input)
-        if id:
-            self.id = str(id)
-        else:
-            self.id = str(uuid4())
-
         self.score = score
         self.probability = probability
         self.question = question
         self.meta = meta or {}
         self.embedding = embedding
+        self.id_hash_keys = id_hash_keys
+
+        # Create a unique ID (either new one, or one from user input)
+        if id:
+            self.id = str(id)
+        else:
+            self.id = self._get_id()
+
+    def _get_id(self):
+        final_hash_key = ":".join(self.id_hash_keys) if self.id_hash_keys else self.text
+        return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
 
     def to_dict(self, field_map={}):
         inv_field_map = {v:k for k, v in field_map.items()}
diff --git a/requirements.txt b/requirements.txt
index 5dfeb11ac6..7303dae0a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,4 +29,5 @@ pymilvus
 # Optional: For crawling
 #selenium
 #webdriver-manager
-SPARQLWrapper
\ No newline at end of file
+SPARQLWrapper
+mmh3
\ No newline at end of file
diff --git a/test/test_schema.py b/test/test_schema.py
new file mode 100644
index 0000000000..d7cdb45ef7
--- /dev/null
+++ b/test/test_schema.py
@@ -0,0 +1,24 @@
+from haystack import Document
+
+
+def test_generate_doc_id_using_text():
+    text1 = "text1"
+    text2 = "text2"
+    doc1_text1 = Document(text=text1, meta={"name": "doc1"})
+    doc2_text1 = Document(text=text1, meta={"name": "doc2"})
+    doc3_text2 = Document(text=text2, meta={"name": "doc3"})
+
+    assert doc1_text1.id == doc2_text1.id
+    assert doc1_text1.id != doc3_text2.id
+
+
+def test_generate_doc_id_using_custom_list():
+    text1 = "text1"
+    text2 = "text2"
+
+    doc1_text1_key1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1"])
+    doc2_text1_key2 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key2"])
+    doc3_text2_key1 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1"])
+
+    assert doc1_text1_key1.id != doc2_text1_key2.id
+    assert doc1_text1_key1.id == doc3_text2_key1.id

From d1abe5be9a8c0a4b6df90c0e1960a3205a696773 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 26 Apr 2021 21:31:16 +0000
Subject: [PATCH 2/8] Add latest docstring and tutorial changes

---
 docs/_src/api/api/document_store.md | 48 +++++++++++++++++++++++------
 docs/_src/api/api/pipelines.md      |  2 +-
 docs/_src/api/api/retriever.md      |  3 +-
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 8b6a128323..9871336d59 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -93,7 +93,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
 #### \_\_init\_\_
 
 ```python
- | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False)
+ | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False)
 ```
 
 A DocumentStore using Elasticsearch to store and query the documents for our search.
@@ -110,6 +110,7 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
 - `password`: password (standard authentication via http_auth)
 - `api_key_id`: ID of the API key (altenative authentication mode to the above http_auth)
 - `api_key`: Secret value of the API key (altenative authentication mode to the above http_auth)
+- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
 - `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
 - `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
 - `search_fields`: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"]
@@ -250,6 +251,15 @@ Return the number of documents in the document store.
 
 Return the number of labels in the document store
 
+<a name="elasticsearch.ElasticsearchDocumentStore.get_embedding_count"></a>
+#### get\_embedding\_count
+
+```python
+ | get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
 <a name="elasticsearch.ElasticsearchDocumentStore.get_all_documents"></a>
 #### get\_all\_documents
 
@@ -541,6 +551,15 @@ None
 
 Return the number of documents in the document store.
 
+<a name="memory.InMemoryDocumentStore.get_embedding_count"></a>
+#### get\_embedding\_count
+
+```python
+ | get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
 <a name="memory.InMemoryDocumentStore.get_label_count"></a>
 #### get\_label\_count
 
@@ -653,13 +672,6 @@ Fetch documents by specifying a list of text id strings
 
 Fetch documents by specifying a list of text vector id strings
 
-**Arguments**:
-
-- `vector_ids`: List of vector_id strings.
-- `index`: Name of the index to get the documents from. If None, the
-DocumentStore's default index (self.index) will be used.
-- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
-
 <a name="sql.SQLDocumentStore.get_all_documents_generator"></a>
 #### get\_all\_documents\_generator
 
@@ -813,7 +825,7 @@ the vector embeddings are indexed in a FAISS Index.
 #### \_\_init\_\_
 
 ```python
- | __init__(sql_url: str = "sqlite:///", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, **kwargs, ,)
+ | __init__(sql_url: str = "sqlite:///", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional["faiss.swigfaiss.Index"] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, **kwargs, ,)
 ```
 
 **Arguments**:
@@ -916,6 +928,15 @@ a large number of documents without having to load all documents in memory.
 - `return_embedding`: Whether to return the document embeddings.
 - `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
 
+<a name="faiss.FAISSDocumentStore.get_embedding_count"></a>
+#### get\_embedding\_count
+
+```python
+ | get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
 <a name="faiss.FAISSDocumentStore.train_index"></a>
 #### train\_index
 
@@ -1257,3 +1278,12 @@ Helper function to dump all vectors stored in Milvus server.
 
 List[np.array]: List of vectors.
 
+<a name="milvus.MilvusDocumentStore.get_embedding_count"></a>
+#### get\_embedding\_count
+
+```python
+ | get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md
index 6ed6fce763..2c17c2f9ba 100644
--- a/docs/_src/api/api/pipelines.md
+++ b/docs/_src/api/api/pipelines.md
@@ -5,7 +5,7 @@
 ## Pipeline Objects
 
 ```python
-class Pipeline(ABC)
+class Pipeline()
 ```
 
 Pipeline brings together building blocks to build a complex search pipeline with Haystack & user-defined components.
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index e7310a3bea..bd3989e5ee 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
 #### train
 
 ```python
- | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
+ | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
 ```
 
 train a DensePassageRetrieval model
@@ -355,6 +355,7 @@ train a DensePassageRetrieval model
 - `train_filename`: training filename
 - `dev_filename`: development set filename, file to be used by model in eval step of training
 - `test_filename`: test set filename, file to be used by model in test step after training
+- `max_sample`: maximum number of input samples to convert. Can be used for debugging a smaller dataset.
 - `max_processes`: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
                       It can be set to 1 to disable the use of multiprocessing or make debugging easier.
 - `dev_split`: The proportion of the train set that will sliced. Only works if dev_filename is set to None

From 3dd82e2438d26293807330b83db70e01342ce0f9 Mon Sep 17 00:00:00 2001
From: lalitpagaria <pagaria.lalit@gmail.com>
Date: Tue, 27 Apr 2021 11:37:09 +0200
Subject: [PATCH 3/8] Fixing duplicate value test when text is same

---
 test/test_document_store.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/test/test_document_store.py b/test/test_document_store.py
index d42618927a..c249e64c24 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -41,19 +41,22 @@ def test_get_all_documents_without_filters(document_store_with_docs):
 
 
 @pytest.mark.elasticsearch
-def test_get_all_document_filter_duplicate_value(document_store):
+def test_get_all_document_filter_duplicate_text_value(document_store):
     documents = [
         Document(
             text="Doc1",
-            meta={"f1": "0"}
+            meta={"f1": "0"},
+            id_hash_keys=["Doc1", "1"]
         ),
         Document(
             text="Doc1",
-            meta={"f1": "1", "meta_id": "0"}
+            meta={"f1": "1", "meta_id": "0"},
+            id_hash_keys=["Doc1", "2"]
         ),
         Document(
             text="Doc2",
-            meta={"f3": "0"}
+            meta={"f3": "0"},
+            id_hash_keys=["Doc2", "3"]
         )
     ]
     document_store.write_documents(documents)

From d7c9be7c73a00a6aef576df91aaab61e73af108e Mon Sep 17 00:00:00 2001
From: lalitpagaria <pagaria.lalit@gmail.com>
Date: Tue, 27 Apr 2021 19:46:01 +0200
Subject: [PATCH 4/8] Adding test for duplicate ids in document store

---
 test/test_document_store.py | 19 +++++++++++++++++++
 test/test_schema.py         | 10 +++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/test/test_document_store.py b/test/test_document_store.py
index c249e64c24..3a70ca76dc 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 from elasticsearch import Elasticsearch
+from elasticsearch.helpers.errors import BulkIndexError
+from sqlalchemy.exc import IntegrityError
 
 from conftest import get_document_store
 from haystack import Document, Label
@@ -31,6 +33,23 @@ def test_init_elastic_client():
     _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")
 
 
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "sql", "milvus"], indirect=True)
+def test_write_with_duplicate_doc_ids(document_store):
+    documents = [
+        Document(
+            text="Doc1",
+            id_hash_keys=["key1"]
+        ),
+        Document(
+            text="Doc2",
+            id_hash_keys=["key1"]
+        )
+    ]
+    with pytest.raises(IntegrityError or BulkIndexError):
+        document_store.write_documents(documents)
+
+
 @pytest.mark.elasticsearch
 def test_get_all_documents_without_filters(document_store_with_docs):
     documents = document_store_with_docs.get_all_documents()
diff --git a/test/test_schema.py b/test/test_schema.py
index d7cdb45ef7..205b87e502 100644
--- a/test/test_schema.py
+++ b/test/test_schema.py
@@ -16,9 +16,9 @@ def test_generate_doc_id_using_custom_list():
     text1 = "text1"
     text2 = "text2"
 
-    doc1_text1_key1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1"])
-    doc2_text1_key2 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key2"])
-    doc3_text2_key1 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1"])
+    doc1_text1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1])
+    doc2_text1 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1])
+    doc3_text2 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2])
 
-    assert doc1_text1_key1.id != doc2_text1_key2.id
-    assert doc1_text1_key1.id == doc3_text2_key1.id
+    assert doc1_text1.id == doc2_text1.id
+    assert doc1_text1.id != doc3_text2.id

From e6ce6a8aa421ef10f0a7af359ecb413f85de877f Mon Sep 17 00:00:00 2001
From: lalitpagaria <pagaria.lalit@gmail.com>
Date: Tue, 27 Apr 2021 20:39:14 +0200
Subject: [PATCH 5/8] Changing exception to generic Exception type

---
 test/test_document_store.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/test_document_store.py b/test/test_document_store.py
index 3a70ca76dc..75a4e0e413 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 from elasticsearch import Elasticsearch
-from elasticsearch.helpers.errors import BulkIndexError
-from sqlalchemy.exc import IntegrityError
 
 from conftest import get_document_store
 from haystack import Document, Label
@@ -46,7 +44,7 @@ def test_write_with_duplicate_doc_ids(document_store):
             id_hash_keys=["key1"]
         )
     ]
-    with pytest.raises(IntegrityError or BulkIndexError):
+    with pytest.raises(Exception):
         document_store.write_documents(documents)
 
 

From 6db7c1d12dd1ec700581da19f2e752905ef47c43 Mon Sep 17 00:00:00 2001
From: Malte Pietsch <malte.pietsch@deepset.ai>
Date: Mon, 17 May 2021 13:27:00 +0200
Subject: [PATCH 6/8] add exception for inmemory. update docstring Document.
 remove id_hash_keys from object attribute

---
 haystack/document_store/memory.py |  3 +++
 haystack/schema.py                | 31 +++++++++++++++++++------------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/haystack/document_store/memory.py b/haystack/document_store/memory.py
index 3a6c417d2e..f827bf5ea2 100644
--- a/haystack/document_store/memory.py
+++ b/haystack/document_store/memory.py
@@ -74,6 +74,9 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
         documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
 
         for document in documents_objects:
+            if document.id in self.indexes[index]:
+                # TODO Make error type consistent across document stores and add user options to deal with duplicate documents (ignore, overwrite, fail)
+                raise ValueError(f"Duplicate Documents: write_documents() failed - Document with id '{document.id} already exists in index '{index}'")
             self.indexes[index][document.id] = document
 
     def _create_document_field_map(self):
diff --git a/haystack/schema.py b/haystack/schema.py
index 1ddf22042f..9aeb6841eb 100644
--- a/haystack/schema.py
+++ b/haystack/schema.py
@@ -19,21 +19,29 @@ def __init__(
         id_hash_keys: Optional[List[str]] = None
     ):
         """
-        Object used to represent documents / passages in a standardized way within Haystack.
-        For example, this is what the retriever will return from the DocumentStore,
-        regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore.
+        One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
+        Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
+        many other places that manipulate or interact with document-level data.
 
-        Note that there can be multiple Documents originating from one file (e.g. PDF),
-        if you split the text into smaller passages. We'll have one Document per passage in this case.
+        Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
+        into smaller passages. We'll have one Document per passage in this case.
+
+        Each document has a unique ID. This can be supplied by the user or generated automatically.
+        It's particularly helpful for handling duplicates and for referencing documents in other objects (e.g. Labels).
+
+        There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`.
 
-        :param id: ID used within the DocumentStore
         :param text: Text of the document
+        :param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
+                   creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
         :param score: Retriever's query score for a retrieved document
         :param probability: a pseudo probability by scaling score in the range 0 to 1
-        :param question: Question text for FAQs.
+        :param question: Question text (e.g. for FAQs where one document usually consists of one question and one answer text).
         :param meta: Meta fields for a document like name, url, or author.
         :param embedding: Vector encoding of the text
-        :param id_hash_keys: Hash keys to be used for document id generation
+        :param id_hash_keys: Generate the document id from a custom list of strings.
+                             If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+                             not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]).
         """
 
         self.text = text
@@ -42,16 +50,15 @@ def __init__(
         self.question = question
         self.meta = meta or {}
         self.embedding = embedding
-        self.id_hash_keys = id_hash_keys
 
         # Create a unique ID (either new one, or one from user input)
         if id:
             self.id = str(id)
         else:
-            self.id = self._get_id()
+            self.id = self._get_id(id_hash_keys)
 
-    def _get_id(self):
-        final_hash_key = ":".join(self.id_hash_keys) if self.id_hash_keys else self.text
+    def _get_id(self, id_hash_keys):
+        final_hash_key = ":".join(id_hash_keys) if id_hash_keys else self.text
         return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
 
     def to_dict(self, field_map={}):

From 6d6457b1346d0c7d4dbc89508a2f6f4dfaf944bb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 17 May 2021 11:33:13 +0000
Subject: [PATCH 7/8] Add latest docstring and tutorial changes

---
 docs/_src/api/api/document_store.md | 82 +++++++++++++++++++++++++++++
 docs/_src/api/api/file_converter.md | 20 -------
 docs/_src/api/api/pipelines.md      | 16 +++++-
 docs/_src/api/api/reader.md         |  4 +-
 docs/_src/api/api/translator.md     |  2 +-
 5 files changed, 101 insertions(+), 23 deletions(-)

diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 9871336d59..bf4381b3d0 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -399,6 +399,24 @@ Delete documents in an index. All documents are deleted if no filters are passed
 
 None
 
+<a name="elasticsearch.ElasticsearchDocumentStore.delete_documents"></a>
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
 <a name="elasticsearch.OpenDistroElasticsearchDocumentStore"></a>
 ## OpenDistroElasticsearchDocumentStore Objects
 
@@ -614,6 +632,24 @@ Delete documents in an index. All documents are deleted if no filters are passed
 
 None
 
+<a name="memory.InMemoryDocumentStore.delete_documents"></a>
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
 <a name="sql"></a>
 # Module sql
 
@@ -803,6 +839,24 @@ Delete documents in an index. All documents are deleted if no filters are passed
 
 None
 
+<a name="sql.SQLDocumentStore.delete_documents"></a>
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
 <a name="faiss"></a>
 # Module faiss
 
@@ -967,6 +1021,15 @@ None
 
 Delete all documents from the document store.
 
+<a name="faiss.FAISSDocumentStore.delete_documents"></a>
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete all documents from the document store.
+
 <a name="faiss.FAISSDocumentStore.query_by_embedding"></a>
 #### query\_by\_embedding
 
@@ -1191,6 +1254,25 @@ Delete all documents (from SQL AND Milvus).
 
 None
 
+<a name="milvus.MilvusDocumentStore.delete_documents"></a>
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete all documents (from SQL AND Milvus).
+
+**Arguments**:
+
+- `index`: (SQL) index name for storing the docs and metadata
+- `filters`: Optional filters to narrow down the search space.
+                Example: {"name": ["some", "more"], "category": ["only_one"]}
+
+**Returns**:
+
+None
+
 <a name="milvus.MilvusDocumentStore.get_all_documents_generator"></a>
 #### get\_all\_documents\_generator
 
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index 5079e6df13..472afc9c8e 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -87,26 +87,6 @@ Route files in an Indexing Pipeline to corresponding file converters.
 class TextConverter(BaseConverter)
 ```
 
-<a name="txt.TextConverter.__init__"></a>
-#### \_\_init\_\_
-
-```python
- | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
-```
-
-**Arguments**:
-
-- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
-                              The tabular structures in documents might be noise for the reader model if it
-                              does not have table parsing capability for finding answers. However, tables
-                              may also have long strings that could possible candidate for searching answers.
-                              The rows containing strings are thus retained in this option.
-- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
-                        (https://en.wikipedia.org/wiki/ISO_639-1) format.
-                        This option can be used to add test for encoding errors. If the extracted text is
-                        not one of the valid languages, then it might likely be encoding error resulting
-                        in garbled text.
-
 <a name="txt.TextConverter.convert"></a>
 #### convert
 
diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md
index 2c17c2f9ba..f3331cb722 100644
--- a/docs/_src/api/api/pipelines.md
+++ b/docs/_src/api/api/pipelines.md
@@ -91,7 +91,7 @@ be passed.
 Here's a sample configuration:
 
     ```yaml
-    |   version: '0.7'
+    |   version: '0.8'
     |
     |    components:    # define all the building-blocks for Pipeline
     |    - name: MyReader       # custom-name for the component; helpful for visualization & debugging
@@ -127,6 +127,20 @@ Here's a sample configuration:
                                      variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
                                      `_` sign must be used to specify nested hierarchical properties.
 
+<a name="pipeline.Pipeline.save_to_yaml"></a>
+#### save\_to\_yaml
+
+```python
+ | save_to_yaml(path: Path, return_defaults: bool = False)
+```
+
+Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`.
+
+**Arguments**:
+
+- `path`: path of the output YAML file.
+- `return_defaults`: whether to output parameters that have the default values.
+
 <a name="pipeline.BaseStandardPipeline"></a>
 ## BaseStandardPipeline Objects
 
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md
index 69718999f3..bee395b357 100644
--- a/docs/_src/api/api/reader.md
+++ b/docs/_src/api/api/reader.md
@@ -39,7 +39,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
 #### \_\_init\_\_
 
 ```python
- | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True)
+ | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0)
 ```
 
 **Arguments**:
@@ -78,6 +78,8 @@ and that FARM includes no_answer in the sorted list of predictions.
 - `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
 - `progress_bar`: Whether to show a tqdm progress bar or not.
                      Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered.
+                            The higher the value, the farther apart two answers can be and still be filtered out as duplicates. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
 
 <a name="farm.FARMReader.train"></a>
 #### train
diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md
index 45bdeedaea..4f67f176e0 100644
--- a/docs/_src/api/api/translator.md
+++ b/docs/_src/api/api/translator.md
@@ -5,7 +5,7 @@
 ## BaseTranslator Objects
 
 ```python
-class BaseTranslator(ABC)
+class BaseTranslator(BaseComponent)
 ```
 
 Abstract class for a Translator component that translates either a query or a doc from language A to language B.

From 31a98eb69756f3cb5770e5a363032a6c95511568 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 17 May 2021 15:30:35 +0000
Subject: [PATCH 8/8] Add latest docstring and tutorial changes

---
 docs/_src/api/api/retriever.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index bd3989e5ee..31e70169f2 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
 #### train
 
 ```python
- | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
+ | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
 ```
 
 train a DensePassageRetrieval model
@@ -370,6 +370,12 @@ train a DensePassageRetrieval model
 - `epsilon`: epsilon parameter of optimizer
 - `weight_decay`: weight decay parameter of optimizer
 - `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done
+- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are:
+            "O0" (FP32)
+            "O1" (Mixed Precision)
+            "O2" (Almost FP16)
+            "O3" (Pure FP16).
+            For more information, refer to: https://nvidia.github.io/apex/amp.html
 - `optimizer_name`: what optimizer to use (default: TransformersAdamW)
 - `num_warmup_steps`: number of warmup steps
 - `optimizer_correct_bias`: Whether to correct bias in optimizer