diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 9871336d59..bf4381b3d0 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -399,6 +399,24 @@ Delete documents in an index. All documents are deleted if no filters are passed
None
+
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
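+A minimal usage sketch (assuming an existing `document_store` instance; the index name and filter values below are illustrative):
+
+```python
+# Delete only the documents whose "name" meta field matches one of the given values
+document_store.delete_documents(index="document", filters={"name": ["some", "more"]})
+```
+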
## OpenDistroElasticsearchDocumentStore Objects
@@ -614,6 +632,24 @@ Delete documents in an index. All documents are deleted if no filters are passed
None
+
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
# Module sql
@@ -803,6 +839,24 @@ Delete documents in an index. All documents are deleted if no filters are passed
None
+
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
# Module faiss
@@ -967,6 +1021,15 @@ None
Delete all documents from the document store.
+
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete all documents from the document store.
+
#### query\_by\_embedding
@@ -1191,6 +1254,25 @@ Delete all documents (from SQL AND Milvus).
None
+
+#### delete\_documents
+
+```python
+ | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
+```
+
+Delete all documents (from SQL AND Milvus).
+
+**Arguments**:
+
+- `index`: (SQL) index name for storing the docs and metadata
+- `filters`: Optional filters to narrow down the search space.
+ Example: {"name": ["some", "more"], "category": ["only_one"]}
+
+**Returns**:
+
+None
+
#### get\_all\_documents\_generator
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index 5079e6df13..472afc9c8e 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -87,26 +87,6 @@ Route files in an Indexing Pipeline to corresponding file converters.
class TextConverter(BaseConverter)
```
-
-#### \_\_init\_\_
-
-```python
- | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
-```
-
-**Arguments**:
-
-- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
- The tabular structures in documents might be noise for the reader model if it
- does not have table parsing capability for finding answers. However, tables
- may also have long strings that could possible candidate for searching answers.
- The rows containing strings are thus retained in this option.
-- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
- (https://en.wikipedia.org/wiki/ISO_639-1) format.
- This option can be used to add test for encoding errors. If the extracted text is
- not one of the valid languages, then it might likely be encoding error resulting
- in garbled text.
-
#### convert
diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md
index 2c17c2f9ba..f3331cb722 100644
--- a/docs/_src/api/api/pipelines.md
+++ b/docs/_src/api/api/pipelines.md
@@ -91,7 +91,7 @@ be passed.
Here's a sample configuration:
```yaml
- | version: '0.7'
+ | version: '0.8'
|
| components: # define all the building-blocks for Pipeline
| - name: MyReader # custom-name for the component; helpful for visualization & debugging
@@ -127,6 +127,20 @@ Here's a sample configuration:
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
`_` sign must be used to specify nested hierarchical properties.
+
+#### save\_to\_yaml
+
+```python
+ | save_to_yaml(path: Path, return_defaults: bool = False)
+```
+
+Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`.
+
+**Arguments**:
+
+- `path`: path of the output YAML file.
+- `return_defaults`: whether to include parameters that are set to their default values in the output YAML.
+
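+A minimal usage sketch (the file name is illustrative; `pipeline_name` may be required on load if the YAML defines multiple pipelines):
+
+```python
+from pathlib import Path
+
+# Persist the pipeline definition, then restore an equivalent pipeline later
+pipeline.save_to_yaml(Path("my_pipeline.yaml"))
+pipeline = Pipeline.load_from_yaml(Path("my_pipeline.yaml"))
+```
+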
## BaseStandardPipeline Objects
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md
index 69718999f3..bee395b357 100644
--- a/docs/_src/api/api/reader.md
+++ b/docs/_src/api/api/reader.md
@@ -39,7 +39,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
#### \_\_init\_\_
```python
- | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True)
+ | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0)
```
**Arguments**:
@@ -78,6 +78,8 @@ and that FARM includes no_answer in the sorted list of predictions.
- `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
- `progress_bar`: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered.
+ The higher the value, the farther apart two answers' positions may be while still being filtered out as duplicates. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
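+
+A minimal construction sketch (the model name is a public Hugging Face model; `duplicate_filtering=0` keeps only exact-duplicate removal):
+
+```python
+from haystack.reader.farm import FARMReader
+
+# Drop answers whose start AND end positions exactly match an already-kept answer
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", duplicate_filtering=0)
+```
+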
#### train
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index bd3989e5ee..31e70169f2 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
#### train
```python
- | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
+ | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
```
train a DensePassageRetrieval model
@@ -370,6 +370,12 @@ train a DensePassageRetrieval model
- `epsilon`: epsilon parameter of optimizer
- `weight_decay`: weight decay parameter of optimizer
- `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done
+- `use_amp`: Whether to use automatic mixed precision (AMP) training (see the sketch after this argument list). The options are:
+ "O0" (FP32)
+ "O1" (Mixed Precision)
+ "O2" (Almost FP16)
+ "O3" (Pure FP16).
+ For more information, refer to: https://nvidia.github.io/apex/amp.html
- `optimizer_name`: what optimizer to use (default: TransformersAdamW)
- `num_warmup_steps`: number of warmup steps
- `optimizer_correct_bias`: Whether to correct bias in optimizer
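+
+A hypothetical training call illustrating `use_amp` (assumes an initialized DensePassageRetriever as `retriever`; data paths and file names are placeholders, and "O1" requires NVIDIA Apex to be installed):
+
+```python
+retriever.train(
+    data_dir="data/dpr",
+    train_filename="train.json",
+    dev_filename="dev.json",
+    use_amp="O1",  # mixed-precision training
+)
+```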
diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md
index 45bdeedaea..4f67f176e0 100644
--- a/docs/_src/api/api/translator.md
+++ b/docs/_src/api/api/translator.md
@@ -5,7 +5,7 @@
## BaseTranslator Objects
```python
-class BaseTranslator(ABC)
+class BaseTranslator(BaseComponent)
```
Abstract class for a Translator component that translates either a query or a doc from language A to language B.
diff --git a/haystack/document_store/memory.py b/haystack/document_store/memory.py
index e8b57cbd95..73693a54e3 100644
--- a/haystack/document_store/memory.py
+++ b/haystack/document_store/memory.py
@@ -81,6 +81,9 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
for document in documents_objects:
+ if document.id in self.indexes[index]:
+ # TODO Make error type consistent across document stores and add user options to deal with duplicate documents (ignore, overwrite, fail)
+            raise ValueError(f"Duplicate Documents: write_documents() failed - Document with id '{document.id}' already exists in index '{index}'")
self.indexes[index][document.id] = document
def _create_document_field_map(self):
diff --git a/haystack/schema.py b/haystack/schema.py
index af8361aac0..2450945f34 100644
--- a/haystack/schema.py
+++ b/haystack/schema.py
@@ -1,47 +1,66 @@
from typing import Any, Optional, Dict, List
from uuid import uuid4
+
+import mmh3
import numpy as np
from abc import abstractmethod
class Document:
- def __init__(self, text: str,
- id: Optional[str] = None,
- score: Optional[float] = None,
- probability: Optional[float] = None,
- question: Optional[str] = None,
- meta: Dict[str, Any] = None,
- embedding: Optional[np.ndarray] = None):
+ def __init__(
+ self,
+ text: str,
+ id: Optional[str] = None,
+ score: Optional[float] = None,
+ probability: Optional[float] = None,
+ question: Optional[str] = None,
+ meta: Dict[str, Any] = None,
+ embedding: Optional[np.ndarray] = None,
+ id_hash_keys: Optional[List[str]] = None
+ ):
"""
- Object used to represent documents / passages in a standardized way within Haystack.
- For example, this is what the retriever will return from the DocumentStore,
- regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore.
+        One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way.
+        Documents are stored in DocumentStores, returned by Retrievers, used as input for Readers and handled in
+        many other places that manipulate or interact with document-level data.
+
+ Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
+ into smaller passages. We'll have one Document per passage in this case.
+
+        Each document has a unique ID. This can be supplied by the user or generated automatically.
+        It's particularly helpful for handling duplicates and for referencing documents in other objects (e.g. Labels).
- Note that there can be multiple Documents originating from one file (e.g. PDF),
- if you split the text into smaller passages. We'll have one Document per passage in this case.
+        There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`.
- :param id: ID used within the DocumentStore
:param text: Text of the document
+ :param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
+ creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
:param score: Retriever's query score for a retrieved document
:param probability: a pseudo probability by scaling score in the range 0 to 1
- :param question: Question text for FAQs.
+ :param question: Question text (e.g. for FAQs where one document usually consists of one question and one answer text).
:param meta: Meta fields for a document like name, url, or author.
:param embedding: Vector encoding of the text
+ :param id_hash_keys: Generate the document id from a custom list of strings.
+                             If you want to ensure that you don't have duplicate documents in your DocumentStore but their texts are
+                             not unique, you can provide custom strings here that will be used for the hash (e.g. ["filename_xy", "text_of_doc"]).
"""
self.text = text
- # Create a unique ID (either new one, or one from user input)
- if id:
- self.id = str(id)
- else:
- self.id = str(uuid4())
-
self.score = score
self.probability = probability
self.question = question
self.meta = meta or {}
self.embedding = embedding
+ # Create a unique ID (either new one, or one from user input)
+ if id:
+ self.id = str(id)
+ else:
+ self.id = self._get_id(id_hash_keys)
+
+ def _get_id(self, id_hash_keys):
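+        # Hash the ":"-joined custom keys if supplied, otherwise hash the document text itself,
+        # using a 128-bit MurmurHash3 rendered as a hex string.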
+ final_hash_key = ":".join(id_hash_keys) if id_hash_keys else self.text
+ return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
+
def to_dict(self, field_map={}):
inv_field_map = {v:k for k, v in field_map.items()}
_doc: Dict[str, str] = {}
diff --git a/requirements.txt b/requirements.txt
index 5dfeb11ac6..7303dae0a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,4 +29,5 @@ pymilvus
# Optional: For crawling
#selenium
#webdriver-manager
-SPARQLWrapper
\ No newline at end of file
+SPARQLWrapper
+mmh3
\ No newline at end of file
diff --git a/test/test_document_store.py b/test/test_document_store.py
index b8950c79e0..856e671a04 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -31,6 +31,23 @@ def test_init_elastic_client():
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")
+@pytest.mark.elasticsearch
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "sql", "milvus"], indirect=True)
+def test_write_with_duplicate_doc_ids(document_store):
+ documents = [
+ Document(
+ text="Doc1",
+ id_hash_keys=["key1"]
+ ),
+ Document(
+ text="Doc2",
+ id_hash_keys=["key1"]
+ )
+ ]
+ with pytest.raises(Exception):
+ document_store.write_documents(documents)
+
+
@pytest.mark.elasticsearch
def test_get_all_documents_without_filters(document_store_with_docs):
documents = document_store_with_docs.get_all_documents()
@@ -41,19 +58,22 @@ def test_get_all_documents_without_filters(document_store_with_docs):
@pytest.mark.elasticsearch
-def test_get_all_document_filter_duplicate_value(document_store):
+def test_get_all_document_filter_duplicate_text_value(document_store):
documents = [
Document(
text="Doc1",
- meta={"f1": "0"}
+ meta={"f1": "0"},
+ id_hash_keys=["Doc1", "1"]
),
Document(
text="Doc1",
- meta={"f1": "1", "meta_id": "0"}
+ meta={"f1": "1", "meta_id": "0"},
+ id_hash_keys=["Doc1", "2"]
),
Document(
text="Doc2",
- meta={"f3": "0"}
+ meta={"f3": "0"},
+ id_hash_keys=["Doc2", "3"]
)
]
document_store.write_documents(documents)
diff --git a/test/test_schema.py b/test/test_schema.py
new file mode 100644
index 0000000000..205b87e502
--- /dev/null
+++ b/test/test_schema.py
@@ -0,0 +1,24 @@
+from haystack import Document
+
+
+def test_generate_doc_id_using_text():
+ text1 = "text1"
+ text2 = "text2"
+ doc1_text1 = Document(text=text1, meta={"name": "doc1"})
+ doc2_text1 = Document(text=text1, meta={"name": "doc2"})
+ doc3_text2 = Document(text=text2, meta={"name": "doc3"})
+
+ assert doc1_text1.id == doc2_text1.id
+ assert doc1_text1.id != doc3_text2.id
+
+
+def test_generate_doc_id_using_custom_list():
+ text1 = "text1"
+ text2 = "text2"
+
+ doc1_text1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1])
+ doc2_text1 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1])
+ doc3_text2 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2])
+
+ assert doc1_text1.id == doc2_text1.id
+ assert doc1_text1.id != doc3_text2.id