diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 9871336d59..bf4381b3d0 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -399,6 +399,24 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + ## OpenDistroElasticsearchDocumentStore Objects @@ -614,6 +632,24 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + # Module sql @@ -803,6 +839,24 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + # Module faiss @@ -967,6 +1021,15 @@ None Delete all documents from the document store. 
+ +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete all documents from the document store. + #### query\_by\_embedding @@ -1191,6 +1254,25 @@ Delete all documents (from SQL AND Milvus). None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete all documents (from SQL AND Milvus). + +**Arguments**: + +- `index`: (SQL) index name for storing the docs and metadata +- `filters`: Optional filters to narrow down the search space. + Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + #### get\_all\_documents\_generator diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index 5079e6df13..472afc9c8e 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -87,26 +87,6 @@ Route files in an Indexing Pipeline to corresponding file converters. class TextConverter(BaseConverter) ``` - -#### \_\_init\_\_ - -```python - | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) -``` - -**Arguments**: - -- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. - The tabular structures in documents might be noise for the reader model if it - does not have table parsing capability for finding answers. However, tables - may also have long strings that could possible candidate for searching answers. - The rows containing strings are thus retained in this option. -- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 - (https://en.wikipedia.org/wiki/ISO_639-1) format. - This option can be used to add test for encoding errors. If the extracted text is - not one of the valid languages, then it might likely be encoding error resulting - in garbled text. 
- #### convert diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 2c17c2f9ba..f3331cb722 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -91,7 +91,7 @@ be passed. Here's a sample configuration: ```yaml - | version: '0.7' + | version: '0.8' | | components: # define all the building-blocks for Pipeline | - name: MyReader # custom-name for the component; helpful for visualization & debugging @@ -127,6 +127,20 @@ Here's a sample configuration: variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an `_` sign must be used to specify nested hierarchical properties. + +#### save\_to\_yaml + +```python + | save_to_yaml(path: Path, return_defaults: bool = False) +``` + +Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`. + +**Arguments**: + +- `path`: path of the output YAML file. +- `return_defaults`: whether to output parameters that have the default values. + ## BaseStandardPipeline Objects diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index 69718999f3..bee395b357 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -39,7 +39,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf #### \_\_init\_\_ ```python - | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True) + | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: 
int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0) ``` **Arguments**: @@ -78,6 +78,8 @@ and that FARM includes no_answer in the sorted list of predictions. - `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``) - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered. + The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. #### train diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index bd3989e5ee..31e70169f2 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim) #### train ```python - | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") + | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = 
True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") ``` train a DensePassageRetrieval model @@ -370,6 +370,12 @@ train a DensePassageRetrieval model - `epsilon`: epsilon parameter of optimizer - `weight_decay`: weight decay parameter of optimizer - `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done +- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are: + "O0" (FP32) + "O1" (Mixed Precision) + "O2" (Almost FP16) + "O3" (Pure FP16). + For more information, refer to: https://nvidia.github.io/apex/amp.html - `optimizer_name`: what optimizer to use (default: TransformersAdamW) - `num_warmup_steps`: number of warmup steps - `optimizer_correct_bias`: Whether to correct bias in optimizer diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md index 45bdeedaea..4f67f176e0 100644 --- a/docs/_src/api/api/translator.md +++ b/docs/_src/api/api/translator.md @@ -5,7 +5,7 @@ ## BaseTranslator Objects ```python -class BaseTranslator(ABC) +class BaseTranslator(BaseComponent) ``` Abstract class for a Translator component that translates either a query or a doc from language A to language B. 
diff --git a/haystack/document_store/memory.py b/haystack/document_store/memory.py index e8b57cbd95..73693a54e3 100644 --- a/haystack/document_store/memory.py +++ b/haystack/document_store/memory.py @@ -81,6 +81,9 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents] for document in documents_objects: + if document.id in self.indexes[index]: + # TODO Make error type consistent across document stores and add user options to deal with duplicate documents (ignore, overwrite, fail) + raise ValueError(f"Duplicate Documents: write_documents() failed - Document with id '{document.id}' already exists in index '{index}'") self.indexes[index][document.id] = document def _create_document_field_map(self): diff --git a/haystack/schema.py b/haystack/schema.py index af8361aac0..2450945f34 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -1,47 +1,66 @@ from typing import Any, Optional, Dict, List from uuid import uuid4 + +import mmh3 import numpy as np from abc import abstractmethod class Document: - def __init__(self, text: str, - id: Optional[str] = None, - score: Optional[float] = None, - probability: Optional[float] = None, - question: Optional[str] = None, - meta: Dict[str, Any] = None, - embedding: Optional[np.ndarray] = None): + def __init__( + self, + text: str, + id: Optional[str] = None, + score: Optional[float] = None, + probability: Optional[float] = None, + question: Optional[str] = None, + meta: Dict[str, Any] = None, + embedding: Optional[np.ndarray] = None, + id_hash_keys: Optional[List[str]] = None + ): """ - Object used to represent documents / passages in a standardized way within Haystack. - For example, this is what the retriever will return from the DocumentStore, - regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore. + One of the core data classes in Haystack.
It's used to represent documents / passages in a standardized way within Haystack. + Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in + many other places that manipulate or interact with document-level data. + + Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text + into smaller passages. We'll have one Document per passage in this case. + + Each document has a unique ID. This can be supplied by the user or generated automatically. + It's particularly helpful for handling duplicates and referencing documents in other objects (e.g. Labels) - Note that there can be multiple Documents originating from one file (e.g. PDF), - if you split the text into smaller passages. We'll have one Document per passage in this case. + There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`. - :param id: ID used within the DocumentStore :param text: Text of the document + :param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by + creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`. :param score: Retriever's query score for a retrieved document :param probability: a pseudo probability by scaling score in the range 0 to 1 - :param question: Question text for FAQs. + :param question: Question text (e.g. for FAQs where one document usually consists of one question and one answer text). :param meta: Meta fields for a document like name, url, or author. :param embedding: Vector encoding of the text + :param id_hash_keys: Generate the document id from a custom list of strings. + If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]).
""" self.text = text - # Create a unique ID (either new one, or one from user input) - if id: - self.id = str(id) - else: - self.id = str(uuid4()) - self.score = score self.probability = probability self.question = question self.meta = meta or {} self.embedding = embedding + # Create a unique ID (either new one, or one from user input) + if id: + self.id = str(id) + else: + self.id = self._get_id(id_hash_keys) + + def _get_id(self, id_hash_keys): + final_hash_key = ":".join(id_hash_keys) if id_hash_keys else self.text + return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) + def to_dict(self, field_map={}): inv_field_map = {v:k for k, v in field_map.items()} _doc: Dict[str, str] = {} diff --git a/requirements.txt b/requirements.txt index 5dfeb11ac6..7303dae0a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,5 @@ pymilvus # Optional: For crawling #selenium #webdriver-manager -SPARQLWrapper \ No newline at end of file +SPARQLWrapper +mmh3 \ No newline at end of file diff --git a/test/test_document_store.py b/test/test_document_store.py index b8950c79e0..856e671a04 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -31,6 +31,23 @@ def test_init_elastic_client(): _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test") +@pytest.mark.elasticsearch +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "sql", "milvus"], indirect=True) +def test_write_with_duplicate_doc_ids(document_store): + documents = [ + Document( + text="Doc1", + id_hash_keys=["key1"] + ), + Document( + text="Doc2", + id_hash_keys=["key1"] + ) + ] + with pytest.raises(Exception): + document_store.write_documents(documents) + + @pytest.mark.elasticsearch def test_get_all_documents_without_filters(document_store_with_docs): documents = document_store_with_docs.get_all_documents() @@ -41,19 +58,22 @@ def test_get_all_documents_without_filters(document_store_with_docs): 
@pytest.mark.elasticsearch -def test_get_all_document_filter_duplicate_value(document_store): +def test_get_all_document_filter_duplicate_text_value(document_store): documents = [ Document( text="Doc1", - meta={"f1": "0"} + meta={"f1": "0"}, + id_hash_keys=["Doc1", "1"] ), Document( text="Doc1", - meta={"f1": "1", "meta_id": "0"} + meta={"f1": "1", "meta_id": "0"}, + id_hash_keys=["Doc1", "2"] ), Document( text="Doc2", - meta={"f3": "0"} + meta={"f3": "0"}, + id_hash_keys=["Doc2", "3"] ) ] document_store.write_documents(documents) diff --git a/test/test_schema.py b/test/test_schema.py new file mode 100644 index 0000000000..205b87e502 --- /dev/null +++ b/test/test_schema.py @@ -0,0 +1,24 @@ +from haystack import Document + + +def test_generate_doc_id_using_text(): + text1 = "text1" + text2 = "text2" + doc1_text1 = Document(text=text1, meta={"name": "doc1"}) + doc2_text1 = Document(text=text1, meta={"name": "doc2"}) + doc3_text2 = Document(text=text2, meta={"name": "doc3"}) + + assert doc1_text1.id == doc2_text1.id + assert doc1_text1.id != doc3_text2.id + + +def test_generate_doc_id_using_custom_list(): + text1 = "text1" + text2 = "text2" + + doc1_text1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1]) + doc2_text1 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1]) + doc3_text2 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2]) + + assert doc1_text1.id == doc2_text1.id + assert doc1_text1.id != doc3_text2.id