From 8d495137e3178fd6d70181d633182468f6ef48ba Mon Sep 17 00:00:00 2001 From: lalitpagaria Date: Mon, 26 Apr 2021 23:30:17 +0200 Subject: [PATCH 1/8] using text hash as id to prevent document duplication. Also providing a way customize it. --- haystack/schema.py | 39 ++++++++++++++++++++++++++------------- requirements.txt | 3 ++- test/test_schema.py | 24 ++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 14 deletions(-) create mode 100644 test/test_schema.py diff --git a/haystack/schema.py b/haystack/schema.py index 6d2dcc0762..1ddf22042f 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -1,16 +1,23 @@ from typing import Any, Optional, Dict, List from uuid import uuid4 + +import mmh3 import numpy as np from abc import abstractmethod + class Document: - def __init__(self, text: str, - id: Optional[str] = None, - score: Optional[float] = None, - probability: Optional[float] = None, - question: Optional[str] = None, - meta: Dict[str, Any] = None, - embedding: Optional[np.ndarray] = None): + def __init__( + self, + text: str, + id: Optional[str] = None, + score: Optional[float] = None, + probability: Optional[float] = None, + question: Optional[str] = None, + meta: Dict[str, Any] = None, + embedding: Optional[np.ndarray] = None, + id_hash_keys: Optional[List[str]] = None + ): """ Object used to represent documents / passages in a standardized way within Haystack. For example, this is what the retriever will return from the DocumentStore, @@ -26,20 +33,26 @@ def __init__(self, text: str, :param question: Question text for FAQs. :param meta: Meta fields for a document like name, url, or author. 
:param embedding: Vector encoding of the text + :param id_hash_keys: Hash keys to be used for document id generation """ self.text = text - # Create a unique ID (either new one, or one from user input) - if id: - self.id = str(id) - else: - self.id = str(uuid4()) - self.score = score self.probability = probability self.question = question self.meta = meta or {} self.embedding = embedding + self.id_hash_keys = id_hash_keys + + # Create a unique ID (either new one, or one from user input) + if id: + self.id = str(id) + else: + self.id = self._get_id() + + def _get_id(self): + final_hash_key = ":".join(self.id_hash_keys) if self.id_hash_keys else self.text + return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) def to_dict(self, field_map={}): inv_field_map = {v:k for k, v in field_map.items()} diff --git a/requirements.txt b/requirements.txt index 5dfeb11ac6..7303dae0a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,5 @@ pymilvus # Optional: For crawling #selenium #webdriver-manager -SPARQLWrapper \ No newline at end of file +SPARQLWrapper +mmh3 \ No newline at end of file diff --git a/test/test_schema.py b/test/test_schema.py new file mode 100644 index 0000000000..d7cdb45ef7 --- /dev/null +++ b/test/test_schema.py @@ -0,0 +1,24 @@ +from haystack import Document + + +def test_generate_doc_id_using_text(): + text1 = "text1" + text2 = "text2" + doc1_text1 = Document(text=text1, meta={"name": "doc1"}) + doc2_text1 = Document(text=text1, meta={"name": "doc2"}) + doc3_text2 = Document(text=text2, meta={"name": "doc3"}) + + assert doc1_text1.id == doc2_text1.id + assert doc1_text1.id != doc3_text2.id + + +def test_generate_doc_id_using_custom_list(): + text1 = "text1" + text2 = "text2" + + doc1_text1_key1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1"]) + doc2_text1_key2 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key2"]) + doc3_text2_key1 = Document(text=text2, meta={"name": "doc3"}, 
id_hash_keys=["key1"]) + + assert doc1_text1_key1.id != doc2_text1_key2.id + assert doc1_text1_key1.id == doc3_text2_key1.id From d1abe5be9a8c0a4b6df90c0e1960a3205a696773 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 26 Apr 2021 21:31:16 +0000 Subject: [PATCH 2/8] Add latest docstring and tutorial changes --- docs/_src/api/api/document_store.md | 48 +++++++++++++++++++++++------ docs/_src/api/api/pipelines.md | 2 +- docs/_src/api/api/retriever.md | 3 +- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 8b6a128323..9871336d59 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -93,7 +93,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore) #### \_\_init\_\_ ```python - | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False) + | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = 
"text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False) ``` A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -110,6 +110,7 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea - `password`: password (standard authentication via http_auth) - `api_key_id`: ID of the API key (altenative authentication mode to the above http_auth) - `api_key`: Secret value of the API key (altenative authentication mode to the above http_auth) +- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) - `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. - `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. - `search_fields`: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] @@ -250,6 +251,15 @@ Return the number of documents in the document store. Return the number of labels in the document store + +#### get\_embedding\_count + +```python + | get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +``` + +Return the count of embeddings in the document store. + #### get\_all\_documents @@ -541,6 +551,15 @@ None Return the number of documents in the document store. 
+ +#### get\_embedding\_count + +```python + | get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int +``` + +Return the count of embeddings in the document store. + #### get\_label\_count @@ -653,13 +672,6 @@ Fetch documents by specifying a list of text id strings Fetch documents by specifying a list of text vector id strings -**Arguments**: - -- `vector_ids`: List of vector_id strings. -- `index`: Name of the index to get the documents from. If None, the -DocumentStore's default index (self.index) will be used. -- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. - #### get\_all\_documents\_generator @@ -813,7 +825,7 @@ the vector embeddings are indexed in a FAISS Index. #### \_\_init\_\_ ```python - | __init__(sql_url: str = "sqlite:///", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, **kwargs, ,) + | __init__(sql_url: str = "sqlite:///", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional["faiss.swigfaiss.Index"] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, **kwargs, ,) ``` **Arguments**: @@ -916,6 +928,15 @@ a large number of documents without having to load all documents in memory. - `return_embedding`: Whether to return the document embeddings. - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. 
+ +#### get\_embedding\_count + +```python + | get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +``` + +Return the count of embeddings in the document store. + #### train\_index @@ -1257,3 +1278,12 @@ Helper function to dump all vectors stored in Milvus server. List[np.array]: List of vectors. + +#### get\_embedding\_count + +```python + | get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +``` + +Return the count of embeddings in the document store. + diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 6ed6fce763..2c17c2f9ba 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -5,7 +5,7 @@ ## Pipeline Objects ```python -class Pipeline(ABC) +class Pipeline() ``` Pipeline brings together building blocks to build a complex search pipeline with Haystack & user-defined components. diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index e7310a3bea..bd3989e5ee 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim) #### train ```python - | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") + | train(data_dir: str, train_filename: str, dev_filename: str = None, 
test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") ``` train a DensePassageRetrieval model @@ -355,6 +355,7 @@ train a DensePassageRetrieval model - `train_filename`: training filename - `dev_filename`: development set filename, file to be used by model in eval step of training - `test_filename`: test set filename, file to be used by model in test step after training +- `max_sample`: maximum number of input samples to convert. Can be used for debugging a smaller dataset. - `max_processes`: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo. It can be set to 1 to disable the use of multiprocessing or make debugging easier. - `dev_split`: The proportion of the train set that will sliced. 
Only works if dev_filename is set to None From 3dd82e2438d26293807330b83db70e01342ce0f9 Mon Sep 17 00:00:00 2001 From: lalitpagaria Date: Tue, 27 Apr 2021 11:37:09 +0200 Subject: [PATCH 3/8] Fixing duplicate value test when text is same --- test/test_document_store.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index d42618927a..c249e64c24 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -41,19 +41,22 @@ def test_get_all_documents_without_filters(document_store_with_docs): @pytest.mark.elasticsearch -def test_get_all_document_filter_duplicate_value(document_store): +def test_get_all_document_filter_duplicate_text_value(document_store): documents = [ Document( text="Doc1", - meta={"f1": "0"} + meta={"f1": "0"}, + id_hash_keys=["Doc1", "1"] ), Document( text="Doc1", - meta={"f1": "1", "meta_id": "0"} + meta={"f1": "1", "meta_id": "0"}, + id_hash_keys=["Doc1", "2"] ), Document( text="Doc2", - meta={"f3": "0"} + meta={"f3": "0"}, + id_hash_keys=["Doc2", "3"] ) ] document_store.write_documents(documents) From d7c9be7c73a00a6aef576df91aaab61e73af108e Mon Sep 17 00:00:00 2001 From: lalitpagaria Date: Tue, 27 Apr 2021 19:46:01 +0200 Subject: [PATCH 4/8] Adding test for duplicate ids in document store --- test/test_document_store.py | 19 +++++++++++++++++++ test/test_schema.py | 10 +++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index c249e64c24..3a70ca76dc 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -1,6 +1,8 @@ import numpy as np import pytest from elasticsearch import Elasticsearch +from elasticsearch.helpers.errors import BulkIndexError +from sqlalchemy.exc import IntegrityError from conftest import get_document_store from haystack import Document, Label @@ -31,6 +33,23 @@ def test_init_elastic_client(): _ = 
ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test") +@pytest.mark.elasticsearch +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "sql", "milvus"], indirect=True) +def test_write_with_duplicate_doc_ids(document_store): + documents = [ + Document( + text="Doc1", + id_hash_keys=["key1"] + ), + Document( + text="Doc2", + id_hash_keys=["key1"] + ) + ] + with pytest.raises(IntegrityError or BulkIndexError): + document_store.write_documents(documents) + + @pytest.mark.elasticsearch def test_get_all_documents_without_filters(document_store_with_docs): documents = document_store_with_docs.get_all_documents() diff --git a/test/test_schema.py b/test/test_schema.py index d7cdb45ef7..205b87e502 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -16,9 +16,9 @@ def test_generate_doc_id_using_custom_list(): text1 = "text1" text2 = "text2" - doc1_text1_key1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1"]) - doc2_text1_key2 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key2"]) - doc3_text2_key1 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1"]) + doc1_text1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1]) + doc2_text1 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1]) + doc3_text2 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2]) - assert doc1_text1_key1.id != doc2_text1_key2.id - assert doc1_text1_key1.id == doc3_text2_key1.id + assert doc1_text1.id == doc2_text1.id + assert doc1_text1.id != doc3_text2.id From e6ce6a8aa421ef10f0a7af359ecb413f85de877f Mon Sep 17 00:00:00 2001 From: lalitpagaria Date: Tue, 27 Apr 2021 20:39:14 +0200 Subject: [PATCH 5/8] Changing exception to generic Exception type --- test/test_document_store.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index 
3a70ca76dc..75a4e0e413 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -1,8 +1,6 @@ import numpy as np import pytest from elasticsearch import Elasticsearch -from elasticsearch.helpers.errors import BulkIndexError -from sqlalchemy.exc import IntegrityError from conftest import get_document_store from haystack import Document, Label @@ -46,7 +44,7 @@ def test_write_with_duplicate_doc_ids(document_store): id_hash_keys=["key1"] ) ] - with pytest.raises(IntegrityError or BulkIndexError): + with pytest.raises(Exception): document_store.write_documents(documents) From 6db7c1d12dd1ec700581da19f2e752905ef47c43 Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Mon, 17 May 2021 13:27:00 +0200 Subject: [PATCH 6/8] add exception for inmemory. update docstring Document. remove id_hash_keys from object attribute --- haystack/document_store/memory.py | 3 +++ haystack/schema.py | 31 +++++++++++++++++++------------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/haystack/document_store/memory.py b/haystack/document_store/memory.py index 3a6c417d2e..f827bf5ea2 100644 --- a/haystack/document_store/memory.py +++ b/haystack/document_store/memory.py @@ -74,6 +74,9 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents] for document in documents_objects: + if document.id in self.indexes[index]: + # TODO Make error type consistent across document stores and add user options to deal with duplicate documents (ignore, overwrite, fail) + raise ValueError(f"Duplicate Documents: write_documents() failed - Document with id '{document.id} already exists in index '{index}'") self.indexes[index][document.id] = document def _create_document_field_map(self): diff --git a/haystack/schema.py b/haystack/schema.py index 1ddf22042f..9aeb6841eb 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -19,21 +19,29 @@ 
def __init__( id_hash_keys: Optional[List[str]] = None ): """ - Object used to represent documents / passages in a standardized way within Haystack. - For example, this is what the retriever will return from the DocumentStore, - regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore. + One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. + Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in + many other places that manipulate or interact with document-level data. - Note that there can be multiple Documents originating from one file (e.g. PDF), - if you split the text into smaller passages. We'll have one Document per passage in this case. + Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text + into smaller passages. We'll have one Document per passage in this case. + + Each document has a unique ID. This can be supplied by the user or generated automatically. + It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels). + + There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`. - :param id: ID used within the DocumentStore :param text: Text of the document + :param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by + creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`. :param score: Retriever's query score for a retrieved document :param probability: a pseudo probability by scaling score in the range 0 to 1 - :param question: Question text for FAQs. + :param question: Question text (e.g. for FAQs where one document usually consists of one question and one answer text). :param meta: Meta fields for a document like name, url, or author. 
:param embedding: Vector encoding of the text - :param id_hash_keys: Hash keys to be used for document id generation + :param id_hash_keys: Generate the document id from a custom list of strings. + If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]). """ self.text = text @@ -42,16 +50,15 @@ def __init__( self.question = question self.meta = meta or {} self.embedding = embedding - self.id_hash_keys = id_hash_keys # Create a unique ID (either new one, or one from user input) if id: self.id = str(id) else: - self.id = self._get_id() + self.id = self._get_id(id_hash_keys) - def _get_id(self): - final_hash_key = ":".join(self.id_hash_keys) if self.id_hash_keys else self.text + def _get_id(self, id_hash_keys): + final_hash_key = ":".join(id_hash_keys) if id_hash_keys else self.text return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) def to_dict(self, field_map={}): From 6d6457b1346d0c7d4dbc89508a2f6f4dfaf944bb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 17 May 2021 11:33:13 +0000 Subject: [PATCH 7/8] Add latest docstring and tutorial changes --- docs/_src/api/api/document_store.md | 82 +++++++++++++++++++++++++++++ docs/_src/api/api/file_converter.md | 20 ------- docs/_src/api/api/pipelines.md | 16 +++++- docs/_src/api/api/reader.md | 4 +- docs/_src/api/api/translator.md | 2 +- 5 files changed, 101 insertions(+), 23 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 9871336d59..bf4381b3d0 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -399,6 +399,24 @@ Delete documents in an index. 
All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + ## OpenDistroElasticsearchDocumentStore Objects @@ -614,6 +632,24 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + # Module sql @@ -803,6 +839,24 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + # Module faiss @@ -967,6 +1021,15 @@ None Delete all documents from the document store. + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete all documents from the document store. + #### query\_by\_embedding @@ -1191,6 +1254,25 @@ Delete all documents (from SQL AND Milvus). 
None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete all documents (from SQL AND Milvus). + +**Arguments**: + +- `index`: (SQL) index name for storing the docs and metadata +- `filters`: Optional filters to narrow down the search space. + Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + #### get\_all\_documents\_generator diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index 5079e6df13..472afc9c8e 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -87,26 +87,6 @@ Route files in an Indexing Pipeline to corresponding file converters. class TextConverter(BaseConverter) ``` - -#### \_\_init\_\_ - -```python - | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) -``` - -**Arguments**: - -- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. - The tabular structures in documents might be noise for the reader model if it - does not have table parsing capability for finding answers. However, tables - may also have long strings that could possible candidate for searching answers. - The rows containing strings are thus retained in this option. -- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 - (https://en.wikipedia.org/wiki/ISO_639-1) format. - This option can be used to add test for encoding errors. If the extracted text is - not one of the valid languages, then it might likely be encoding error resulting - in garbled text. - #### convert diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 2c17c2f9ba..f3331cb722 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -91,7 +91,7 @@ be passed. 
Here's a sample configuration: ```yaml - | version: '0.7' + | version: '0.8' | | components: # define all the building-blocks for Pipeline | - name: MyReader # custom-name for the component; helpful for visualization & debugging @@ -127,6 +127,20 @@ Here's a sample configuration: variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an `_` sign must be used to specify nested hierarchical properties. + +#### save\_to\_yaml + +```python + | save_to_yaml(path: Path, return_defaults: bool = False) +``` + +Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`. + +**Arguments**: + +- `path`: path of the output YAML file. +- `return_defaults`: whether to output parameters that have the default values. + ## BaseStandardPipeline Objects diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index 69718999f3..bee395b357 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -39,7 +39,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf #### \_\_init\_\_ ```python - | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True) + | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0) ``` **Arguments**: @@ -78,6 +78,8 @@ and that FARM 
includes no_answer in the sorted list of predictions. - `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``) - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered. + The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. #### train diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md index 45bdeedaea..4f67f176e0 100644 --- a/docs/_src/api/api/translator.md +++ b/docs/_src/api/api/translator.md @@ -5,7 +5,7 @@ ## BaseTranslator Objects ```python -class BaseTranslator(ABC) +class BaseTranslator(BaseComponent) ``` Abstract class for a Translator component that translates either a query or a doc from language A to language B. From 31a98eb69756f3cb5770e5a363032a6c95511568 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 17 May 2021 15:30:35 +0000 Subject: [PATCH 8/8] Add latest docstring and tutorial changes --- docs/_src/api/api/retriever.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index bd3989e5ee..31e70169f2 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -344,7 +344,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim) #### train ```python - | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: 
float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") + | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_sample: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") ``` train a DensePassageRetrieval model @@ -370,6 +370,12 @@ train a DensePassageRetrieval model - `epsilon`: epsilon parameter of optimizer - `weight_decay`: weight decay parameter of optimizer - `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done +- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are: + "O0" (FP32) + "O1" (Mixed Precision) + "O2" (Almost FP16) + "O3" (Pure FP16). + For more information, refer to: https://nvidia.github.io/apex/amp.html - `optimizer_name`: what optimizer to use (default: TransformersAdamW) - `num_warmup_steps`: number of warmup steps - `optimizer_correct_bias`: Whether to correct bias in optimizer