55 changes: 11 additions & 44 deletions haystack/__init__.py
@@ -38,6 +38,7 @@ def get_answers(self, question, top_k_reader=1, top_k_retriever=10, filters=None
:return:
"""

# 1) Optional: reduce the search space via document tags
if filters:
query = """
SELECT id FROM document WHERE id in (
@@ -61,49 +62,15 @@ def get_answers(self, question, top_k_reader=1, top_k_retriever=10, filters=None
else:
candidate_doc_ids = None

retrieved_scores = self.retriever.retrieve(question, top_k=top_k_retriever)
# 2) Apply retriever to get fast candidate paragraphs
paragraphs, meta_data = self.retriever.retrieve(question, top_k=top_k_retriever, candidate_doc_ids=candidate_doc_ids)

inference_dicts = self._convert_retrieved_text_to_reader_format(
retrieved_scores, question, candidate_doc_ids=candidate_doc_ids
)
results = self.reader.predict(inference_dicts, top_k=top_k_reader)
return results["results"]
# 3) Apply reader to get granular answer(s)
logger.info(f"Applying the reader now to look for the answer in detail ...")
results = self.reader.predict(question=question,
paragrahps=paragraphs,
meta_data_paragraphs=meta_data,
top_k=top_k_reader)

return results

def _convert_retrieved_text_to_reader_format(
self, retrieved_scores, question, candidate_doc_ids=None, verbose=True
):
"""
The reader expects the input as:
{
"text": "FARM is a home for all species of pretrained language models (e.g. BERT) that can be adapted to
different domain languages or down-stream tasks. With FARM you can easily create SOTA NLP models for tasks
like document classification, NER or question answering.",
"document_id": 127,
"questions" : ["What can you do with FARM?"]
}

:param retrieved_scores: tfidf scores as returned by the retriever
:param question: question string
:param verbose: enable verbose logging
"""
df_sliced = self.retriever.df.loc[retrieved_scores.keys()]
if verbose:
logger.info(
f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
)
logger.info(
f"Applying the reader now to look for the answer in detail ..."
)
inference_dicts = []
for idx, row in df_sliced.iterrows():
if candidate_doc_ids and row["document_id"] not in candidate_doc_ids:
continue
inference_dicts.append(
{
"text": row["text"],
"document_id": row["document_id"],
"questions": [question],
}
)

return inference_dicts
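For orientation, a minimal usage sketch of the refactored three-step flow (the class names, wiring, and model directory below are assumptions inferred from the module paths, not part of this diff):

```python
from haystack import Finder
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever

# hypothetical wiring: a tf-idf retriever over the indexed DB, a FARM reader on top
retriever = TfidfRetriever()
retriever.fit()  # build the tf-idf matrix from the paragraphs in the DB
reader = FARMReader(model_dir="saved_models/bert-english-qa")  # path is illustrative

finder = Finder(reader=reader, retriever=retriever)
results = finder.get_answers(question="Who is the father of Arya Stark?",
                             top_k_retriever=10, top_k_reader=3)
print(results["answers"][0]["answer"])
```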
17 changes: 15 additions & 2 deletions haystack/indexing/io.py
@@ -9,15 +9,28 @@
logger = logging.getLogger(__name__)


def write_documents_to_db(document_dir, clean_func=None):
def write_documents_to_db(document_dir, clean_func=None, only_empty_db=False):
"""
Write all text files (.txt) in the sub-directories of the given path to the connected database.

:param document_dir: path for the documents to be written to the database
:return:
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
:param only_empty_db: If True, docs will only be written if the DB is completely empty.
                      Useful to avoid indexing the same initial docs again and again.
:return: None
"""
file_paths = Path(document_dir).glob("**/*.txt")
n_docs = 0

# check if db has already docs
if only_empty_db:
n_docs = db.session.query(Document).count()
if n_docs > 0:
logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
"(Disable `only_empty_db`, if you want to add docs anyway.)")
return None

# read and add docs
for path in file_paths:
with open(path) as doc:
text = doc.read()
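The new guard makes it safe to call the indexing step unconditionally at startup; a short usage sketch (the directory path and cleaning function are illustrative):

```python
from haystack.indexing.io import write_documents_to_db

def clean_wiki(text):
    # illustrative cleaning function: collapse blank lines and trim whitespace
    return text.replace("\n\n", "\n").strip()

# With only_empty_db=True this becomes a no-op once the DB already holds docs,
# so repeated runs do not re-index the same initial documents.
write_documents_to_db("data/wiki_articles", clean_func=clean_wiki, only_empty_db=True)
```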
108 changes: 65 additions & 43 deletions haystack/reader/farm.py
@@ -18,6 +18,7 @@ def __init__(
no_answer_shift=-100,
batch_size=16,
use_gpu=True,
n_best_per_passage=2
):
"""
Load a saved FARM model in Inference mode.
@@ -27,56 +28,77 @@ def __init__(
self.model = Inferencer.load(model_dir, batch_size=batch_size, gpu=use_gpu)
self.model.model.prediction_heads[0].context_size = context_size
self.model.model.prediction_heads[0].no_answer_shift = no_answer_shift
self.model.model.prediction_heads[0].n_best = n_best_per_passage

def predict(self, input_dicts, top_k=None):

def predict(self, question, paragraphs, meta_data_paragraphs=None, top_k=None, max_processes=1):
"""
Run inference on the loaded model for the given input dicts.
Use loaded QA model to find answers for a question in the supplied paragraphs.

Returns dictionaries containing answers sorted by (desc.) probability
Example:
{'question': 'Who is the father of Arya Stark?',
'answers': [
{'answer': 'Eddard,',
'context': " She travels with her father, Eddard, to King's Landing when he is ",
'offset_answer_start': 147,
'offset_answer_end': 154,
'probability': 0.9787139466668613,
'score': None,
'document_id': None
},
...
]
}

:param input_dicts: list of input dicts
:param question: question string
:param paragraphs: list of strings in which to search for the answer
:param meta_data_paragraphs: list of dicts containing meta data for the paragraphs.
len(paragraphs) == len(meta_data_paragraphs)
:param top_k: the maximum number of answers to return
:return:
:param max_processes: max number of parallel processes
:return: dict containing question and answers
"""
results = self.model.inference_from_dicts(
dicts=input_dicts, rest_api_schema=True, use_multiprocessing=False

if meta_data_paragraphs is None:
meta_data_paragraphs = len(paragraphs) * [None]
assert len(paragraphs) == len(meta_data_paragraphs)

# convert input to FARM format
input_dicts = []
for paragraph, meta_data in zip(paragraphs, meta_data_paragraphs):
cur = {"text": paragraph,
"questions": [question],
"document_id": meta_data["document_id"]
}
input_dicts.append(cur)

# get answers from QA model (top `n_best_per_passage` answers per input paragraph)
predictions = self.model.inference_from_dicts(
dicts=input_dicts, rest_api_schema=True, max_processes=max_processes
)

# The FARM Inferencer as of now does not support multi document QA.
# The QA inference is done for each text independently and the
# results are sorted descending by their `score`.

all_predictions = []
for res in results:
all_predictions.extend(res["predictions"])

all_answers = []
for pred in all_predictions:
answers = pred["answers"]
for a in answers:
# Two sets of offset fields are returned by FARM -- context level and document level.
# For the API, only context level offsets are relevant.
a["offset_start"] = a["offset_answer_start"] - a["offset_context_start"]
a["offset_end"] = a["offset_context_end"] - a["offset_answer_end"]
all_answers.extend(answers)

# remove all null answers (where no answer was found in the text)
all_answers = [ans for ans in all_answers if ans["answer"]]

scores = np.asarray([ans["score"] for ans in all_answers])
probabilities = expit(scores / 8)
for ans, prob in zip(all_answers, probabilities):
ans["probability"] = prob

# sort answers by their `probability`
sorted_answers = sorted(
all_answers, key=lambda k: k["probability"], reverse=True
# assemble answers from all the different paragraphs & format them
answers = []
for pred in predictions:
for a in pred["predictions"][0]["answers"]:
if a["answer"]: #skip "no answer"
cur = {"answer": a["answer"],
"score": a["score"],
"probability": expit(np.asarray([a["score"]]) / 8), #just a pseudo prob for now
"context": a["context"],
"offset_start": a["offset_answer_start"] - a["offset_context_start"],
"offset_end": a["offset_answer_start"] - a["offset_context_start"],
"document_id": a["document_id"]}
answers.append(cur)

# sort answers by their `probability` and select top-k
answers = sorted(
answers, key=lambda k: k["probability"], reverse=True
)
answers = answers[:top_k]

# all predictions here are for the same question, so the metadata from
# the first prediction in the list is taken.
if all_predictions:
resp = all_predictions[0] # get the first prediction dict
resp["answers"] = sorted_answers[:top_k]
else:
resp = []
result = {"question": question,
"answers": answers}

return {"results": [resp]}
return result
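The `probability` returned above is not a calibrated model output; it is the raw span score squashed through a logistic sigmoid, `expit(score / 8)`. A standalone sketch of how that mapping behaves (the scores are illustrative):

```python
import numpy as np
from scipy.special import expit  # logistic sigmoid: 1 / (1 + exp(-x))

# Dividing by 8 flattens the sigmoid so typical span scores spread across
# (0, 1) instead of saturating at the extremes; still only a pseudo-probability.
for score in [-16.0, -4.0, 0.0, 4.0, 16.0]:
    print(f"score={score:+6.1f} -> pseudo-probability={float(expit(np.asarray(score) / 8)):.3f}")
```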
102 changes: 102 additions & 0 deletions haystack/reader/transformers.py
@@ -0,0 +1,102 @@
from transformers import pipeline


class TransformersReader:
"""
A reader using the QA Pipeline class from huggingface's Transformers.
Easily load any of the pretrained community QA models from here: https://huggingface.co/models
"""

def __init__(
self,
model="distilbert-base-uncased-distilled-squad",
tokenizer="distilbert-base-uncased",
context_size=30,
#no_answer_shift=-100,
#batch_size=16,
use_gpu=-1,  # -1 -> CPU, following transformers' device convention
n_best_per_passage=2
):
"""
Load a QA model from Transformers.
Available models include:
- distilbert-base-uncased-distilled-squad
- bert-large-cased-whole-word-masking-finetuned-squad
- bert-large-uncased-whole-word-masking-finetuned-squad

See https://huggingface.co/models for full list of available QA models

:param model: name of the model
:param tokenizer: name of the tokenizer (usually the same as model)
:param context_size: num of chars (before and after the answer) to return as "context" for each answer.
The context usually helps users to understand if the answer really makes sense.
:param use_gpu: device ordinal passed to transformers' pipeline:
                < 0 -> use CPU, >= 0 -> ordinal of the GPU to use
"""
self.model = pipeline("question-answering", model=model, tokenizer=tokenizer, device=use_gpu)
self.context_size = context_size
self.n_best_per_passage = n_best_per_passage
#TODO param to modify bias for no_answer


def predict(self, question, paragraphs, meta_data_paragraphs=None, top_k=None):
"""
Use loaded QA model to find answers for a question in the supplied paragraphs.

Returns dictionaries containing answers sorted by (desc.) probability
Example:
{'question': 'Who is the father of Arya Stark?',
'answers': [
{'answer': 'Eddard,',
'context': " She travels with her father, Eddard, to King's Landing when he is ",
'offset_answer_start': 147,
'offset_answer_end': 154,
'probability': 0.9787139466668613,
'score': None,
'document_id': None
},
...
]
}

:param question: question string
:param paragraphs: list of strings in which to search for the answer
:param meta_data_paragraphs: list of dicts containing meta data for the paragraphs.
len(paragraphs) == len(meta_data_paragraphs)
:param top_k: the maximum number of answers to return
:return: dict containing question and answers

"""
#TODO pass metadata

# get top-answers for each candidate passage
answers = []
for p in paragraphs:
query = {"context": p, "question": question}
predictions = self.model(query, topk=self.n_best_per_passage)
# assemble and format all answers
for pred in predictions:
if pred["answer"]:
context_start = max(0, pred["start"] - self.context_size)
context_end = min(len(p), pred["end"] + self.context_size)
answers.append({
"answer": pred["answer"],
"context": p[context_start:context_end],
"offset_answer_start": pred["start"],
"offset_answer_end": pred["end"],
"probability": pred["score"],
"score": None,
"document_id": None
})

# sort answers by their `probability` and select top-k
answers = sorted(
answers, key=lambda k: k["probability"], reverse=True
)
answers = answers[:top_k]

results = {"question": question,
"answers": answers}

return results
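A usage sketch for the new reader (the example paragraph is illustrative; `use_gpu=-1` keeps the pipeline on CPU):

```python
from haystack.reader.transformers import TransformersReader

reader = TransformersReader(model="distilbert-base-uncased-distilled-squad",
                            tokenizer="distilbert-base-uncased",
                            use_gpu=-1)  # -1 -> CPU in transformers' pipeline API

result = reader.predict(
    question="Who is the father of Arya Stark?",
    paragraphs=["Arya Stark is the third child of Eddard and Catelyn Stark."],
    top_k=1,
)
print(result["answers"][0]["answer"])  # e.g. "Eddard"
```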
32 changes: 27 additions & 5 deletions haystack/retriever/tfidf.py
@@ -34,7 +34,7 @@ class TfidfRetriever(BaseRetriever):
Split documents into smaller units (e.g., paragraphs or pages) to reduce the
computations when text is passed on to a Reader for QA.

It uses sklearn TfidfVectorizer to compute a tf-idf matrix.
It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
"""

def __init__(self):
@@ -69,15 +69,37 @@ def _get_all_paragraphs(self):
logger.info(f"Found {len(paragraphs)} candidate paragraphs from {len(documents)} docs in DB")
return paragraphs

def retrieve(self, query, candidate_doc_ids=None, top_k=10):
def _calc_scores(self, query):
question_vector = self.vectorizer.transform([query])

scores = self.tfidf_matrix.dot(question_vector.T).toarray()
idx_scores = [(idx, score) for idx, score in enumerate(scores)]
top_k_scores = OrderedDict(
sorted(idx_scores, key=(lambda tup: tup[1]), reverse=True)[:top_k]
indices_and_scores = OrderedDict(
sorted(idx_scores, key=(lambda tup: tup[1]), reverse=True)
)
return top_k_scores
return indices_and_scores

def retrieve(self, query, candidate_doc_ids=None, top_k=10, verbose=True):
# get scores
indices_and_scores = self._calc_scores(query)

# rank & filter paragraphs
df_sliced = self.df.loc[indices_and_scores.keys()]
if candidate_doc_ids:
df_sliced = df_sliced[df_sliced.document_id.isin(candidate_doc_ids)]
df_sliced = df_sliced[:top_k]

if verbose:
logger.info(
f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
)

# get actual content for the top candidates
paragraphs = list(df_sliced.text.values)
meta_data = [{"document_id": row["document_id"], "paragraph_id": row["paragraph_id"]}
for idx, row in df_sliced.iterrows()]

return paragraphs, meta_data

def fit(self):
self.df = pd.DataFrame.from_dict(self.paragraphs)
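The ranking in `_calc_scores` is a single sparse matrix-vector product: every paragraph's tf-idf row dotted with the query's tf-idf vector. The same idea in a self-contained snippet (the two-paragraph corpus is illustrative):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

paragraphs = ["Arya Stark is the daughter of Eddard Stark.",
              "FARM is a framework for transfer learning with language models."]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(paragraphs)  # shape: (n_paragraphs, vocab_size)

query_vector = vectorizer.transform(["Who is the father of Arya Stark?"])
scores = tfidf_matrix.dot(query_vector.T).toarray().flatten()

# rank paragraphs by score, descending - the first paragraph should win here
ranking = sorted(enumerate(scores), key=lambda tup: tup[1], reverse=True)
print(ranking)
```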
2 changes: 1 addition & 1 deletion haystack/utils.py
@@ -19,7 +19,7 @@ def create_db():


def print_answers(results, details="all"):
answers = results[0]["answers"]
answers = results["answers"]
pp = pprint.PrettyPrinter(indent=4)
if details != "all":
if details == "minimal":
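`print_answers` now consumes the single results dict returned by the readers instead of a one-element list; a minimal call (the payload mirrors the answer format assembled in `farm.py`):

```python
from haystack.utils import print_answers

results = {"question": "Who is the father of Arya Stark?",
           "answers": [{"answer": "Eddard",
                        "context": " She travels with her father, Eddard, to King's Landing ",
                        "offset_start": 147, "offset_end": 153,
                        "probability": 0.98, "score": None, "document_id": None}]}
print_answers(results, details="all")
```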
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1,4 +1,6 @@
farm==0.3.2
# FARM (incl. transformers 2.3.0 with pipelines)
#farm -e git+https://github.com/deepset-ai/FARM.git@1d30237b037050ef0ac5516f427443cdd18a4d43
-e git://github.com/deepset-ai/FARM.git@1d30237b037050ef0ac5516f427443cdd18a4d43#egg=farm
flask
flask_cors
flask_restplus