From 0b6418a8e78597df947372a7d3033484ac9e9f28 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Wed, 12 Aug 2020 15:02:58 -0700 Subject: [PATCH 1/5] update type hint for field_elements --- .../azure/ai/formrecognizer/_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py index dbec302aec3c..53d608b5f226 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py @@ -271,7 +271,8 @@ class FieldData(FormElement): When `include_field_elements` is set to true, a list of elements constituting this field or value is returned. The list constitutes of elements such as lines and words. - :vartype field_elements: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine] + :vartype field_elements: list[Union[~azure.ai.formrecognizer.FormElement, ~azure.ai.formrecognizer.FormWord, + ~azure.ai.formrecognizer.FormLine]] """ def __init__(self, **kwargs): @@ -503,7 +504,8 @@ class FormTableCell(FormElement): elements constituting this cell is returned. The list constitutes of elements such as lines and words. For calls to begin_recognize_content(), this list is always populated. - :vartype field_elements: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine] + :vartype field_elements: list[Union[~azure.ai.formrecognizer.FormElement, ~azure.ai.formrecognizer.FormWord, + ~azure.ai.formrecognizer.FormLine]] """ def __init__(self, **kwargs): From 87787c9d040a45f10ce4cda954a941ba2a2f0eab Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Wed, 12 Aug 2020 15:48:35 -0700 Subject: [PATCH 2/5] adding kind to FormElement --- .../azure/ai/formrecognizer/_models.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py index 53d608b5f226..836426855ed4 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py @@ -151,11 +151,16 @@ class FormElement(object): Units are in pixels for images and inches for PDF. :ivar int page_number: The 1-based number of the page in which this content is present. + :ivar str kind: + The kind of form element. Possible kinds are "word" or "line" which + correspond to a :class:`~azure.ai.formrecognizer.FormWord` or + :class:`~azure.ai.formrecognizer.FormLine`, respectively. """ def __init__(self, **kwargs): self.bounding_box = kwargs.get("bounding_box", None) self.page_number = kwargs.get("page_number", None) self.text = kwargs.get("text", None) + self.kind = kwargs.get("kind", None) class RecognizedForm(object): @@ -254,7 +259,7 @@ def __repr__(self): )[:1024] -class FieldData(FormElement): +class FieldData(object): """Contains the data for the form field. This includes the text, location of the text on the form, and a collection of the elements that make up the text. @@ -276,7 +281,9 @@ class FieldData(FormElement): """ def __init__(self, **kwargs): - super(FieldData, self).__init__(**kwargs) + self.page_number = kwargs.get("page_number", None) + self.text = kwargs.get("text", None) + self.bounding_box = kwargs.get("bounding_box", None) self.field_elements = kwargs.get("field_elements", None) @classmethod @@ -387,10 +394,11 @@ class FormLine(FormElement): A list of the words that make up the line. :ivar int page_number: The 1-based number of the page in which this content is present. + :ivar str kind: For FormLine, this is "line". """ def __init__(self, **kwargs): - super(FormLine, self).__init__(**kwargs) + super(FormLine, self).__init__(kind="line", **kwargs) self.words = kwargs.get("words", None) @classmethod @@ -404,12 +412,13 @@ def _from_generated(cls, line, page): ) def __repr__(self): - return "FormLine(text={}, bounding_box={}, words={}, page_number={})" \ + return "FormLine(text={}, bounding_box={}, words={}, page_number={}, kind={})" \ .format( self.text, self.bounding_box, repr(self.words), - self.page_number + self.page_number, + self.kind )[:1024] @@ -426,10 +435,11 @@ class FormWord(FormElement): Measures the degree of certainty of the recognition result. Value is between [0.0, 1.0]. :ivar int page_number: The 1-based number of the page in which this content is present. + :ivar str kind: For FormWord, this is "word". """ def __init__(self, **kwargs): - super(FormWord, self).__init__(**kwargs) + super(FormWord, self).__init__(kind="word", **kwargs) self.confidence = kwargs.get("confidence", None) @classmethod @@ -442,12 +452,13 @@ def _from_generated(cls, word, page): ) def __repr__(self): - return "FormWord(text={}, bounding_box={}, confidence={}, page_number={})" \ + return "FormWord(text={}, bounding_box={}, confidence={}, page_number={}, kind={})" \ .format( self.text, self.bounding_box, self.confidence, - self.page_number + self.page_number, + self.kind )[:1024] @@ -480,7 +491,7 @@ def __repr__(self): )[:1024] -class FormTableCell(FormElement): +class FormTableCell(object): """Represents a cell contained in a table recognized from the input document. :ivar str text: Text content of the cell. @@ -509,14 +520,16 @@ class FormTableCell(FormElement): """ def __init__(self, **kwargs): - super(FormTableCell, self).__init__(**kwargs) + self.text = kwargs.get("text", None) self.row_index = kwargs.get("row_index", None) self.column_index = kwargs.get("column_index", None) self.row_span = kwargs.get("row_span", 1) self.column_span = kwargs.get("column_span", 1) + self.bounding_box = kwargs.get("bounding_box", None) self.confidence = kwargs.get("confidence", None) self.is_header = kwargs.get("is_header", False) self.is_footer = kwargs.get("is_footer", False) + self.page_number = kwargs.get("page_number", None) self.field_elements = kwargs.get("field_elements", None) @classmethod From 1856542185800dae8cf1b7a701cebbaadfec7528 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Wed, 12 Aug 2020 15:57:45 -0700 Subject: [PATCH 3/5] update tests --- .../azure/ai/formrecognizer/_models.py | 2 +- .../azure-ai-formrecognizer/tests/test_repr.py | 4 ++-- .../azure-ai-formrecognizer/tests/testcase.py | 16 +++++++++++----- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py index 836426855ed4..b0c95e6ffd4c 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py @@ -491,7 +491,7 @@ def __repr__(self): )[:1024] -class FormTableCell(object): +class FormTableCell(object): # pylint:disable=too-many-instance-attributes """Represents a cell contained in a table recognized from the input document. :ivar str text: Text content of the cell. diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_repr.py b/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_repr.py index 92446178008c..c8da3a6c2a8d 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_repr.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_repr.py @@ -28,7 +28,7 @@ def bounding_box(): @pytest.fixture def form_word(bounding_box): model = _models.FormWord(text="Word", bounding_box=bounding_box[0], confidence=0.5, page_number=1) - model_repr = "FormWord(text=Word, bounding_box={}, confidence=0.5, page_number=1)".format(bounding_box[1])[:1024] + model_repr = "FormWord(text=Word, bounding_box={}, confidence=0.5, page_number=1, kind=word)".format(bounding_box[1])[:1024] assert repr(model) == model_repr return model, model_repr @@ -36,7 +36,7 @@ def form_word(bounding_box): @pytest.fixture def form_line(bounding_box, form_word): model = _models.FormLine(text="Word Word", bounding_box=bounding_box[0], words=[form_word[0], form_word[0]], page_number=1) - model_repr = "FormLine(text=Word Word, bounding_box={}, words=[{}, {}], page_number=1)".format(bounding_box[1], form_word[1], form_word[1])[:1024] + model_repr = "FormLine(text=Word Word, bounding_box={}, words=[{}, {}], page_number=1, kind=line)".format(bounding_box[1], form_word[1], form_word[1])[:1024] assert repr(model) == model_repr return model, model_repr diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/tests/testcase.py b/sdk/formrecognizer/azure-ai-formrecognizer/tests/testcase.py index 56a1074904b1..11bb8dff5cc6 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/tests/testcase.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/tests/testcase.py @@ -176,9 +176,11 @@ def assertFormPagesTransformCorrect(self, pages, actual_read, page_result=None, if not page.lines and not actual_page.lines: continue for p, a in zip(page.lines, actual_page.lines): + self.assertEqual(p.kind, "line") self.assertEqual(p.text, a.text) self.assertBoundingBoxTransformCorrect(p.bounding_box, a.bounding_box) for wp, wa, in zip(p.words, a.words): + self.assertEqual(wp.kind, "word") self.assertEqual(wp.text, wa.text) self.assertEqual(wp.confidence, wa.confidence if wa.confidence is not None else 1.0) self.assertBoundingBoxTransformCorrect(wp.bounding_box, wa.bounding_box) @@ -204,13 +206,14 @@ def assertBoundingBoxTransformCorrect(self, box, actual): def assertFieldElementsTransFormCorrect(self, field_elements, actual_elements, read_result): if field_elements is None and actual_elements is None: return - for receipt, actual in zip(field_elements, actual_elements): + for element, actual in zip(field_elements, actual_elements): nums = [int(s) for s in re.findall(r'\d+', actual)] read, line, word = nums[0:3] - text_element = read_result[read].lines[line].words[word] - self.assertEqual(receipt.text, text_element.text) - self.assertEqual(receipt.confidence, text_element.confidence if text_element.confidence is not None else 1.0) - self.assertBoundingBoxTransformCorrect(receipt.bounding_box, text_element.bounding_box) + actual_element = read_result[read].lines[line].words[word] + self.assertEqual(element.text, actual_element.text) + self.assertEqual(element.confidence, actual_element.confidence if actual_element.confidence is not None else 1.0) + self.assertEqual(element.kind, "word") + self.assertBoundingBoxTransformCorrect(element.bounding_box, actual_element.bounding_box) def assertLabeledFormFieldDictTransformCorrect(self, form_fields, actual_fields, read_results=None): if actual_fields is None: @@ -368,7 +371,9 @@ def assertFormPagesHasValues(self, pages): self.assertIsNotNone(line.text) self.assertIsNotNone(line.page_number) self.assertBoundingBoxHasPoints(line.bounding_box) + self.assertEqual(line.kind, "line") for word in line.words: + self.assertEqual(word.kind, "word") self.assertFormWordHasValues(word, page.page_number) if page.tables: @@ -386,6 +391,7 @@ def assertFormPagesHasValues(self, pages): self.assertFieldElementsHasValues(cell.field_elements, page.page_number) def assertFormWordHasValues(self, word, page_number): + self.assertEqual(word.kind, "word") self.assertIsNotNone(word.confidence) self.assertIsNotNone(word.text) self.assertBoundingBoxHasPoints(word.bounding_box) From a26892ff45caa34391af19fc41475461e075bf30 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Wed, 12 Aug 2020 16:01:55 -0700 Subject: [PATCH 4/5] update samples --- .../sample_get_bounding_boxes_async.py | 19 +++++++++---------- .../samples/sample_get_bounding_boxes.py | 19 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/samples/async_samples/sample_get_bounding_boxes_async.py b/sdk/formrecognizer/azure-ai-formrecognizer/samples/async_samples/sample_get_bounding_boxes_async.py index 402e37e76044..877dd08edda1 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/samples/async_samples/sample_get_bounding_boxes_async.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/samples/async_samples/sample_get_bounding_boxes_async.py @@ -38,7 +38,6 @@ class GetBoundingBoxesSampleAsync(object): async def get_bounding_boxes(self): from azure.core.credentials import AzureKeyCredential from azure.ai.formrecognizer.aio import FormRecognizerClient - from azure.ai.formrecognizer import FormWord, FormLine endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"] key = os.environ["AZURE_FORM_RECOGNIZER_KEY"] @@ -86,19 +85,19 @@ async def get_bounding_boxes(self): # field_elements is only populated if you set include_field_elements to True in your call # to begin_recognize_custom_forms # It is a heterogeneous list of FormWord and FormLine. - for content in cell.field_elements: - if isinstance(content, FormWord): + for element in cell.field_elements: + if element.kind == "word": print("......Word '{}' within bounding box '{}' has a confidence of {}".format( - content.text, - format_bounding_box(content.bounding_box), - content.confidence + element.text, + format_bounding_box(element.bounding_box), + element.confidence )) - elif isinstance(content, FormLine): + elif element.kind == "line": print("......Line '{}' within bounding box '{}' has the following words: ".format( - content.text, - format_bounding_box(content.bounding_box) + element.text, + format_bounding_box(element.bounding_box) )) - for word in content.words: + for word in element.words: print(".........Word '{}' within bounding box '{}' has a confidence of {}".format( word.text, format_bounding_box(word.bounding_box), diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/samples/sample_get_bounding_boxes.py b/sdk/formrecognizer/azure-ai-formrecognizer/samples/sample_get_bounding_boxes.py index cff103a81b44..0ead56067ab4 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/samples/sample_get_bounding_boxes.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/samples/sample_get_bounding_boxes.py @@ -37,7 +37,6 @@ class GetBoundingBoxesSample(object): def get_bounding_boxes(self): from azure.core.credentials import AzureKeyCredential from azure.ai.formrecognizer import FormRecognizerClient - from azure.ai.formrecognizer import FormWord, FormLine endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"] key = os.environ["AZURE_FORM_RECOGNIZER_KEY"] @@ -83,19 +82,19 @@ def get_bounding_boxes(self): # field_elements is only populated if you set include_field_elements to True in your call # to begin_recognize_custom_forms # It is a heterogeneous list of FormWord and FormLine. - for content in cell.field_elements: - if isinstance(content, FormWord): + for element in cell.field_elements: + if element.kind == "word": print("......Word '{}' within bounding box '{}' has a confidence of {}".format( - content.text, - format_bounding_box(content.bounding_box), - content.confidence + element.text, + format_bounding_box(element.bounding_box), + element.confidence )) - elif isinstance(content, FormLine): + elif element.kind == "line": print("......Line '{}' within bounding box '{}' has the following words: ".format( - content.text, - format_bounding_box(content.bounding_box) + element.text, + format_bounding_box(element.bounding_box) )) - for word in content.words: + for word in element.words: print(".........Word '{}' within bounding box '{}' has a confidence of {}".format( word.text, format_bounding_box(word.bounding_box), From ff5f2d288a8dd2c8ab0182bfcfceeec0741deb23 Mon Sep 17 00:00:00 2001 From: Krista Pratico Date: Wed, 12 Aug 2020 16:03:49 -0700 Subject: [PATCH 5/5] changelog --- sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md b/sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md index 769158c9bb5b..7c4f013ae11b 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md +++ b/sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md @@ -6,6 +6,7 @@ - Client-level, keyword argument `api_version` can be used to specify the service API version to use. Currently only v2.0 is supported. See the enum `FormRecognizerApiVersion` for supported API versions. +- `FormWord` and `FormLine` now have attribute `kind` which specifies the kind of element it is, e.g. "word" or "line" ## 3.0.0b1 (2020-08-11)