Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Client-level, keyword argument `api_version` can be used to specify the service API version to use. Currently only v2.0
is supported. See the enum `FormRecognizerApiVersion` for supported API versions.
- `FormWord` and `FormLine` now have a `kind` attribute that specifies the kind of element, e.g. "word" or "line".

## 3.0.0b1 (2020-08-11)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,16 @@ class FormElement(object):
Units are in pixels for images and inches for PDF.
:ivar int page_number:
The 1-based number of the page in which this content is present.
:ivar str kind:
The kind of form element. Possible kinds are "word" or "line" which
correspond to a :class:`~azure.ai.formrecognizer.FormWord` or
:class:`~azure.ai.formrecognizer.FormLine`, respectively.
"""
def __init__(self, **kwargs):
self.bounding_box = kwargs.get("bounding_box", None)
self.page_number = kwargs.get("page_number", None)
self.text = kwargs.get("text", None)
self.kind = kwargs.get("kind", None)


class RecognizedForm(object):
Expand Down Expand Up @@ -254,7 +259,7 @@ def __repr__(self):
)[:1024]


class FieldData(FormElement):
class FieldData(object):
"""Contains the data for the form field. This includes the text,
location of the text on the form, and a collection of the
elements that make up the text.
Expand All @@ -271,11 +276,14 @@ class FieldData(FormElement):
When `include_field_elements` is set to true, a list of
elements constituting this field or value is returned. The list
consists of elements such as lines and words.
:vartype field_elements: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine]
:vartype field_elements: list[Union[~azure.ai.formrecognizer.FormElement, ~azure.ai.formrecognizer.FormWord,
~azure.ai.formrecognizer.FormLine]]
"""

def __init__(self, **kwargs):
super(FieldData, self).__init__(**kwargs)
self.page_number = kwargs.get("page_number", None)
self.text = kwargs.get("text", None)
self.bounding_box = kwargs.get("bounding_box", None)
self.field_elements = kwargs.get("field_elements", None)

@classmethod
Expand Down Expand Up @@ -386,10 +394,11 @@ class FormLine(FormElement):
A list of the words that make up the line.
:ivar int page_number:
The 1-based number of the page in which this content is present.
:ivar str kind: For FormLine, this is "line".
"""

def __init__(self, **kwargs):
super(FormLine, self).__init__(**kwargs)
super(FormLine, self).__init__(kind="line", **kwargs)
self.words = kwargs.get("words", None)

@classmethod
Expand All @@ -403,12 +412,13 @@ def _from_generated(cls, line, page):
)

def __repr__(self):
return "FormLine(text={}, bounding_box={}, words={}, page_number={})" \
return "FormLine(text={}, bounding_box={}, words={}, page_number={}, kind={})" \
.format(
self.text,
self.bounding_box,
repr(self.words),
self.page_number
self.page_number,
self.kind
)[:1024]


Expand All @@ -425,10 +435,11 @@ class FormWord(FormElement):
Measures the degree of certainty of the recognition result. Value is between [0.0, 1.0].
:ivar int page_number:
The 1-based number of the page in which this content is present.
:ivar str kind: For FormWord, this is "word".
"""

def __init__(self, **kwargs):
super(FormWord, self).__init__(**kwargs)
super(FormWord, self).__init__(kind="word", **kwargs)
self.confidence = kwargs.get("confidence", None)

@classmethod
Expand All @@ -441,12 +452,13 @@ def _from_generated(cls, word, page):
)

def __repr__(self):
return "FormWord(text={}, bounding_box={}, confidence={}, page_number={})" \
return "FormWord(text={}, bounding_box={}, confidence={}, page_number={}, kind={})" \
.format(
self.text,
self.bounding_box,
self.confidence,
self.page_number
self.page_number,
self.kind
)[:1024]


Expand Down Expand Up @@ -479,7 +491,7 @@ def __repr__(self):
)[:1024]


class FormTableCell(FormElement):
class FormTableCell(object): # pylint:disable=too-many-instance-attributes
"""Represents a cell contained in a table recognized from the input document.

:ivar str text: Text content of the cell.
Expand All @@ -503,18 +515,21 @@ class FormTableCell(FormElement):
elements constituting this cell is returned. The list
consists of elements such as lines and words.
For calls to begin_recognize_content(), this list is always populated.
:vartype field_elements: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine]
:vartype field_elements: list[Union[~azure.ai.formrecognizer.FormElement, ~azure.ai.formrecognizer.FormWord,
~azure.ai.formrecognizer.FormLine]]
"""

def __init__(self, **kwargs):
super(FormTableCell, self).__init__(**kwargs)
self.text = kwargs.get("text", None)
self.row_index = kwargs.get("row_index", None)
self.column_index = kwargs.get("column_index", None)
self.row_span = kwargs.get("row_span", 1)
self.column_span = kwargs.get("column_span", 1)
self.bounding_box = kwargs.get("bounding_box", None)
self.confidence = kwargs.get("confidence", None)
self.is_header = kwargs.get("is_header", False)
self.is_footer = kwargs.get("is_footer", False)
self.page_number = kwargs.get("page_number", None)
self.field_elements = kwargs.get("field_elements", None)

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class GetBoundingBoxesSampleAsync(object):
async def get_bounding_boxes(self):
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer.aio import FormRecognizerClient
from azure.ai.formrecognizer import FormWord, FormLine

endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
Expand Down Expand Up @@ -86,19 +85,19 @@ async def get_bounding_boxes(self):
# field_elements is only populated if you set include_field_elements to True in your call
# to begin_recognize_custom_forms
# It is a heterogeneous list of FormWord and FormLine.
for content in cell.field_elements:
if isinstance(content, FormWord):
for element in cell.field_elements:
if element.kind == "word":
print("......Word '{}' within bounding box '{}' has a confidence of {}".format(
content.text,
format_bounding_box(content.bounding_box),
content.confidence
element.text,
format_bounding_box(element.bounding_box),
element.confidence
))
elif isinstance(content, FormLine):
elif element.kind == "line":
print("......Line '{}' within bounding box '{}' has the following words: ".format(
content.text,
format_bounding_box(content.bounding_box)
element.text,
format_bounding_box(element.bounding_box)
))
for word in content.words:
for word in element.words:
print(".........Word '{}' within bounding box '{}' has a confidence of {}".format(
word.text,
format_bounding_box(word.bounding_box),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class GetBoundingBoxesSample(object):
def get_bounding_boxes(self):
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient
from azure.ai.formrecognizer import FormWord, FormLine

endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
Expand Down Expand Up @@ -83,19 +82,19 @@ def get_bounding_boxes(self):
# field_elements is only populated if you set include_field_elements to True in your call
# to begin_recognize_custom_forms
# It is a heterogeneous list of FormWord and FormLine.
for content in cell.field_elements:
if isinstance(content, FormWord):
for element in cell.field_elements:
if element.kind == "word":
print("......Word '{}' within bounding box '{}' has a confidence of {}".format(
content.text,
format_bounding_box(content.bounding_box),
content.confidence
element.text,
format_bounding_box(element.bounding_box),
element.confidence
))
elif isinstance(content, FormLine):
elif element.kind == "line":
print("......Line '{}' within bounding box '{}' has the following words: ".format(
content.text,
format_bounding_box(content.bounding_box)
element.text,
format_bounding_box(element.bounding_box)
))
for word in content.words:
for word in element.words:
print(".........Word '{}' within bounding box '{}' has a confidence of {}".format(
word.text,
format_bounding_box(word.bounding_box),
Expand Down
4 changes: 2 additions & 2 deletions sdk/formrecognizer/azure-ai-formrecognizer/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ def bounding_box():
@pytest.fixture
def form_word(bounding_box):
model = _models.FormWord(text="Word", bounding_box=bounding_box[0], confidence=0.5, page_number=1)
model_repr = "FormWord(text=Word, bounding_box={}, confidence=0.5, page_number=1)".format(bounding_box[1])[:1024]
model_repr = "FormWord(text=Word, bounding_box={}, confidence=0.5, page_number=1, kind=word)".format(bounding_box[1])[:1024]
assert repr(model) == model_repr
return model, model_repr


@pytest.fixture
def form_line(bounding_box, form_word):
model = _models.FormLine(text="Word Word", bounding_box=bounding_box[0], words=[form_word[0], form_word[0]], page_number=1)
model_repr = "FormLine(text=Word Word, bounding_box={}, words=[{}, {}], page_number=1)".format(bounding_box[1], form_word[1], form_word[1])[:1024]
model_repr = "FormLine(text=Word Word, bounding_box={}, words=[{}, {}], page_number=1, kind=line)".format(bounding_box[1], form_word[1], form_word[1])[:1024]
assert repr(model) == model_repr
return model, model_repr

Expand Down
16 changes: 11 additions & 5 deletions sdk/formrecognizer/azure-ai-formrecognizer/tests/testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,11 @@ def assertFormPagesTransformCorrect(self, pages, actual_read, page_result=None,
if not page.lines and not actual_page.lines:
continue
for p, a in zip(page.lines, actual_page.lines):
self.assertEqual(p.kind, "line")
self.assertEqual(p.text, a.text)
self.assertBoundingBoxTransformCorrect(p.bounding_box, a.bounding_box)
for wp, wa, in zip(p.words, a.words):
self.assertEqual(wp.kind, "word")
self.assertEqual(wp.text, wa.text)
self.assertEqual(wp.confidence, wa.confidence if wa.confidence is not None else 1.0)
self.assertBoundingBoxTransformCorrect(wp.bounding_box, wa.bounding_box)
Expand All @@ -204,13 +206,14 @@ def assertBoundingBoxTransformCorrect(self, box, actual):
def assertFieldElementsTransFormCorrect(self, field_elements, actual_elements, read_result):
if field_elements is None and actual_elements is None:
return
for receipt, actual in zip(field_elements, actual_elements):
for element, actual in zip(field_elements, actual_elements):
nums = [int(s) for s in re.findall(r'\d+', actual)]
read, line, word = nums[0:3]
text_element = read_result[read].lines[line].words[word]
self.assertEqual(receipt.text, text_element.text)
self.assertEqual(receipt.confidence, text_element.confidence if text_element.confidence is not None else 1.0)
self.assertBoundingBoxTransformCorrect(receipt.bounding_box, text_element.bounding_box)
actual_element = read_result[read].lines[line].words[word]
self.assertEqual(element.text, actual_element.text)
self.assertEqual(element.confidence, actual_element.confidence if actual_element.confidence is not None else 1.0)
self.assertEqual(element.kind, "word")
self.assertBoundingBoxTransformCorrect(element.bounding_box, actual_element.bounding_box)

def assertLabeledFormFieldDictTransformCorrect(self, form_fields, actual_fields, read_results=None):
if actual_fields is None:
Expand Down Expand Up @@ -368,7 +371,9 @@ def assertFormPagesHasValues(self, pages):
self.assertIsNotNone(line.text)
self.assertIsNotNone(line.page_number)
self.assertBoundingBoxHasPoints(line.bounding_box)
self.assertEqual(line.kind, "line")
for word in line.words:
self.assertEqual(word.kind, "word")
self.assertFormWordHasValues(word, page.page_number)

if page.tables:
Expand All @@ -386,6 +391,7 @@ def assertFormPagesHasValues(self, pages):
self.assertFieldElementsHasValues(cell.field_elements, page.page_number)

def assertFormWordHasValues(self, word, page_number):
self.assertEqual(word.kind, "word")
self.assertIsNotNone(word.confidence)
self.assertIsNotNone(word.text)
self.assertBoundingBoxHasPoints(word.bounding_box)
Expand Down