From 609a2e1bf7ca2136297e0e55729b4f870b2037ae Mon Sep 17 00:00:00 2001 From: Catalina Peralta Date: Tue, 19 Oct 2021 12:21:55 -0700 Subject: [PATCH] adding get_words to DocumentLine --- .../azure/ai/formrecognizer/_models.py | 32 +- ...document.test_document_line_get_words.yaml | 343 ++++++++++++++++++ .../tests/test_document.py | 15 + 3 files changed, 388 insertions(+), 2 deletions(-) create mode 100644 sdk/formrecognizer/azure-ai-formrecognizer/tests/recordings/test_document.test_document_line_get_words.yaml diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py index e5260d3414d4..5a1749bb48a9 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py @@ -2656,10 +2656,12 @@ def __init__(self, **kwargs): self.content = kwargs.get("content", None) self.bounding_box = kwargs.get("bounding_box", None) self.spans = kwargs.get("spans", None) + self._parent = kwargs.get("_parent", None) @classmethod - def _from_generated(cls, line): + def _from_generated(cls, line, parent): return cls( + _parent=parent, content=line.content, bounding_box=get_bounding_box(line), spans=prepare_document_spans(line.spans), @@ -2708,6 +2710,22 @@ def from_dict(cls, data): else [], ) + def get_words( + self, mode + ): # pylint: disable=unused-argument,no-self-use + # type: (str) -> list[DocumentWord] + """Get the child elements found in the span of this DocumentLine. + :param str mode: Required. Mode used to search for words. Can be either "overlap" (default) or "contains". + :return: list[DocumentWord] + :rtype: list[DocumentWord] + """ + # TODO pending mode switch + result = [] + for elem in self._parent.words: + if in_span(elem, self.spans): + result.append(elem) + return result + class DocumentPage(object): """Content and layout elements extracted from a page of the input. @@ -2756,7 +2774,7 @@ def _from_generated(cls, page): width=page.width, height=page.height, unit=page.unit, - lines=[DocumentLine._from_generated(line) for line in page.lines] + lines=[DocumentLine._from_generated(line, page) for line in page.lines] if page.lines else [], words=[DocumentWord._from_generated(word) for word in page.words] @@ -4039,3 +4057,13 @@ def from_dict(cls, data): innererror=DocumentAnalysisInnerError.from_dict(data.get("innererror")) # type: ignore if data.get("innererror") else None ) + +def in_span(element, spans): + # type: (Any, list[Point]) -> bool + if hasattr(element, "span"): + for span in spans: + if element.span.offset >= span.offset and ( + element.span.offset + element.span.length + ) <= (span.offset + span.length): + return True + return False diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/tests/recordings/test_document.test_document_line_get_words.yaml b/sdk/formrecognizer/azure-ai-formrecognizer/tests/recordings/test_document.test_document_line_get_words.yaml new file mode 100644 index 000000000000..a046da2be958 --- /dev/null +++ b/sdk/formrecognizer/azure-ai-formrecognizer/tests/recordings/test_document.test_document_line_get_words.yaml @@ -0,0 +1,343 @@ +interactions: +- request: + body: '!!! The request body has been omitted from the recording because its size + 147362 is larger than 128KB. !!!' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '147362' + Content-Type: + - application/octet-stream + User-Agent: + - azsdk-python-ai-formrecognizer/3.2.0b2 Python/3.8.5 (Windows-10-10.0.19041-SP0) + method: POST + uri: https://region.api.cognitive.microsoft.com/formrecognizer/documentModels/prebuilt-document:analyze?stringIndexType=unicodeCodePoint&api-version=2021-09-30-preview + response: + body: + string: '' + headers: + apim-request-id: + - 4f291d5f-04e0-475f-8c9f-d7941d3fa90f + content-length: + - '0' + date: + - Tue, 19 Oct 2021 19:20:31 GMT + operation-location: + - https://region.api.cognitive.microsoft.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/4f291d5f-04e0-475f-8c9f-d7941d3fa90f?api-version=2021-09-30-preview + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-content-type-options: + - nosniff + x-envoy-upstream-service-time: + - '418' + status: + code: 202 + message: Accepted +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - azsdk-python-ai-formrecognizer/3.2.0b2 Python/3.8.5 (Windows-10-10.0.19041-SP0) + method: GET + uri: https://region.api.cognitive.microsoft.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/4f291d5f-04e0-475f-8c9f-d7941d3fa90f?api-version=2021-09-30-preview + response: + body: + string: '{"status": "running", "createdDateTime": "2021-10-19T19:20:31Z", "lastUpdatedDateTime": + "2021-10-19T19:20:34Z"}' + headers: + apim-request-id: + - 3b613302-b8da-424b-aea7-cc216c8d45c0 + content-type: + - application/json; charset=utf-8 + date: + - Tue, 19 Oct 2021 19:20:36 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + transfer-encoding: + - chunked + x-content-type-options: + - nosniff + x-envoy-upstream-service-time: + - '66' + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - azsdk-python-ai-formrecognizer/3.2.0b2 Python/3.8.5 (Windows-10-10.0.19041-SP0) + method: GET + uri: https://region.api.cognitive.microsoft.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/4f291d5f-04e0-475f-8c9f-d7941d3fa90f?api-version=2021-09-30-preview + response: + body: + string: '{"status": "succeeded", "createdDateTime": "2021-10-19T19:20:31Z", + "lastUpdatedDateTime": "2021-10-19T19:20:36Z", "analyzeResult": {"apiVersion": + "2021-09-30-preview", "modelId": "prebuilt-document", "stringIndexType": "unicodeCodePoint", + "content": "Contoso\nAddress:\n1 Redmond way Suite\n6000 Redmond, WA\n99243\nInvoice + For: Microsoft\n1020 Enterprise Way\nSunnayvale, CA 87659\nInvoice Number\nInvoice + Date\nInvoice Due Date\nCharges\nVAT ID\n34278587\n6/18/2017\n6/24/2017\n$56,651.49\nPT", + "pages": [{"pageNumber": 1, "angle": 0, "width": 8.5, "height": 11, "unit": + "inch", "words": [{"content": "Contoso", "boundingBox": [0.5384, 1.1583, 1.4466, + 1.1583, 1.4466, 1.3534, 0.5384, 1.3534], "confidence": 1, "span": {"offset": + 0, "length": 7}}, {"content": "Address:", "boundingBox": [0.7994, 1.5143, + 1.3836, 1.5143, 1.3836, 1.6154, 0.7994, 1.6154], "confidence": 1, "span": + {"offset": 8, "length": 8}}, {"content": "1", "boundingBox": [0.8106, 1.708, + 0.8463, 1.708, 0.8463, 1.8053, 0.8106, 1.8053], "confidence": 1, "span": {"offset": + 17, "length": 1}}, {"content": "Redmond", "boundingBox": [0.923, 1.7047, 1.5018, + 1.7047, 1.5018, 1.8068, 0.923, 1.8068], "confidence": 1, "span": {"offset": + 19, "length": 7}}, {"content": "way", "boundingBox": [1.5506, 1.7309, 1.7949, + 1.7309, 1.7949, 1.8342, 1.5506, 1.8342], "confidence": 1, "span": {"offset": + 27, "length": 3}}, {"content": "Suite", "boundingBox": [1.8415, 1.7033, 2.1445, + 1.7033, 2.1445, 1.8078, 1.8415, 1.8078], "confidence": 1, "span": {"offset": + 31, "length": 5}}, {"content": "6000", "boundingBox": [0.8019, 1.896, 1.0991, + 1.896, 1.0991, 1.9994, 0.8019, 1.9994], "confidence": 1, "span": {"offset": + 37, "length": 4}}, {"content": "Redmond,", "boundingBox": [1.1537, 1.8964, + 1.7689, 1.8964, 1.7689, 2.0171, 1.1537, 2.0171], "confidence": 1, "span": + {"offset": 42, "length": 8}}, {"content": "WA", "boundingBox": [1.8196, 1.8976, + 2.0384, 1.8976, 2.0384, 1.9969, 1.8196, 1.9969], "confidence": 1, "span": + {"offset": 51, "length": 2}}, {"content": "99243", "boundingBox": [0.8025, + 2.0876, 1.175, 2.0876, 1.175, 2.1911, 0.8025, 2.1911], "confidence": 1, "span": + {"offset": 54, "length": 5}}, {"content": "Invoice", "boundingBox": [4.4033, + 1.5143, 4.8234, 1.5143, 4.8234, 1.6155, 4.4033, 1.6155], "confidence": 1, + "span": {"offset": 60, "length": 7}}, {"content": "For:", "boundingBox": [4.8793, + 1.5143, 5.1013, 1.5143, 5.1013, 1.6154, 4.8793, 1.6154], "confidence": 1, + "span": {"offset": 68, "length": 4}}, {"content": "Microsoft", "boundingBox": + [5.2045, 1.5114, 5.8155, 1.5114, 5.8155, 1.6151, 5.2045, 1.6151], "confidence": + 1, "span": {"offset": 73, "length": 9}}, {"content": "1020", "boundingBox": + [5.2036, 1.716, 5.4935, 1.716, 5.4935, 1.8185, 5.2036, 1.8185], "confidence": + 1, "span": {"offset": 83, "length": 4}}, {"content": "Enterprise", "boundingBox": + [5.5488, 1.7164, 6.2178, 1.7164, 6.2178, 1.8441, 5.5488, 1.8441], "confidence": + 1, "span": {"offset": 88, "length": 10}}, {"content": "Way", "boundingBox": + [6.2618, 1.7164, 6.5436, 1.7164, 6.5436, 1.8459, 6.2618, 1.8459], "confidence": + 1, "span": {"offset": 99, "length": 3}}, {"content": "Sunnayvale,", "boundingBox": + [5.196, 1.9047, 5.9894, 1.9047, 5.9894, 2.0359, 5.196, 2.0359], "confidence": + 1, "span": {"offset": 103, "length": 11}}, {"content": "CA", "boundingBox": + [6.0427, 1.9047, 6.2354, 1.9047, 6.2354, 2.0085, 6.0427, 2.0085], "confidence": + 1, "span": {"offset": 115, "length": 2}}, {"content": "87659", "boundingBox": + [6.2801, 1.906, 6.6526, 1.906, 6.6526, 2.0086, 6.2801, 2.0086], "confidence": + 1, "span": {"offset": 118, "length": 5}}, {"content": "Invoice", "boundingBox": + [0.5439, 2.8733, 1.0098, 2.8733, 1.0098, 2.9754, 0.5439, 2.9754], "confidence": + 1, "span": {"offset": 124, "length": 7}}, {"content": "Number", "boundingBox": + [1.0611, 2.8743, 1.5729, 2.8743, 1.5729, 2.9754, 1.0611, 2.9754], "confidence": + 1, "span": {"offset": 132, "length": 6}}, {"content": "Invoice", "boundingBox": + [1.9491, 2.8733, 2.415, 2.8733, 2.415, 2.9754, 1.9491, 2.9754], "confidence": + 1, "span": {"offset": 139, "length": 7}}, {"content": "Date", "boundingBox": + [2.4673, 2.8743, 2.7527, 2.8743, 2.7527, 2.9754, 2.4673, 2.9754], "confidence": + 1, "span": {"offset": 147, "length": 4}}, {"content": "Invoice", "boundingBox": + [3.3495, 2.8733, 3.8155, 2.8733, 3.8155, 2.9754, 3.3495, 2.9754], "confidence": + 1, "span": {"offset": 152, "length": 7}}, {"content": "Due", "boundingBox": + [3.8677, 2.8743, 4.1149, 2.8743, 4.1149, 2.9754, 3.8677, 2.9754], "confidence": + 1, "span": {"offset": 160, "length": 3}}, {"content": "Date", "boundingBox": + [4.1678, 2.8743, 4.4547, 2.8743, 4.4547, 2.9754, 4.1678, 2.9754], "confidence": + 1, "span": {"offset": 164, "length": 4}}, {"content": "Charges", "boundingBox": + [4.7468, 2.8717, 5.289, 2.8717, 5.289, 3.0035, 4.7468, 3.0035], "confidence": + 1, "span": {"offset": 169, "length": 7}}, {"content": "VAT", "boundingBox": + [6.141, 2.873, 6.4147, 2.873, 6.4147, 2.9736, 6.141, 2.9736], "confidence": + 1, "span": {"offset": 177, "length": 3}}, {"content": "ID", "boundingBox": + [6.4655, 2.873, 6.5875, 2.873, 6.5875, 2.9736, 6.4655, 2.9736], "confidence": + 1, "span": {"offset": 181, "length": 2}}, {"content": "34278587", "boundingBox": + [0.5397, 3.411, 1.1457, 3.411, 1.1457, 3.5144, 0.5397, 3.5144], "confidence": + 1, "span": {"offset": 184, "length": 8}}, {"content": "6/18/2017", "boundingBox": + [1.9455, 3.41, 2.551, 3.41, 2.551, 3.5144, 1.9455, 3.5144], "confidence": + 1, "span": {"offset": 193, "length": 9}}, {"content": "6/24/2017", "boundingBox": + [3.346, 3.41, 3.9514, 3.41, 3.9514, 3.5144, 3.346, 3.5144], "confidence": + 1, "span": {"offset": 203, "length": 9}}, {"content": "$56,651.49", "boundingBox": + [5.3871, 3.4047, 6.0702, 3.4047, 6.0702, 3.5321, 5.3871, 3.5321], "confidence": + 1, "span": {"offset": 213, "length": 10}}, {"content": "PT", "boundingBox": + [6.2285, 3.4114, 6.3919, 3.4114, 6.3919, 3.5119, 6.2285, 3.5119], "confidence": + 1, "span": {"offset": 224, "length": 2}}], "selectionMarks": [], "lines": + [{"content": "Contoso", "boundingBox": [0.5384, 1.1583, 1.4466, 1.1583, 1.4466, + 1.3534, 0.5384, 1.3534], "spans": [{"offset": 0, "length": 7}]}, {"content": + "Address:", "boundingBox": [0.7994, 1.5143, 1.3836, 1.5143, 1.3836, 1.6154, + 0.7994, 1.6154], "spans": [{"offset": 8, "length": 8}]}, {"content": "1 Redmond + way Suite", "boundingBox": [0.8106, 1.7033, 2.1445, 1.7033, 2.1445, 1.8342, + 0.8106, 1.8342], "spans": [{"offset": 17, "length": 19}]}, {"content": "6000 + Redmond, WA", "boundingBox": [0.8019, 1.896, 2.0384, 1.896, 2.0384, 2.0171, + 0.8019, 2.0171], "spans": [{"offset": 37, "length": 16}]}, {"content": "99243", + "boundingBox": [0.8025, 2.0876, 1.175, 2.0876, 1.175, 2.1911, 0.8025, 2.1911], + "spans": [{"offset": 54, "length": 5}]}, {"content": "Invoice For: Microsoft", + "boundingBox": [4.4033, 1.5114, 5.8155, 1.5114, 5.8155, 1.6155, 4.4033, 1.6155], + "spans": [{"offset": 60, "length": 22}]}, {"content": "1020 Enterprise Way", + "boundingBox": [5.2036, 1.716, 6.5436, 1.716, 6.5436, 1.8459, 5.2036, 1.8459], + "spans": [{"offset": 83, "length": 19}]}, {"content": "Sunnayvale, CA 87659", + "boundingBox": [5.196, 1.9047, 6.6526, 1.9047, 6.6526, 2.0359, 5.196, 2.0359], + "spans": [{"offset": 103, "length": 20}]}, {"content": "Invoice Number", "boundingBox": + [0.5439, 2.8733, 1.5729, 2.8733, 1.5729, 2.9754, 0.5439, 2.9754], "spans": + [{"offset": 124, "length": 14}]}, {"content": "Invoice Date", "boundingBox": + [1.9491, 2.8733, 2.7527, 2.8733, 2.7527, 2.9754, 1.9491, 2.9754], "spans": + [{"offset": 139, "length": 12}]}, {"content": "Invoice Due Date", "boundingBox": + [3.3495, 2.8733, 4.4547, 2.8733, 4.4547, 2.9754, 3.3495, 2.9754], "spans": + [{"offset": 152, "length": 16}]}, {"content": "Charges", "boundingBox": [4.7468, + 2.8717, 5.289, 2.8717, 5.289, 3.0035, 4.7468, 3.0035], "spans": [{"offset": + 169, "length": 7}]}, {"content": "VAT ID", "boundingBox": [6.141, 2.873, 6.5875, + 2.873, 6.5875, 2.9736, 6.141, 2.9736], "spans": [{"offset": 177, "length": + 6}]}, {"content": "34278587", "boundingBox": [0.5397, 3.411, 1.1457, 3.411, + 1.1457, 3.5144, 0.5397, 3.5144], "spans": [{"offset": 184, "length": 8}]}, + {"content": "6/18/2017", "boundingBox": [1.9455, 3.41, 2.551, 3.41, 2.551, + 3.5144, 1.9455, 3.5144], "spans": [{"offset": 193, "length": 9}]}, {"content": + "6/24/2017", "boundingBox": [3.346, 3.41, 3.9514, 3.41, 3.9514, 3.5144, 3.346, + 3.5144], "spans": [{"offset": 203, "length": 9}]}, {"content": "$56,651.49", + "boundingBox": [5.3871, 3.4047, 6.0702, 3.4047, 6.0702, 3.5321, 5.3871, 3.5321], + "spans": [{"offset": 213, "length": 10}]}, {"content": "PT", "boundingBox": + [6.2285, 3.4114, 6.3919, 3.4114, 6.3919, 3.5119, 6.2285, 3.5119], "spans": + [{"offset": 224, "length": 2}]}], "spans": [{"offset": 0, "length": 226}]}], + "tables": [{"rowCount": 3, "columnCount": 5, "cells": [{"kind": "columnHeader", + "rowIndex": 0, "columnIndex": 0, "rowSpan": 1, "columnSpan": 1, "content": + "Invoice Number", "boundingRegions": [{"pageNumber": 1, "boundingBox": [0.497, + 2.7887, 1.9036, 2.7887, 1.8965, 3.3133, 0.5041, 3.3133]}], "spans": [{"offset": + 124, "length": 14}]}, {"kind": "columnHeader", "rowIndex": 0, "columnIndex": + 1, "rowSpan": 1, "columnSpan": 1, "content": "Invoice Date", "boundingRegions": + [{"pageNumber": 1, "boundingBox": [1.9036, 2.7887, 3.296, 2.7887, 3.3031, + 3.3205, 1.8965, 3.3133]}], "spans": [{"offset": 139, "length": 12}]}, {"kind": + "columnHeader", "rowIndex": 0, "columnIndex": 2, "rowSpan": 1, "columnSpan": + 1, "content": "Invoice Due Date", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [3.296, 2.7887, 4.7026, 2.7887, 4.7026, 3.3205, 3.3031, 3.3205]}], "spans": + [{"offset": 152, "length": 16}]}, {"kind": "columnHeader", "rowIndex": 0, + "columnIndex": 3, "rowSpan": 1, "columnSpan": 1, "content": "Charges", "boundingRegions": + [{"pageNumber": 1, "boundingBox": [4.7026, 2.7887, 6.1021, 2.7887, 6.1021, + 3.3133, 4.7026, 3.3205]}], "spans": [{"offset": 169, "length": 7}]}, {"kind": + "columnHeader", "rowIndex": 0, "columnIndex": 4, "rowSpan": 1, "columnSpan": + 1, "content": "VAT ID", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [6.1021, 2.7887, 7.4945, 2.7887, 7.4945, 3.3133, 6.1021, 3.3133]}], "spans": + [{"offset": 177, "length": 6}]}, {"rowIndex": 1, "columnIndex": 0, "rowSpan": + 2, "columnSpan": 1, "content": "34278587", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [0.5041, 3.3133, 1.8965, 3.3133, 1.8965, 3.8523, 0.5113, + 3.8523]}], "spans": [{"offset": 184, "length": 8}]}, {"rowIndex": 1, "columnIndex": + 1, "rowSpan": 2, "columnSpan": 1, "content": "6/18/2017", "boundingRegions": + [{"pageNumber": 1, "boundingBox": [1.8965, 3.3133, 3.3031, 3.3205, 3.3031, + 3.8523, 1.8965, 3.8523]}], "spans": [{"offset": 193, "length": 9}]}, {"rowIndex": + 1, "columnIndex": 2, "rowSpan": 2, "columnSpan": 1, "content": "6/24/2017", + "boundingRegions": [{"pageNumber": 1, "boundingBox": [3.3031, 3.3205, 4.7026, + 3.3205, 4.7026, 3.8523, 3.3031, 3.8523]}], "spans": [{"offset": 203, "length": + 9}]}, {"rowIndex": 1, "columnIndex": 3, "rowSpan": 2, "columnSpan": 1, "content": + "$56,651.49", "boundingRegions": [{"pageNumber": 1, "boundingBox": [4.7026, + 3.3205, 6.1021, 3.3133, 6.1021, 3.8523, 4.7026, 3.8523]}], "spans": [{"offset": + 213, "length": 10}]}, {"rowIndex": 1, "columnIndex": 4, "rowSpan": 2, "columnSpan": + 1, "content": "PT", "boundingRegions": [{"pageNumber": 1, "boundingBox": [6.1021, + 3.3133, 7.4945, 3.3133, 7.4945, 3.8523, 6.1021, 3.8523]}], "spans": [{"offset": + 224, "length": 2}]}], "boundingRegions": [{"pageNumber": 1, "boundingBox": + [0.5052, 2.7836, 7.4995, 2.7844, 7.4985, 3.8596, 0.5038, 3.859]}], "spans": + [{"offset": 124, "length": 102}]}], "keyValuePairs": [{"key": {"content": + "Address:", "boundingRegions": [{"pageNumber": 1, "boundingBox": [0.7994, + 1.5143, 1.3836, 1.5143, 1.3836, 1.6154, 0.7994, 1.6154]}], "spans": [{"offset": + 8, "length": 8}]}, "value": {"content": "1 Redmond way Suite 6000 Redmond, + WA 99243", "boundingRegions": [{"pageNumber": 1, "boundingBox": [0.8019, 1.7033, + 2.1445, 1.7033, 2.1445, 2.1911, 0.8019, 2.1911]}], "spans": [{"offset": 17, + "length": 42}]}, "confidence": 0.959}, {"key": {"content": "Invoice For:", + "boundingRegions": [{"pageNumber": 1, "boundingBox": [4.4033, 1.5143, 5.1013, + 1.5143, 5.1013, 1.6155, 4.4033, 1.6155]}], "spans": [{"offset": 60, "length": + 12}]}, "value": {"content": "Microsoft 1020 Enterprise Way Sunnayvale, CA + 87659", "boundingRegions": [{"pageNumber": 1, "boundingBox": [5.196, 1.5114, + 6.6526, 1.5114, 6.6526, 2.0359, 5.196, 2.0359]}], "spans": [{"offset": 73, + "length": 50}]}, "confidence": 0.761}, {"key": {"content": "Invoice Number", + "boundingRegions": [{"pageNumber": 1, "boundingBox": [0.5439, 2.8733, 1.5729, + 2.8733, 1.5729, 2.9754, 0.5439, 2.9754]}], "spans": [{"offset": 124, "length": + 14}]}, "value": {"content": "34278587", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [0.5397, 3.411, 1.1457, 3.411, 1.1457, 3.5144, 0.5397, 3.5144]}], + "spans": [{"offset": 184, "length": 8}]}, "confidence": 0.972}, {"key": {"content": + "Invoice Date", "boundingRegions": [{"pageNumber": 1, "boundingBox": [1.9491, + 2.8733, 2.7527, 2.8733, 2.7527, 2.9754, 1.9491, 2.9754]}], "spans": [{"offset": + 139, "length": 12}]}, "value": {"content": "6/18/2017", "boundingRegions": + [{"pageNumber": 1, "boundingBox": [1.9455, 3.41, 2.551, 3.41, 2.551, 3.5144, + 1.9455, 3.5144]}], "spans": [{"offset": 193, "length": 9}]}, "confidence": + 0.972}, {"key": {"content": "Invoice Due Date", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [3.3495, 2.8733, 4.4547, 2.8733, 4.4547, 2.9754, 3.3495, + 2.9754]}], "spans": [{"offset": 152, "length": 16}]}, "value": {"content": + "6/24/2017", "boundingRegions": [{"pageNumber": 1, "boundingBox": [3.346, + 3.41, 3.9514, 3.41, 3.9514, 3.5144, 3.346, 3.5144]}], "spans": [{"offset": + 203, "length": 9}]}, "confidence": 0.951}, {"key": {"content": "Charges", + "boundingRegions": [{"pageNumber": 1, "boundingBox": [4.7468, 2.8717, 5.289, + 2.8717, 5.289, 3.0035, 4.7468, 3.0035]}], "spans": [{"offset": 169, "length": + 7}]}, "value": {"content": "$56,651.49", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [5.3871, 3.4047, 6.0702, 3.4047, 6.0702, 3.5321, 5.3871, + 3.5321]}], "spans": [{"offset": 213, "length": 10}]}, "confidence": 0.339}, + {"key": {"content": "VAT ID", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [6.141, 2.873, 6.5875, 2.873, 6.5875, 2.9736, 6.141, 2.9736]}], "spans": [{"offset": + 177, "length": 6}]}, "value": {"content": "PT", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [6.2285, 3.4114, 6.3919, 3.4114, 6.3919, 3.5119, 6.2285, + 3.5119]}], "spans": [{"offset": 224, "length": 2}]}, "confidence": 0.972}], + "entities": [{"category": "Quantity", "subCategory": "Number", "content": + "34278587", "boundingRegions": [{"pageNumber": 1, "boundingBox": [0.5397, + 3.411, 1.1457, 3.411, 1.1457, 3.5144, 0.5397, 3.5144]}], "confidence": 0.8, + "spans": [{"offset": 184, "length": 8}]}, {"category": "DateTime", "subCategory": + "Date", "content": "6/18/2017", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [1.9455, 3.41, 2.551, 3.41, 2.551, 3.5144, 1.9455, 3.5144]}], "confidence": + 0.8, "spans": [{"offset": 193, "length": 9}]}, {"category": "DateTime", "subCategory": + "Date", "content": "6/24/2017", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [3.346, 3.41, 3.9514, 3.41, 3.9514, 3.5144, 3.346, 3.5144]}], "confidence": + 0.8, "spans": [{"offset": 203, "length": 9}]}, {"category": "Quantity", "subCategory": + "Currency", "content": "$56,651.49", "boundingRegions": [{"pageNumber": 1, + "boundingBox": [5.3871, 3.4047, 6.0702, 3.4047, 6.0702, 3.5321, 5.3871, 3.5321]}], + "confidence": 0.8, "spans": [{"offset": 213, "length": 10}]}, {"category": + "Organization", "content": "PT", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [6.2285, 3.4114, 6.3919, 3.4114, 6.3919, 3.5119, 6.2285, 3.5119]}], "confidence": + 0.91, "spans": [{"offset": 224, "length": 2}]}, {"category": "Organization", + "content": "Contoso", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [0.5384, 1.1583, 1.4466, 1.1583, 1.4466, 1.3534, 0.5384, 1.3534]}], "confidence": + 0.44, "spans": [{"offset": 0, "length": 7}]}, {"category": "Address", "content": + "1 Redmond way Suite 6000 Redmond, WA 99243", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [0.8106, 1.7033, 2.1445, 1.7033, 2.1445, 1.8342, 0.8106, + 1.8342]}, {"pageNumber": 1, "boundingBox": [0.8019, 1.896, 2.0384, 1.896, + 2.0384, 2.0171, 0.8019, 2.0171]}, {"pageNumber": 1, "boundingBox": [0.8025, + 2.0876, 1.175, 2.0876, 1.175, 2.1911, 0.8025, 2.1911]}], "confidence": 0.74, + "spans": [{"offset": 17, "length": 42}]}, {"category": "Quantity", "subCategory": + "Number", "content": "1", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [0.8106, 1.708, 0.8463, 1.708, 0.8463, 1.8053, 0.8106, 1.8053]}], "confidence": + 0.8, "spans": [{"offset": 17, "length": 1}]}, {"category": "Quantity", "subCategory": + "Number", "content": "6000", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [0.8019, 1.896, 1.0991, 1.896, 1.0991, 1.9994, 0.8019, 1.9994]}], "confidence": + 0.8, "spans": [{"offset": 37, "length": 4}]}, {"category": "Quantity", "subCategory": + "Number", "content": "99243", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [0.8025, 2.0876, 1.175, 2.0876, 1.175, 2.1911, 0.8025, 2.1911]}], "confidence": + 0.8, "spans": [{"offset": 54, "length": 5}]}, {"category": "Organization", + "content": "Microsoft", "boundingRegions": [{"pageNumber": 1, "boundingBox": + [5.2045, 1.5114, 5.8155, 1.5114, 5.8155, 1.6151, 5.2045, 1.6151]}], "confidence": + 0.78, "spans": [{"offset": 73, "length": 9}]}, {"category": "Address", "content": + "1020 Enterprise Way Sunnayvale, CA 87659", "boundingRegions": [{"pageNumber": + 1, "boundingBox": [5.2036, 1.716, 6.5436, 1.716, 6.5436, 1.8459, 5.2036, 1.8459]}, + {"pageNumber": 1, "boundingBox": [5.196, 1.9047, 6.6526, 1.9047, 6.6526, 2.0359, + 5.196, 2.0359]}], "confidence": 0.86, "spans": [{"offset": 83, "length": 40}]}, + {"category": "Quantity", "subCategory": "Number", "content": "1020", "boundingRegions": + [{"pageNumber": 1, "boundingBox": [5.2036, 1.716, 5.4935, 1.716, 5.4935, 1.8185, + 5.2036, 1.8185]}], "confidence": 0.8, "spans": [{"offset": 83, "length": 4}]}, + {"category": "Quantity", "subCategory": "Number", "content": "87659", "boundingRegions": + [{"pageNumber": 1, "boundingBox": [6.2801, 1.906, 6.6526, 1.906, 6.6526, 2.0086, + 6.2801, 2.0086]}], "confidence": 0.8, "spans": [{"offset": 118, "length": + 5}]}]}}' + headers: + apim-request-id: + - ba9a4f39-0ac7-428e-9a68-5fa6baef4153 + content-type: + - application/json; charset=utf-8 + date: + - Tue, 19 Oct 2021 19:20:42 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + transfer-encoding: + - chunked + x-content-type-options: + - nosniff + x-envoy-upstream-service-time: + - '378' + status: + code: 200 + message: OK +version: 1 diff --git a/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_document.py b/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_document.py index 232bc6bbaf36..63214439f95c 100644 --- a/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_document.py +++ b/sdk/formrecognizer/azure-ai-formrecognizer/tests/test_document.py @@ -52,6 +52,21 @@ def callback(raw_response, _, headers): # check page range assert len(raw_analyze_result.pages) == len(returned_model.pages) + @FormRecognizerPreparer() + @DocumentAnalysisClientPreparer() + def test_document_line_get_words(self, client): + with open(self.invoice_pdf, "rb") as fd: + document = fd.read() + + poller = client.begin_analyze_document("prebuilt-document", document) + result = poller.result() + + words = result.pages[0].lines[0].get_words("") + assert words[0].content == "Contoso" + words = result.pages[0].lines[2].get_words("") + assert len(words) == 4 + + @FormRecognizerPreparer() @DocumentAnalysisClientPreparer() def test_document_stream_transform_jpg(self, client):