From 457461115eb854fb3b3c802f747d728e1736edf5 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Tue, 24 May 2022 13:54:55 -0400 Subject: [PATCH 1/3] ENSY-70-insert-namespaces Why these changes are being introduced: * Alma MARCXML lacks namespaces in the collection element which are required for validation by POD How this addresses that need: * Insert namespaces with replace string method * Add fixture and unit test for new functionality Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/ENSY-70 --- fixtures/pod_with_namespaces.xml | 264 +++++++++++++++++++++++++++++++ ppod.py | 24 ++- test_ppod.py | 12 +- 3 files changed, 295 insertions(+), 5 deletions(-) create mode 100644 fixtures/pod_with_namespaces.xml diff --git a/fixtures/pod_with_namespaces.xml b/fixtures/pod_with_namespaces.xml new file mode 100644 index 0000000..dd40f58 --- /dev/null +++ b/fixtures/pod_with_namespaces.xml @@ -0,0 +1,264 @@ + + + + 01168nam 2200385Ia 4500 + 20210714130252.0 + 990603s1999 ne a 000 0 eng d + 990008915090106761 + + 9071570231 (pbk.) + + + (MCM)000891509 + + + (MCM)000891509MIT01 + + + (OCoLC)41479068 + + + FBR + FBR + MYG + + + MYGG + + + NA2750 + .M34 1999 + + + 720.1 + 21 + + + Magyar, P\xc3\xa9ter. + + + Thought palaces / + Peter Magyar. + + + Amsterdam : + Architectura & Natura Press, + c1999. + + + 333 p. : + ill. ; 24 cm. + + + text + txt + rdacontent + + + unspecified + z + rdamedia + + + unspecified + zu + rdacarrier + + + Includes bibliographical references (p. 16). + + + 10891509 + + + Architectural design. + + + Architecture + Philosophy. + + + Thought palaces + 2000005118 200006007 + + + RTC + STACK + NA2750.M34 1999 + 22527227150006761 + + + 1 + 0 + STACK + false + BOOK + 39080015616003 + STACK + 23527227120006761 + 01 + NA2750.M34 1999 + RTC + RTC + + + + 02007nam 2200481 4500 + 20210822074144.0 + 740426s1974 nyu b 101 0 eng + 990000101910106761 + + 73088720 + + + 0914362097 + + + (MCM)000010191 + + + (MCM)000010191MIT01 + + + (OCoLC)00902533 + + + DLC + DLC + MYG + + + a-cc--- + + + MYGH + [769195] + MYGR + [758319] + + + R601 + .M43 + + + R601.M43 + + + Medicine and society in China; + report of a conference sponsored jointly by the National Library of Medicine and the Josiah Macy, Jr. Foundation. Edited by John Z. Bowers and Elizabeth F. Purcell. + + + New York, + Josiah Macy, Jr. Foundation + [1974] + + + vii, 176 p. + 23 cm. + + + text + txt + rdacontent + + + unmediated + n + rdamedia + + + volume + nc + rdacarrier + + + The Macy Foundation series on medicine and public health in China + + + committed to retain + 20170930 + 20421231 + HathiTrust + https://www.hathitrust.org/shared%5Fprint%5Fprogram + MCM + + + Includes bibliographical references. + + + Medicine + China + History + Congresses. + + + Public health + China + Congresses. + + + Conference papers and proceedings. + lcgft + + + Bowers, John Z., + 1913-1993. + + + Purcell, Elizabeth. + + + National Library of Medicine (U.S.) + + + Josiah Macy, Jr. Foundation. + + + Conference on Medicine and Society in China + (1973) + + + Macy Foundation series on medicine and public health in China. + + + LSA + OCC + R601.M43 + 22527225770006761 + + + HUM + STACK + R601.M43 + 22527225790006761 + + + 1 + 0 + STACK + false + BOOK + 39080000528593 + STACK + 23527225780006761 + 01 + R601.M43 + HUM + HUM + + + 1 + 0 + OCC + false + BOOK + 39080019409462 + OCC + 23527225760006761 + 15 + R601.M43 + LSA + LSA + + + \ No newline at end of file diff --git a/ppod.py b/ppod.py index e10136d..2838162 100644 --- a/ppod.py +++ b/ppod.py @@ -26,13 +26,29 @@ def lambda_handler(event: dict, context: object) -> dict: for s3_file in s3_files: logger.info("Processing file: %s", s3_file) s3_file_content = smart_open.open(f"s3://{bucket}/{s3_file}", "rb") - files = extract_files_from_tar(s3_file_content) - for file in files: - file # do a thing - file_count += 1 + xml_files = extract_files_from_tar(s3_file_content) + for xml_file in xml_files: + if xml_file: + add_namespaces_to_alma_marcxml(xml_file) + # post modified_xml to POD + file_count += 1 return {"files-processed": file_count} +def add_namespaces_to_alma_marcxml(xml_file: IO[bytes]) -> str: + collection_element_with_namespaces = ( + "\n" + ) + return ( + xml_file.read() + .decode("utf-8") + .replace("\n", collection_element_with_namespaces) + ) + + def extract_files_from_tar( tar_file: IO[bytes], ) -> Generator[Optional[IO[bytes]], None, None]: diff --git a/test_ppod.py b/test_ppod.py index 36f7531..6c08a07 100644 --- a/test_ppod.py +++ b/test_ppod.py @@ -2,7 +2,12 @@ import pytest -from ppod import extract_files_from_tar, filter_files_in_bucket, lambda_handler +from ppod import ( + add_namespaces_to_alma_marcxml, + extract_files_from_tar, + filter_files_in_bucket, + lambda_handler, +) def test_ppod_configures_sentry_if_dsn_present( @@ -45,6 +50,11 @@ def test_ppod_no_matching_files_raises_exception(mocked_s3): lambda_handler(request_data, {}) +def test_add_namespaces_to_alma_marcxml(): + modified_xml = add_namespaces_to_alma_marcxml(open("fixtures/pod.xml", "rb")) + assert modified_xml == open("fixtures/pod_with_namespaces.xml", "r").read() + + def test_extract_files_from_tar(): files = extract_files_from_tar(open("fixtures/pod.tar.gz", "rb")) assert next(files).read() == open("fixtures/pod.xml", "rb").read() From d2285c7a85c551496282d4f4a80c768f551a3e2a Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 26 May 2022 14:33:37 -0400 Subject: [PATCH 2/3] Updates based on discussion in PR #3 * Add context manager to mocked_s3 fixture * Add empty tar file fixture * Update fixtures to match expected format of MARCXML files * Add context manager to lambda_handler * Change dash to underscore for lambda_handler output due to Step Function requirements * Add exception for failed tar file extraction * Update add_namespace_to_alma_marcxml for more efficient processing * Add test for an empty tar file * Add context managers to tests --- conftest.py | 37 +++-- fixtures/empty.tar.gz | Bin 0 -> 529 bytes fixtures/pod.tar.gz | Bin 1707 -> 1018 bytes fixtures/pod.xml | 264 +------------------------------ fixtures/pod_with_namespaces.xml | 264 +------------------------------ ppod.py | 43 ++--- test_ppod.py | 24 ++- 7 files changed, 69 insertions(+), 563 deletions(-) create mode 100644 fixtures/empty.tar.gz diff --git a/conftest.py b/conftest.py index c8c6330..10e70a8 100644 --- a/conftest.py +++ b/conftest.py @@ -21,22 +21,31 @@ def request_data_matching_file(): @pytest.fixture(scope="session") def mocked_s3(aws_credentials): with mock_s3(): - s3 = boto3.client("s3", region_name="us-east-1") - s3.create_bucket(Bucket="ppod") - s3.put_object( - Body=open("fixtures/pod.tar.gz", "rb"), - Bucket="ppod", - Key="upload/pod.tar.gz", - ) - s3.create_bucket(Bucket="no_files") - s3.create_bucket(Bucket="a_lot_of_files") - for i in range(1001): + with open("fixtures/pod.tar.gz", "rb") as pod_tar, open( + "fixtures/empty.tar.gz", "rb" + ) as empty_tar: + s3 = boto3.client("s3", region_name="us-east-1") + s3.create_bucket(Bucket="ppod") s3.put_object( - Body=str(i), - Bucket="a_lot_of_files", - Key=f"upload/{i}.txt", + Body=pod_tar, + Bucket="ppod", + Key="upload/pod.tar.gz", ) - yield s3 + s3.create_bucket(Bucket="empty_tar") + s3.put_object( + Body=empty_tar, + Bucket="empty_tar", + Key="upload/empty.tar.gz", + ) + s3.create_bucket(Bucket="no_files") + s3.create_bucket(Bucket="a_lot_of_files") + for i in range(1001): + s3.put_object( + Body=str(i), + Bucket="a_lot_of_files", + Key=f"upload/{i}.txt", + ) + yield s3 @pytest.fixture(autouse=True) diff --git a/fixtures/empty.tar.gz b/fixtures/empty.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..fbcc5ec708b67d3d2856bc2b9830515912aed8c8 GIT binary patch literal 529 zcmV+s0`C1EiwFQLrH^6&1MQeWZ`v>vfbEQFj9oZQJNmL~W;;&O9x5disv>o2)l}7X z>Y@@hBuFY6B#!y%bMR}$(gie$GFsjzSzmzX7blMGm|2>S@^La4X5&FJ{X(7t&bjZp zY+=t^m`i00FrlP!d?|%17#GUtk}&sKT2ofFE+#c7$>q;rp!vm!0)H;pOpZ|GH<-<2;|*x7oyw z@9xHleKJb*gVIlj*-`&E{Ao%j?1d|7>CTRH!n5zI60VzMTHC!#ub1ljzxmaF-S`)Z zb5&jc9Z%u+a^HlaMhTFwVK5ClOG1let|3?ty6+;`{sM{fr8-k|+y8Ku=E=c`LfMxzH{Ipb;fdg=1j4uKR Tf*=TjAlu?Mrp|*X04e|gsO$-@ literal 0 HcmV?d00001 diff --git a/fixtures/pod.tar.gz b/fixtures/pod.tar.gz index 28ac0d11e52e4fda39f14028514dbc393d193e04..f2d2b265b67106243c5e2dd9a960c9112415738e 100644 GIT binary patch literal 1018 zcmVo|%1MOH_Z`(E$?(=>Hp}izPD)TOCb5vw5HW-VW4)q@PNJ+HK zRhBYI4$}Vm9r;!xc4-E1uy*hP94gS^cRrpQN!?d@_&6_Ky`*4_CFiIcBPo|7rmO!! z2&If7!YOAFp_~we2_q7}Lij~lt!2@)x&|s{rpu-(zp4jnlZgXowHUM=zZ2=df#{$*MX}`i0TCsS`{hA~HQk zh*FHXB1j`HFz)x(3NX2B>#CSoQ{<@C0F1E+5{fCo5#fX}B2)+yb`=j#Q$Z3-0`nRY z#}NX#*9+Z^DDD10J|PS%m?Go@M*uE{cqxiS#4*7#lH}>cTtn=4W)bj_Epu`mpydPY z9+G~uNLO0mJ#!uCAOSwH#Da36`n}ETq~F^D?qgFg?9V_uQdq?*A~6t07p99&U6nRN z>BsQ$on0KdH*>|#Dq$c`!GK{EL9y<1`Q*KgjmV^7+E=?yBjzQ7w$d=T`H z8=|;lEoqR1W2o@1-T4|<)YG^iv=<78bqZbDH+Ko%u9>(UIj+sbv@=s)q1&SRl%sc{ zgTp_NkZU!Y&8k8-z6}8y?;fMWj(E42o7!f&K!q)hu8}ULCapEUlr21?;QE{mg3glRI@j3~xqP?j>YhX8E#Y*q7y_wO0swIJysy zk+ek-qMzL?sYKb_n z>B%^I%>=zfy39L3^XQvsBKF-o$n(w`Qta2+;m&olYE%uh(q-rJrS5OxOK*f{odLM#h0$J>8mMw(r?#>0$1PO~VBn+i7kvNVW zx}T480@{?V{Ynn_b$=JeuJ>vBjF`@j<&fP|I7ygYas2ketOOC9(bI()pTqp^g_$s+ oBt2!A>Gu`p-sX#9Z|5`PfBj|jEN3~(S^nShFIDz6jsP400Fk8o4gdfE literal 1707 zcmV;c22}YUiwFRg#A;#y1MOPfbJI8w=b67klbO3|XIe*+zo89Na`b>}Xor>u4(>s* zl~{*kd1R+;`0H<-^dnFwSTaqInd+f&(p_nHwY%E=$+?WJ%Srm=J_p9wa~xEC`kpuc z#B}k!dh%V{M#P~ucBx}K7-3=)A3s6(UZuJk)1qVwMBxd`3Yo2>1G3R*%{&Wb(9P#B zGWK4;?xS0Tt$lt(2H^`0Uadx}R=#fvZV zA5zX@u6mddFUZ&gAxbf}1NRL>j*IbL_ln?_Ba&IEWI7T&jZw*9J&fH>k77!&PaI-n z>QW0hmkEEsA9Q+Qh&^l(+3%qF1Farz>E}GfbhuyvfS82 zMgZG25nyZ3A8g+!{jjSK-kf2waS@LD@~FS<5XT3n3e1;xj2z(b<@>rN>!>3w>bTPz zeEhS8I-!X=uw;uhLn9NuJAm^AH?_;K)3XM)gR0!oLQ;AyDb|qGr`W1>@H%=5Y0^9z z2)?l;(b+#Xo-wt9PCj2o_UB6$mRwoJLQu!OBLx1j5NC;;j*}AQEM*ZdP*)>DPB=)6 z7HYH*+%xjTgGm9a$8a(}({RTF6(ypClXt2Z`khVk=ja`qe>qXSD0Vcok1DS=SKG!$ zYTHoU<(8JtB26vy96^Z~O#YRSZ0wQ z_%+th?yH_|6CMlJLbG6lnoTvgjVUF#GFHyTj)`(_vM8O#kmiPBmBHM&q;Yiw@%sL(uTk8do*nf6YDz!4 zt`gUles!GCY*J`ZeM+HGs|+*m2~7x_*GL6(3{#~)D&G>Dbd zcf4}1yA0~AMtHz5{td$81O)zPfbays?L%kn=p#J7gB|J>B=nt%ESpUi^uy{8!Q=|z zK}HAdhvuogC9bnD^uwGkSHCg?G2eR{r(T7xIW0WitW&8(Y_;wZR)sa1gaP!!k*F>pvKSRI61`k^A1Ss@jV(Z5n;WjaH{87dQAK}14k&>bC#p<-%QaegCA zMf6q{f+c9cqS+35t1NUVr&&Dbw9u;b}rNDq;d0BDHUxNK665&8;dhYb5*b}#W^ZCNKznpnk;9;d^$`; zgc8nD$ZOY0+8k30Y!iVaIhlyE%DkmSikD32BqoZ9k3+k0n;%y!4kgrXe1eZUz1OTv z#F?5Fje~aD=_O^E7tgxgi;D{@sU|3^2`s6`-6COdg#G3o<_Zq6`DX=mPMXdR4JFj{ z|3hSU*V zx7A?I^6LT(A}Hd#dWWZ8qPoc-QK!*(WPYgnX1B_Vx^m~b&TL%HLGpqI-rgl)Xtw~< z&4j)xdUtM1;T9s?VngL3(*q5Q9`_sLOQZFjd>zZ}4UOyjt#4?|?D6_TZE>=~ z9=caoZ7dQ#-dSzn&I)6f25z{4J1f@@?)%P4*WD5={}(H%e@S=L$ZZ?DKn)!nIv&+< zMhz{NUD4BbMdIGGD{34~VkPyOd(-N=@a8@FhkmvHaXgO4@i^9w{{YA0{>cC~007?^ BQjP!s diff --git a/fixtures/pod.xml b/fixtures/pod.xml index 590d695..d03d0cb 100644 --- a/fixtures/pod.xml +++ b/fixtures/pod.xml @@ -1,264 +1,2 @@ - - - 01168nam 2200385Ia 4500 - 20210714130252.0 - 990603s1999 ne a 000 0 eng d - 990008915090106761 - - 9071570231 (pbk.) - - - (MCM)000891509 - - - (MCM)000891509MIT01 - - - (OCoLC)41479068 - - - FBR - FBR - MYG - - - MYGG - - - NA2750 - .M34 1999 - - - 720.1 - 21 - - - Magyar, P\xc3\xa9ter. - - - Thought palaces / - Peter Magyar. - - - Amsterdam : - Architectura & Natura Press, - c1999. - - - 333 p. : - ill. ; 24 cm. - - - text - txt - rdacontent - - - unspecified - z - rdamedia - - - unspecified - zu - rdacarrier - - - Includes bibliographical references (p. 16). - - - 10891509 - - - Architectural design. - - - Architecture - Philosophy. - - - Thought palaces - 2000005118 200006007 - - - RTC - STACK - NA2750.M34 1999 - 22527227150006761 - - - 1 - 0 - STACK - false - BOOK - 39080015616003 - STACK - 23527227120006761 - 01 - NA2750.M34 1999 - RTC - RTC - - - - 02007nam 2200481 4500 - 20210822074144.0 - 740426s1974 nyu b 101 0 eng - 990000101910106761 - - 73088720 - - - 0914362097 - - - (MCM)000010191 - - - (MCM)000010191MIT01 - - - (OCoLC)00902533 - - - DLC - DLC - MYG - - - a-cc--- - - - MYGH - [769195] - MYGR - [758319] - - - R601 - .M43 - - - R601.M43 - - - Medicine and society in China; - report of a conference sponsored jointly by the National Library of Medicine and the Josiah Macy, Jr. Foundation. Edited by John Z. Bowers and Elizabeth F. Purcell. - - - New York, - Josiah Macy, Jr. Foundation - [1974] - - - vii, 176 p. - 23 cm. - - - text - txt - rdacontent - - - unmediated - n - rdamedia - - - volume - nc - rdacarrier - - - The Macy Foundation series on medicine and public health in China - - - committed to retain - 20170930 - 20421231 - HathiTrust - https://www.hathitrust.org/shared%5Fprint%5Fprogram - MCM - - - Includes bibliographical references. - - - Medicine - China - History - Congresses. - - - Public health - China - Congresses. - - - Conference papers and proceedings. - lcgft - - - Bowers, John Z., - 1913-1993. - - - Purcell, Elizabeth. - - - National Library of Medicine (U.S.) - - - Josiah Macy, Jr. Foundation. - - - Conference on Medicine and Society in China - (1973) - - - Macy Foundation series on medicine and public health in China. - - - LSA - OCC - R601.M43 - 22527225770006761 - - - HUM - STACK - R601.M43 - 22527225790006761 - - - 1 - 0 - STACK - false - BOOK - 39080000528593 - STACK - 23527225780006761 - 01 - R601.M43 - HUM - HUM - - - 1 - 0 - OCC - false - BOOK - 39080019409462 - OCC - 23527225760006761 - 15 - R601.M43 - LSA - LSA - - - \ No newline at end of file +01715cam 2200481 a 450020210714133158.0961210s1997 paua b 001 0 eng 990008205740106761 960524589780898719574 (electronic bk.)0898719577 (electronic bk.)0898713617 (pbk.)(MCM)000820574(MCM)000820574MIT01bke00000429(OCoLC)36084666DLCDLCC#PMYGMYGGQA184.T74 1997512/.521Trefethen, Lloyd N.(Lloyd Nicholas)Numerical linear algebra /Lloyd N. Trefethen, David Bau, III.Philadelphia :SIAM,c1997.xii, 361 p. :ill. ;26 cm.texttxtrdacontentunmediatednrdamediavolumencrdacarrierIncludes bibliographical references (p. 343-352) and index.10820574Algebras, Linear.Numerical calculations.Numerical linear algebra / also by David Bau III9800001258 980001447Numerical linear algebra /9800004031 980004751Bau, David.SCISTACKQA184.T74 19972249932843000676110STACKfalseBOOK39080022916099STACK2349932841000676101QA184.T74 1997SCISCI00STACKfalseBOOK39080028515432STACK2349932840000676101QA184.T74 1997SCISCI00STACKfalseBOOK39080013586232STACK2349932842000676101QA184.T74 1997SCISCI \ No newline at end of file diff --git a/fixtures/pod_with_namespaces.xml b/fixtures/pod_with_namespaces.xml index dd40f58..292d5e7 100644 --- a/fixtures/pod_with_namespaces.xml +++ b/fixtures/pod_with_namespaces.xml @@ -1,264 +1,2 @@ - - - 01168nam 2200385Ia 4500 - 20210714130252.0 - 990603s1999 ne a 000 0 eng d - 990008915090106761 - - 9071570231 (pbk.) - - - (MCM)000891509 - - - (MCM)000891509MIT01 - - - (OCoLC)41479068 - - - FBR - FBR - MYG - - - MYGG - - - NA2750 - .M34 1999 - - - 720.1 - 21 - - - Magyar, P\xc3\xa9ter. - - - Thought palaces / - Peter Magyar. - - - Amsterdam : - Architectura & Natura Press, - c1999. - - - 333 p. : - ill. ; 24 cm. - - - text - txt - rdacontent - - - unspecified - z - rdamedia - - - unspecified - zu - rdacarrier - - - Includes bibliographical references (p. 16). - - - 10891509 - - - Architectural design. - - - Architecture - Philosophy. - - - Thought palaces - 2000005118 200006007 - - - RTC - STACK - NA2750.M34 1999 - 22527227150006761 - - - 1 - 0 - STACK - false - BOOK - 39080015616003 - STACK - 23527227120006761 - 01 - NA2750.M34 1999 - RTC - RTC - - - - 02007nam 2200481 4500 - 20210822074144.0 - 740426s1974 nyu b 101 0 eng - 990000101910106761 - - 73088720 - - - 0914362097 - - - (MCM)000010191 - - - (MCM)000010191MIT01 - - - (OCoLC)00902533 - - - DLC - DLC - MYG - - - a-cc--- - - - MYGH - [769195] - MYGR - [758319] - - - R601 - .M43 - - - R601.M43 - - - Medicine and society in China; - report of a conference sponsored jointly by the National Library of Medicine and the Josiah Macy, Jr. Foundation. Edited by John Z. Bowers and Elizabeth F. Purcell. - - - New York, - Josiah Macy, Jr. Foundation - [1974] - - - vii, 176 p. - 23 cm. - - - text - txt - rdacontent - - - unmediated - n - rdamedia - - - volume - nc - rdacarrier - - - The Macy Foundation series on medicine and public health in China - - - committed to retain - 20170930 - 20421231 - HathiTrust - https://www.hathitrust.org/shared%5Fprint%5Fprogram - MCM - - - Includes bibliographical references. - - - Medicine - China - History - Congresses. - - - Public health - China - Congresses. - - - Conference papers and proceedings. - lcgft - - - Bowers, John Z., - 1913-1993. - - - Purcell, Elizabeth. - - - National Library of Medicine (U.S.) - - - Josiah Macy, Jr. Foundation. - - - Conference on Medicine and Society in China - (1973) - - - Macy Foundation series on medicine and public health in China. - - - LSA - OCC - R601.M43 - 22527225770006761 - - - HUM - STACK - R601.M43 - 22527225790006761 - - - 1 - 0 - STACK - false - BOOK - 39080000528593 - STACK - 23527225780006761 - 01 - R601.M43 - HUM - HUM - - - 1 - 0 - OCC - false - BOOK - 39080019409462 - OCC - 23527225760006761 - 15 - R601.M43 - LSA - LSA - - - \ No newline at end of file +01715cam 2200481 a 450020210714133158.0961210s1997 paua b 001 0 eng 990008205740106761 960524589780898719574 (electronic bk.)0898719577 (electronic bk.)0898713617 (pbk.)(MCM)000820574(MCM)000820574MIT01bke00000429(OCoLC)36084666DLCDLCC#PMYGMYGGQA184.T74 1997512/.521Trefethen, Lloyd N.(Lloyd Nicholas)Numerical linear algebra /Lloyd N. Trefethen, David Bau, III.Philadelphia :SIAM,c1997.xii, 361 p. :ill. ;26 cm.texttxtrdacontentunmediatednrdamediavolumencrdacarrierIncludes bibliographical references (p. 343-352) and index.10820574Algebras, Linear.Numerical calculations.Numerical linear algebra / also by David Bau III9800001258 980001447Numerical linear algebra /9800004031 980004751Bau, David.SCISTACKQA184.T74 19972249932843000676110STACKfalseBOOK39080022916099STACK2349932841000676101QA184.T74 1997SCISCI00STACKfalseBOOK39080028515432STACK2349932840000676101QA184.T74 1997SCISCI00STACKfalseBOOK39080013586232STACK2349932842000676101QA184.T74 1997SCISCI \ No newline at end of file diff --git a/ppod.py b/ppod.py index 2838162..03ac789 100644 --- a/ppod.py +++ b/ppod.py @@ -1,6 +1,7 @@ import logging import os import tarfile +from io import StringIO from typing import IO, Generator, Optional import sentry_sdk @@ -25,28 +26,34 @@ def lambda_handler(event: dict, context: object) -> dict: ) for s3_file in s3_files: logger.info("Processing file: %s", s3_file) - s3_file_content = smart_open.open(f"s3://{bucket}/{s3_file}", "rb") - xml_files = extract_files_from_tar(s3_file_content) - for xml_file in xml_files: - if xml_file: - add_namespaces_to_alma_marcxml(xml_file) - # post modified_xml to POD - file_count += 1 - return {"files-processed": file_count} + with smart_open.open(f"s3://{bucket}/{s3_file}", "rb") as s3_file_content: + xml_files = extract_files_from_tar(s3_file_content) + for xml_file in xml_files: + if xml_file: + add_namespaces_to_alma_marcxml(xml_file) + # post modified_xml to POD + file_count += 1 + else: + raise ValueError(f"No files extracted from {s3_file}") + return {"files_processed": file_count} -def add_namespaces_to_alma_marcxml(xml_file: IO[bytes]) -> str: +def add_namespaces_to_alma_marcxml(xml_file: IO[bytes]) -> StringIO: collection_element_with_namespaces = ( - "\n" - ) - return ( - xml_file.read() - .decode("utf-8") - .replace("\n", collection_element_with_namespaces) + '\n' + '<' ) + xml_string = StringIO() + xml_line = xml_file.read(52) + if xml_line == b'\n<': + xml_string.write(collection_element_with_namespaces) + xml_file.seek(52) + xml_string.write(xml_file.read().decode("utf-8")) + xml_string.seek(0) + return xml_string def extract_files_from_tar( diff --git a/test_ppod.py b/test_ppod.py index 6c08a07..902c7ae 100644 --- a/test_ppod.py +++ b/test_ppod.py @@ -33,7 +33,7 @@ def test_ppod_doesnt_configure_sentry_if_dsn_not_present( def test_ppod_matching_files(mocked_s3, request_data_matching_file): output = lambda_handler(request_data_matching_file, {}) - assert output == {"files-processed": 1} + assert output == {"files_processed": 1} def test_ppod_no_files_raises_exception( @@ -44,6 +44,14 @@ def test_ppod_no_files_raises_exception( lambda_handler(request_data_matching_file, {}) +def test_ppod_empty_tar_raises_exception( + monkeypatch, mocked_s3, request_data_matching_file +): + monkeypatch.setenv("BUCKET", "empty_tar") + with pytest.raises(ValueError): + lambda_handler(request_data_matching_file, {}) + + def test_ppod_no_matching_files_raises_exception(mocked_s3): request_data = {"filename-prefix": "download/"} with pytest.raises(KeyError): @@ -51,13 +59,19 @@ def test_ppod_no_matching_files_raises_exception(mocked_s3): def test_add_namespaces_to_alma_marcxml(): - modified_xml = add_namespaces_to_alma_marcxml(open("fixtures/pod.xml", "rb")) - assert modified_xml == open("fixtures/pod_with_namespaces.xml", "r").read() + with open("fixtures/pod.xml", "rb") as pod_xml, open( + "fixtures/pod_with_namespaces.xml", "r" + ) as pod_xml_namespaces: + modified_xml = add_namespaces_to_alma_marcxml(pod_xml) + assert modified_xml.read() == pod_xml_namespaces.read() def test_extract_files_from_tar(): - files = extract_files_from_tar(open("fixtures/pod.tar.gz", "rb")) - assert next(files).read() == open("fixtures/pod.xml", "rb").read() + with open("fixtures/pod.tar.gz", "rb") as pod_tar, open( + "fixtures/pod.xml", "rb" + ) as pod_xml: + files = extract_files_from_tar(pod_tar) + assert next(files).read() == pod_xml.read() def test_filter_files_in_bucket_with_1001_matching_file(mocked_s3): From b295b2b863033bfad319a7c59219b33794154976 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Fri, 27 May 2022 10:37:52 -0400 Subject: [PATCH 3/3] Updates based on further discussion in PR#3 * Add invalid XML fixture * Update add_namespaces_to_xml function with streaming chunks to avoid memory issues and change output to BytesIO * Update unit test for new approach * Add test for invalid xml --- fixtures/invalid.xml | 1 + ppod.py | 29 ++++++++++++++++++----------- test_ppod.py | 7 ++++++- 3 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 fixtures/invalid.xml diff --git a/fixtures/invalid.xml b/fixtures/invalid.xml new file mode 100644 index 0000000..1875a99 --- /dev/null +++ b/fixtures/invalid.xml @@ -0,0 +1 @@ +01715cam 2200481 a 450020210714133158.0961210s1997 paua b 001 0 eng 990008205740106761 960524589780898719574 (electronic bk.)0898719577 (electronic bk.)0898713617 (pbk.)(MCM)000820574(MCM)000820574MIT01bke00000429(OCoLC)36084666DLCDLCC#PMYGMYGGQA184.T74 1997512/.521Trefethen, Lloyd N.(Lloyd Nicholas)Numerical linear algebra /Lloyd N. Trefethen, David Bau, III.Philadelphia :SIAM,c1997.xii, 361 p. :ill. ;26 cm.texttxtrdacontentunmediatednrdamediavolumencrdacarrierIncludes bibliographical references (p. 343-352) and index.10820574Algebras, Linear.Numerical calculations.Numerical linear algebra / also by David Bau III9800001258 980001447Numerical linear algebra /9800004031 980004751Bau, David.SCISTACKQA184.T74 19972249932843000676110STACKfalseBOOK39080022916099STACK2349932841000676101QA184.T74 1997SCISCI00STACKfalseBOOK39080028515432STACK2349932840000676101QA184.T74 1997SCISCI00STACKfalseBOOK39080013586232STACK2349932842000676101QA184.T74 1997SCISCI \ No newline at end of file diff --git a/ppod.py b/ppod.py index 03ac789..9cfebf3 100644 --- a/ppod.py +++ b/ppod.py @@ -1,7 +1,7 @@ import logging import os import tarfile -from io import StringIO +from io import BytesIO from typing import IO, Generator, Optional import sentry_sdk @@ -38,22 +38,29 @@ def lambda_handler(event: dict, context: object) -> dict: return {"files_processed": file_count} -def add_namespaces_to_alma_marcxml(xml_file: IO[bytes]) -> StringIO: +def add_namespaces_to_alma_marcxml(xml_file: IO[bytes]) -> BytesIO: collection_element_with_namespaces = ( '\n' '<' + 'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">' ) - xml_string = StringIO() - xml_line = xml_file.read(52) - if xml_line == b'\n<': - xml_string.write(collection_element_with_namespaces) - xml_file.seek(52) - xml_string.write(xml_file.read().decode("utf-8")) - xml_string.seek(0) - return xml_string + output = BytesIO() + first_chunk = xml_file.read(51) + decoded = first_chunk.decode("utf-8") + if decoded != '\n': + raise ValueError( + "XML file does not have expected XML declaration or collection element" + ) + output.write(collection_element_with_namespaces.encode()) + while True: + chunk = xml_file.read(16384) + if not chunk: + break + output.write(chunk) + output.seek(0) + return output def extract_files_from_tar( diff --git a/test_ppod.py b/test_ppod.py index 902c7ae..fb29dc7 100644 --- a/test_ppod.py +++ b/test_ppod.py @@ -60,12 +60,17 @@ def test_ppod_no_matching_files_raises_exception(mocked_s3): def test_add_namespaces_to_alma_marcxml(): with open("fixtures/pod.xml", "rb") as pod_xml, open( - "fixtures/pod_with_namespaces.xml", "r" + "fixtures/pod_with_namespaces.xml", "rb" ) as pod_xml_namespaces: modified_xml = add_namespaces_to_alma_marcxml(pod_xml) assert modified_xml.read() == pod_xml_namespaces.read() +def test_add_namespaces_to_alma_marcxml_invalid_xml_raises_exception(): + with pytest.raises(ValueError), open("fixtures/invalid.xml", "rb") as invalid_xml: + add_namespaces_to_alma_marcxml(invalid_xml) + + def test_extract_files_from_tar(): with open("fixtures/pod.tar.gz", "rb") as pod_tar, open( "fixtures/pod.xml", "rb"