Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 23 additions & 14 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,31 @@ def request_data_matching_file():
@pytest.fixture(scope="session")
def mocked_s3(aws_credentials):
with mock_s3():
s3 = boto3.client("s3", region_name="us-east-1")
s3.create_bucket(Bucket="ppod")
s3.put_object(
Body=open("fixtures/pod.tar.gz", "rb"),
Bucket="ppod",
Key="upload/pod.tar.gz",
)
s3.create_bucket(Bucket="no_files")
s3.create_bucket(Bucket="a_lot_of_files")
for i in range(1001):
with open("fixtures/pod.tar.gz", "rb") as pod_tar, open(
"fixtures/empty.tar.gz", "rb"
) as empty_tar:
s3 = boto3.client("s3", region_name="us-east-1")
s3.create_bucket(Bucket="ppod")
s3.put_object(
Body=str(i),
Bucket="a_lot_of_files",
Key=f"upload/{i}.txt",
Body=pod_tar,
Bucket="ppod",
Key="upload/pod.tar.gz",
)
yield s3
s3.create_bucket(Bucket="empty_tar")
s3.put_object(
Body=empty_tar,
Bucket="empty_tar",
Key="upload/empty.tar.gz",
)
s3.create_bucket(Bucket="no_files")
s3.create_bucket(Bucket="a_lot_of_files")
for i in range(1001):
s3.put_object(
Body=str(i),
Bucket="a_lot_of_files",
Key=f"upload/{i}.txt",
)
yield s3


@pytest.fixture(autouse=True)
Expand Down
Binary file added fixtures/empty.tar.gz
Binary file not shown.
1 change: 1 addition & 0 deletions fixtures/invalid.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<collection><record><leader>01715cam 2200481 a 4500</leader><controlfield tag="005">20210714133158.0</controlfield><controlfield tag="008">961210s1997 paua b 001 0 eng </controlfield><controlfield tag="001">990008205740106761</controlfield><datafield tag="010" ind1=" " ind2=" "><subfield code="a"> 96052458</subfield></datafield><datafield tag="020" ind1=" " ind2=" "><subfield code="z">9780898719574 (electronic bk.)</subfield></datafield><datafield tag="020" ind1=" " ind2=" "><subfield code="z">0898719577 (electronic bk.)</subfield></datafield><datafield tag="020" ind1=" " ind2=" "><subfield code="a">0898713617 (pbk.)</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(MCM)000820574</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(MCM)000820574MIT01</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">bke00000429</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(OCoLC)36084666</subfield></datafield><datafield tag="040" ind1=" " ind2=" "><subfield code="a">DLC</subfield><subfield code="c">DLC</subfield><subfield code="d">C#P</subfield><subfield code="d">MYG</subfield></datafield><datafield tag="049" ind1=" " ind2=" "><subfield code="a">MYGG</subfield></datafield><datafield tag="050" ind1="0" ind2="0"><subfield code="a">QA184</subfield><subfield code="b">.T74 1997</subfield></datafield><datafield tag="082" ind1="0" ind2="0"><subfield code="a">512/.5</subfield><subfield code="2">21</subfield></datafield><datafield tag="100" ind1="1" ind2=" "><subfield code="a">Trefethen, Lloyd N.</subfield><subfield code="q">(Lloyd Nicholas)</subfield></datafield><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Numerical linear algebra /</subfield><subfield code="c">Lloyd N. 
Trefethen, David Bau, III.</subfield></datafield><datafield tag="260" ind1=" " ind2=" "><subfield code="a">Philadelphia :</subfield><subfield code="b">SIAM,</subfield><subfield code="c">c1997.</subfield></datafield><datafield tag="300" ind1=" " ind2=" "><subfield code="a">xii, 361 p. :</subfield><subfield code="b">ill. ;</subfield><subfield code="c">26 cm.</subfield></datafield><datafield tag="336" ind1=" " ind2=" "><subfield code="a">text</subfield><subfield code="b">txt</subfield><subfield code="2">rdacontent</subfield></datafield><datafield tag="337" ind1=" " ind2=" "><subfield code="a">unmediated</subfield><subfield code="b">n</subfield><subfield code="2">rdamedia</subfield></datafield><datafield tag="338" ind1=" " ind2=" "><subfield code="a">volume</subfield><subfield code="b">nc</subfield><subfield code="2">rdacarrier</subfield></datafield><datafield tag="504" ind1=" " ind2=" "><subfield code="a">Includes bibliographical references (p. 343-352) and index.</subfield></datafield><datafield tag="599" ind1=" " ind2=" "><subfield code="a">10820574</subfield></datafield><datafield tag="650" ind1=" " ind2="0"><subfield code="a">Algebras, Linear.</subfield></datafield><datafield tag="650" ind1=" " ind2="0"><subfield code="a">Numerical calculations.</subfield></datafield><datafield tag="793" ind1=" " ind2=" "><subfield code="a">Numerical linear algebra / also by David Bau III</subfield><subfield code="g">9800001258 980001447</subfield></datafield><datafield tag="793" ind1=" " ind2=" "><subfield code="a">Numerical linear algebra /</subfield><subfield code="g">9800004031 980004751</subfield></datafield><datafield tag="700" ind1="1" ind2=" "><subfield code="a">Bau, David.</subfield></datafield><datafield tag="900" ind1="0" ind2=" "><subfield code="b">SCI</subfield><subfield code="d">STACK</subfield><subfield code="f">QA184.T74 1997</subfield><subfield code="8">22499328430006761</subfield></datafield><datafield tag="985" ind1=" " ind2=" "><subfield 
code="u">1</subfield><subfield code="j">0</subfield><subfield code="aa">STACK</subfield><subfield code="v">false</subfield><subfield code="t">BOOK</subfield><subfield code="s">39080022916099</subfield><subfield code="z">STACK</subfield><subfield code="a">23499328410006761</subfield><subfield code="c">01</subfield><subfield code="bb">QA184.T74 1997</subfield><subfield code="h">SCI</subfield><subfield code="i">SCI</subfield></datafield><datafield tag="985" ind1=" " ind2=" "><subfield code="u">0</subfield><subfield code="j">0</subfield><subfield code="aa">STACK</subfield><subfield code="v">false</subfield><subfield code="t">BOOK</subfield><subfield code="s">39080028515432</subfield><subfield code="z">STACK</subfield><subfield code="a">23499328400006761</subfield><subfield code="c">01</subfield><subfield code="bb">QA184.T74 1997</subfield><subfield code="h">SCI</subfield><subfield code="i">SCI</subfield></datafield><datafield tag="985" ind1=" " ind2=" "><subfield code="u">0</subfield><subfield code="j">0</subfield><subfield code="aa">STACK</subfield><subfield code="v">false</subfield><subfield code="t">BOOK</subfield><subfield code="s">39080013586232</subfield><subfield code="z">STACK</subfield><subfield code="a">23499328420006761</subfield><subfield code="c">01</subfield><subfield code="bb">QA184.T74 1997</subfield><subfield code="h">SCI</subfield><subfield code="i">SCI</subfield></datafield></record></collection>
Binary file modified fixtures/pod.tar.gz
Binary file not shown.
264 changes: 1 addition & 263 deletions fixtures/pod.xml

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions fixtures/pod_with_namespaces.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"><record><leader>01715cam 2200481 a 4500</leader><controlfield tag="005">20210714133158.0</controlfield><controlfield tag="008">961210s1997 paua b 001 0 eng </controlfield><controlfield tag="001">990008205740106761</controlfield><datafield tag="010" ind1=" " ind2=" "><subfield code="a"> 96052458</subfield></datafield><datafield tag="020" ind1=" " ind2=" "><subfield code="z">9780898719574 (electronic bk.)</subfield></datafield><datafield tag="020" ind1=" " ind2=" "><subfield code="z">0898719577 (electronic bk.)</subfield></datafield><datafield tag="020" ind1=" " ind2=" "><subfield code="a">0898713617 (pbk.)</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(MCM)000820574</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(MCM)000820574MIT01</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">bke00000429</subfield></datafield><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(OCoLC)36084666</subfield></datafield><datafield tag="040" ind1=" " ind2=" "><subfield code="a">DLC</subfield><subfield code="c">DLC</subfield><subfield code="d">C#P</subfield><subfield code="d">MYG</subfield></datafield><datafield tag="049" ind1=" " ind2=" "><subfield code="a">MYGG</subfield></datafield><datafield tag="050" ind1="0" ind2="0"><subfield code="a">QA184</subfield><subfield code="b">.T74 1997</subfield></datafield><datafield tag="082" ind1="0" ind2="0"><subfield code="a">512/.5</subfield><subfield code="2">21</subfield></datafield><datafield tag="100" ind1="1" ind2=" "><subfield code="a">Trefethen, Lloyd N.</subfield><subfield code="q">(Lloyd Nicholas)</subfield></datafield><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Numerical linear algebra /</subfield><subfield 
code="c">Lloyd N. Trefethen, David Bau, III.</subfield></datafield><datafield tag="260" ind1=" " ind2=" "><subfield code="a">Philadelphia :</subfield><subfield code="b">SIAM,</subfield><subfield code="c">c1997.</subfield></datafield><datafield tag="300" ind1=" " ind2=" "><subfield code="a">xii, 361 p. :</subfield><subfield code="b">ill. ;</subfield><subfield code="c">26 cm.</subfield></datafield><datafield tag="336" ind1=" " ind2=" "><subfield code="a">text</subfield><subfield code="b">txt</subfield><subfield code="2">rdacontent</subfield></datafield><datafield tag="337" ind1=" " ind2=" "><subfield code="a">unmediated</subfield><subfield code="b">n</subfield><subfield code="2">rdamedia</subfield></datafield><datafield tag="338" ind1=" " ind2=" "><subfield code="a">volume</subfield><subfield code="b">nc</subfield><subfield code="2">rdacarrier</subfield></datafield><datafield tag="504" ind1=" " ind2=" "><subfield code="a">Includes bibliographical references (p. 343-352) and index.</subfield></datafield><datafield tag="599" ind1=" " ind2=" "><subfield code="a">10820574</subfield></datafield><datafield tag="650" ind1=" " ind2="0"><subfield code="a">Algebras, Linear.</subfield></datafield><datafield tag="650" ind1=" " ind2="0"><subfield code="a">Numerical calculations.</subfield></datafield><datafield tag="793" ind1=" " ind2=" "><subfield code="a">Numerical linear algebra / also by David Bau III</subfield><subfield code="g">9800001258 980001447</subfield></datafield><datafield tag="793" ind1=" " ind2=" "><subfield code="a">Numerical linear algebra /</subfield><subfield code="g">9800004031 980004751</subfield></datafield><datafield tag="700" ind1="1" ind2=" "><subfield code="a">Bau, David.</subfield></datafield><datafield tag="900" ind1="0" ind2=" "><subfield code="b">SCI</subfield><subfield code="d">STACK</subfield><subfield code="f">QA184.T74 1997</subfield><subfield code="8">22499328430006761</subfield></datafield><datafield tag="985" ind1=" " ind2=" "><subfield 
code="u">1</subfield><subfield code="j">0</subfield><subfield code="aa">STACK</subfield><subfield code="v">false</subfield><subfield code="t">BOOK</subfield><subfield code="s">39080022916099</subfield><subfield code="z">STACK</subfield><subfield code="a">23499328410006761</subfield><subfield code="c">01</subfield><subfield code="bb">QA184.T74 1997</subfield><subfield code="h">SCI</subfield><subfield code="i">SCI</subfield></datafield><datafield tag="985" ind1=" " ind2=" "><subfield code="u">0</subfield><subfield code="j">0</subfield><subfield code="aa">STACK</subfield><subfield code="v">false</subfield><subfield code="t">BOOK</subfield><subfield code="s">39080028515432</subfield><subfield code="z">STACK</subfield><subfield code="a">23499328400006761</subfield><subfield code="c">01</subfield><subfield code="bb">QA184.T74 1997</subfield><subfield code="h">SCI</subfield><subfield code="i">SCI</subfield></datafield><datafield tag="985" ind1=" " ind2=" "><subfield code="u">0</subfield><subfield code="j">0</subfield><subfield code="aa">STACK</subfield><subfield code="v">false</subfield><subfield code="t">BOOK</subfield><subfield code="s">39080013586232</subfield><subfield code="z">STACK</subfield><subfield code="a">23499328420006761</subfield><subfield code="c">01</subfield><subfield code="bb">QA184.T74 1997</subfield><subfield code="h">SCI</subfield><subfield code="i">SCI</subfield></datafield></record></collection>
42 changes: 36 additions & 6 deletions ppod.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import os
import tarfile
from io import BytesIO
from typing import IO, Generator, Optional

import sentry_sdk
Expand All @@ -25,12 +26,41 @@ def lambda_handler(event: dict, context: object) -> dict:
)
for s3_file in s3_files:
logger.info("Processing file: %s", s3_file)
s3_file_content = smart_open.open(f"s3://{bucket}/{s3_file}", "rb")
files = extract_files_from_tar(s3_file_content)
for file in files:
file # do a thing
file_count += 1
return {"files-processed": file_count}
with smart_open.open(f"s3://{bucket}/{s3_file}", "rb") as s3_file_content:
xml_files = extract_files_from_tar(s3_file_content)
for xml_file in xml_files:
if xml_file:
add_namespaces_to_alma_marcxml(xml_file)
# post modified_xml to POD
file_count += 1
else:
raise ValueError(f"No files extracted from {s3_file}")
return {"files_processed": file_count}


def add_namespaces_to_alma_marcxml(xml_file: IO[bytes]) -> BytesIO:
    """Return a copy of a MARCXML document with MARC21 namespaces added.

    The input must begin with exactly the XML declaration, a newline and a
    bare ``<collection>`` start tag. That prefix is replaced by the same
    declaration followed by a ``<collection>`` tag carrying the MARC21 slim
    namespace and schema-location attributes; every byte after the prefix is
    copied through unchanged.

    Args:
        xml_file: Binary stream positioned at the start of the XML document.

    Returns:
        A new ``BytesIO`` holding the rewritten document, rewound to offset 0.

    Raises:
        ValueError: If the stream does not start with the expected prefix.
    """
    expected_prefix = b'<?xml version="1.0" encoding="UTF-8"?>\n<collection>'
    collection_element_with_namespaces = (
        b'<?xml version="1.0" encoding="UTF-8"?>\n'
        b'<collection xmlns="http://www.loc.gov/MARC21/slim" '
        b'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
        b'xsi:schemaLocation="http://www.loc.gov/MARC21/slim '
        b'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">'
    )
    # Compare raw bytes rather than decoding: decoding an arbitrary slice can
    # fail with UnicodeDecodeError (e.g. non-UTF-8 data) and hide the intended
    # error message. The read length is derived from the expected prefix so
    # the two can never drift apart.
    first_chunk = xml_file.read(len(expected_prefix))
    if first_chunk != expected_prefix:
        raise ValueError(
            "XML file does not have expected XML declaration or collection element"
        )
    output = BytesIO()
    output.write(collection_element_with_namespaces)
    # Copy the remainder in fixed-size chunks so the source is never read
    # with a single unbounded call.
    while True:
        chunk = xml_file.read(16384)
        if not chunk:
            break
        output.write(chunk)
    # Rewind so callers can read the result from the beginning.
    output.seek(0)
    return output


def extract_files_from_tar(
Expand Down
37 changes: 33 additions & 4 deletions test_ppod.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import pytest

from ppod import extract_files_from_tar, filter_files_in_bucket, lambda_handler
from ppod import (
add_namespaces_to_alma_marcxml,
extract_files_from_tar,
filter_files_in_bucket,
lambda_handler,
)


def test_ppod_configures_sentry_if_dsn_present(
Expand All @@ -28,7 +33,7 @@ def test_ppod_doesnt_configure_sentry_if_dsn_not_present(

def test_ppod_matching_files(mocked_s3, request_data_matching_file):
output = lambda_handler(request_data_matching_file, {})
assert output == {"files-processed": 1}
assert output == {"files_processed": 1}


def test_ppod_no_files_raises_exception(
Expand All @@ -39,15 +44,39 @@ def test_ppod_no_files_raises_exception(
lambda_handler(request_data_matching_file, {})


def test_ppod_empty_tar_raises_exception(
    monkeypatch, mocked_s3, request_data_matching_file
):
    """lambda_handler raises ValueError when BUCKET points at "empty_tar"."""
    monkeypatch.setenv(name="BUCKET", value="empty_tar")
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        lambda_handler(request_data_matching_file, {})


def test_ppod_no_matching_files_raises_exception(mocked_s3):
    """lambda_handler raises KeyError for the "download/" filename prefix."""
    expect_key_error = pytest.raises(KeyError)
    with expect_key_error:
        lambda_handler({"filename-prefix": "download/"}, {})


def test_add_namespaces_to_alma_marcxml():
    """The rewritten pod.xml matches the pod_with_namespaces.xml fixture."""
    source_path = "fixtures/pod.xml"
    expected_path = "fixtures/pod_with_namespaces.xml"
    with open(source_path, "rb") as source, open(expected_path, "rb") as expected:
        rewritten = add_namespaces_to_alma_marcxml(source)
        actual_bytes = rewritten.read()
        expected_bytes = expected.read()
        assert actual_bytes == expected_bytes


def test_add_namespaces_to_alma_marcxml_invalid_xml_raises_exception():
    """add_namespaces_to_alma_marcxml raises ValueError for invalid.xml."""
    fixture_path = "fixtures/invalid.xml"
    with pytest.raises(ValueError):
        with open(fixture_path, "rb") as invalid_xml:
            add_namespaces_to_alma_marcxml(invalid_xml)


def test_extract_files_from_tar():
files = extract_files_from_tar(open("fixtures/pod.tar.gz", "rb"))
assert next(files).read() == open("fixtures/pod.xml", "rb").read()
with open("fixtures/pod.tar.gz", "rb") as pod_tar, open(
"fixtures/pod.xml", "rb"
) as pod_xml:
files = extract_files_from_tar(pod_tar)
assert next(files).read() == pod_xml.read()


def test_filter_files_in_bucket_with_1001_matching_file(mocked_s3):
Expand Down