Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ name = "pypi"
boto3 = "*"
sentry-sdk = "*"
smart-open = "*"
requests = "*"

[dev-packages]
bandit = "*"
Expand All @@ -17,6 +18,8 @@ isort = "*"
moto = "*"
mypy = "*"
pytest = "*"
requests-mock = "*"
types-requests = "*"

[requires]
python_version = "3.9"
257 changes: 153 additions & 104 deletions Pipfile.lock

Large diffs are not rendered by default.

46 changes: 31 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,42 @@ make lint
```

## Required ENV
`POD_ACCESS_TOKEN` = The POD access token used to authenticate uploads. Access tokens can be found on the `Manage Organization` page.

`BUCKET` = The bucket containing the compressed MARCXML files to be submitted to POD.

`POD_URL` = The POD URL which includes the organization code: `https://pod.stanford.edu/organizations/{Organization Code}/uploads?stream=`

`SENTRY_DSN` = If set to a valid Sentry DSN, enables Sentry exception monitoring. This is not needed for local development.

`WORKSPACE` = Set to `dev` for local development. In the stage and prod environments, Terraform sets this to `stage` and `prod` respectively.

### To run locally
NOTE: These instructions for running locally don't currently work and functionality has to be verified in our dev AWS account.
- Build the container:
```bash
docker build -t ppod .
```
- Run the container:
```bash
docker run -p 9000:8080 -e WORKSPACE=dev ppod:latest
```
- Post data to the container:

### Verify local changes in Dev1
- Ensure your AWS CLI is configured with credentials for the Dev1 account.
- Publish the lambda function:
```bash
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d "{}"
make publish-dev
make update-lambda-dev
```
- Observe output:
```
lambda

#### Submit files to POD test stream
Use the `Test` tab on the lambda to create an `Event JSON` with a `filename-prefix` that matches files in the Dev1 S3 bucket:

```bash
{
"filename-prefix": "exlibris/pod/POD_ALMA_EXPORT_20220523"
}
```

Note: If it has been a while since the last POD export from the Alma sandbox, there may be no files in the Dev1 S3 export bucket, and you may need to run the publishing job from the sandbox.


Observe that the output reflects the correct number of files:

```bash
{
"files_processed": 2
}
```

59 changes: 51 additions & 8 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import boto3
import pytest
from moto import mock_s3
import requests_mock
from moto import mock_s3, mock_ssm


@pytest.fixture(scope="session")
Expand All @@ -13,23 +14,44 @@ def aws_credentials():


@pytest.fixture()
def request_data_matching_file():
request_data = {"filename-prefix": "upload/"}
yield request_data
def marcxml():
with open("fixtures/marc.xml", "rb") as marcxml:
yield marcxml


@pytest.fixture(scope="session")
@pytest.fixture()
def marcxml_with_namespaces():
    """Yield the namespaced MARCXML fixture file, opened in binary mode.

    The file handle is closed automatically once the requesting test ends.
    """
    with open("fixtures/marc_with_namespaces.xml", "rb") as fixture_file:
        yield fixture_file


@pytest.fixture(autouse=True, scope="session")
def mocked_pod():
    """Mock the POD upload endpoints for the whole test session.

    Two POST routes are registered on the mocker:

    * ``stream=default`` -- registered together with the expected bearer-token
      request header.
    * ``stream=not-a-stream`` -- answers with HTTP 404.
    """
    with requests_mock.Mocker() as pod_mocker:
        # Route used by the happy-path tests.
        pod_mocker.post(
            "http://example.example/organizations/ORG/uploads?stream=default",
            request_headers={"Authorization": "Bearer 1234abcd"},
        )
        # Route used to exercise the error path.
        pod_mocker.post(
            "http://example.example/organizations/ORG/uploads?stream=not-a-stream",
            status_code=404,
        )
        yield pod_mocker


@pytest.fixture(autouse=True, scope="session")
def mocked_s3(aws_credentials):
with mock_s3():
with open("fixtures/pod.tar.gz", "rb") as pod_tar, open(
with open("fixtures/marc.tar.gz", "rb") as pod_tar, open(
"fixtures/empty.tar.gz", "rb"
) as empty_tar:
s3 = boto3.client("s3", region_name="us-east-1")
s3.create_bucket(Bucket="ppod")
s3.put_object(
Body=pod_tar,
Bucket="ppod",
Key="upload/pod.tar.gz",
Key="upload/marc.tar.gz",
)
s3.create_bucket(Bucket="empty_tar")
s3.put_object(
Expand All @@ -48,7 +70,28 @@ def mocked_s3(aws_credentials):
yield s3


@pytest.fixture(autouse=True, scope="session")
def mocked_ssm():
    """Provide a moto-mocked SSM client seeded with the POD stream name.

    The parameter ``/apps/ppod/stream-name`` is created with the value
    ``default``; it is the same parameter name that ``lambda_handler`` reads
    to build the POD upload URL.

    Yields:
        The boto3 SSM client bound to the mocked service.
    """
    with mock_ssm():
        ssm = boto3.client("ssm", region_name="us-east-1")
        ssm.put_parameter(
            Name="/apps/ppod/stream-name",
            Value="default",
            # AWS requires a Type when a parameter is first created; passing it
            # keeps the fixture valid against the real API contract.
            Type="String",
        )
        yield ssm


@pytest.fixture()
def request_data_matching_file():
    """Yield a lambda event whose ``filename-prefix`` is ``upload/``."""
    event = {"filename-prefix": "upload/"}
    yield event


@pytest.fixture(autouse=True)
def test_env():
os.environ = {"WORKSPACE": "test", "BUCKET": "ppod"}
os.environ = {
"POD_ACCESS_TOKEN": "1234abcd",
"BUCKET": "ppod",
"POD_URL": "http://example.example/organizations/ORG/uploads?stream=",
"WORKSPACE": "test",
}
yield
File renamed without changes.
File renamed without changes.
File renamed without changes.
43 changes: 40 additions & 3 deletions ppod.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from io import BytesIO
from typing import IO, Generator, Optional

import requests
import sentry_sdk
import smart_open
from boto3 import client
Expand All @@ -18,8 +19,16 @@ def lambda_handler(event: dict, context: object) -> dict:
logger.info(
"Sentry DSN found, exceptions will be sent to Sentry with env=%s", env
)
file_count = 0

bucket = os.environ["BUCKET"]
ssm_client = client("ssm", region_name="us-east-1")
stream = ssm_client.get_parameter(Name="/apps/ppod/stream-name")["Parameter"][
"Value"
]
pod_url = os.environ["POD_URL"] + stream
pod_headers = {"Authorization": f'Bearer {os.environ["POD_ACCESS_TOKEN"]}'}

file_count = 0
s3_files = filter_files_in_bucket(
bucket,
event["filename-prefix"],
Expand All @@ -30,8 +39,16 @@ def lambda_handler(event: dict, context: object) -> dict:
xml_files = extract_files_from_tar(s3_file_content)
for xml_file in xml_files:
if xml_file:
add_namespaces_to_alma_marcxml(xml_file)
# post modified_xml to POD
modified_xml = add_namespaces_to_alma_marcxml(xml_file)
pod_file_name = os.path.basename(s3_file).replace("tar.gz", "xml")
response = post_file_to_pod(
pod_url, pod_headers, pod_file_name, modified_xml
)
logger.info(
"Submited file %s and received response: %s",
pod_file_name,
response,
)
file_count += 1
else:
raise ValueError(f"No files extracted from {s3_file}")
Expand Down Expand Up @@ -84,3 +101,23 @@ def filter_files_in_bucket(bucket: str, prefix: str) -> Generator[str, None, Non
yield s3_object["Key"]
except KeyError:
raise KeyError(f"No files retrieved from {bucket} with prefix {prefix}")


def post_file_to_pod(
    url: str,
    headers: dict,
    pod_file_name: str,
    file_content: BytesIO,
    timeout: float = 30.0,
) -> requests.Response:
    """Post file content to POD with the specified file name.

    Args:
        url: Full POD upload URL to post to.
        headers: HTTP headers sent with the request (e.g. the bearer token).
        pod_file_name: File name reported to POD for the uploaded content.
        file_content: Binary file-like object holding the MARCXML to upload.
        timeout: Seconds to wait for POD before giving up. Without a timeout
            ``requests`` waits indefinitely, which would leave the caller
            blocked on an unresponsive endpoint.

    Returns:
        The response returned by POD for a successful upload.

    Raises:
        requests.exceptions.HTTPError: If POD answers with an error status.
        requests.exceptions.Timeout: If POD does not respond within
            ``timeout`` seconds.
    """
    # The file is sent under the "upload[files][]" form field together with
    # its name and MARCXML content type.
    files = {
        "upload[files][]": (
            pod_file_name,
            file_content,
            "application/marcxml+xml",
        ),
    }
    response = requests.post(
        url,
        headers=headers,
        files=files,
        timeout=timeout,
    )
    # Surface 4xx/5xx responses as exceptions instead of returning them.
    response.raise_for_status()
    return response
11 changes: 7 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
#

-i https://pypi.org/simple
boto3==1.23.2
botocore==1.26.2; python_version >= '3.6'
certifi==2021.10.8
boto3==1.24.0
botocore==1.27.0; python_version >= '3.7'
certifi==2022.5.18.1; python_version >= '3.6'
charset-normalizer==2.0.12; python_version >= '3'
idna==3.3; python_version >= '3'
jmespath==1.0.0; python_version >= '3.7'
python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
s3transfer==0.5.2; python_version >= '3.6'
requests==2.27.1
s3transfer==0.6.0; python_version >= '3.7'
sentry-sdk==1.5.12
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
smart-open==6.0.0
Expand Down
65 changes: 39 additions & 26 deletions test_ppod.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import logging

import pytest
import requests

from ppod import (
add_namespaces_to_alma_marcxml,
extract_files_from_tar,
filter_files_in_bucket,
lambda_handler,
post_file_to_pod,
)


def test_ppod_configures_sentry_if_dsn_present(
caplog, monkeypatch, mocked_s3, request_data_matching_file
caplog, monkeypatch, request_data_matching_file
):
monkeypatch.setenv("SENTRY_DSN", "https://1234567890@00000.ingest.sentry.io/123456")
caplog.set_level(logging.INFO)
Expand All @@ -23,79 +25,90 @@ def test_ppod_configures_sentry_if_dsn_present(


def test_ppod_doesnt_configure_sentry_if_dsn_not_present(
caplog, monkeypatch, mocked_s3, request_data_matching_file
caplog, monkeypatch, request_data_matching_file
):
monkeypatch.delenv("SENTRY_DSN", raising=False)
caplog.set_level(logging.INFO)
lambda_handler(request_data_matching_file, {})
assert "Sentry DSN found" not in caplog.text


def test_ppod_matching_files(mocked_s3, request_data_matching_file):
def test_ppod_matching_files(request_data_matching_file):
output = lambda_handler(request_data_matching_file, {})
assert output == {"files_processed": 1}


def test_ppod_no_files_raises_exception(
monkeypatch, mocked_s3, request_data_matching_file
):
def test_ppod_no_files_raises_exception(monkeypatch, request_data_matching_file):
monkeypatch.setenv("BUCKET", "no_files")
with pytest.raises(KeyError):
lambda_handler(request_data_matching_file, {})


def test_ppod_empty_tar_raises_exception(
monkeypatch, mocked_s3, request_data_matching_file
):
def test_ppod_empty_tar_raises_exception(monkeypatch, request_data_matching_file):
monkeypatch.setenv("BUCKET", "empty_tar")
with pytest.raises(ValueError):
lambda_handler(request_data_matching_file, {})


def test_ppod_no_matching_files_raises_exception(mocked_s3):
def test_ppod_no_matching_files_raises_exception():
request_data = {"filename-prefix": "download/"}
with pytest.raises(KeyError):
lambda_handler(request_data, {})


def test_add_namespaces_to_alma_marcxml():
with open("fixtures/pod.xml", "rb") as pod_xml, open(
"fixtures/pod_with_namespaces.xml", "rb"
) as pod_xml_namespaces:
modified_xml = add_namespaces_to_alma_marcxml(pod_xml)
assert modified_xml.read() == pod_xml_namespaces.read()
def test_add_namespaces_to_alma_marcxml(marcxml, marcxml_with_namespaces):
modified_xml = add_namespaces_to_alma_marcxml(marcxml)
assert modified_xml.read() == marcxml_with_namespaces.read()


def test_add_namespaces_to_alma_marcxml_invalid_xml_raises_exception():
    """Invalid XML input must make the namespace step raise ValueError."""
    with pytest.raises(ValueError):
        with open("fixtures/invalid.xml", "rb") as broken_xml:
            add_namespaces_to_alma_marcxml(broken_xml)


def test_extract_files_from_tar():
with open("fixtures/pod.tar.gz", "rb") as pod_tar, open(
"fixtures/pod.xml", "rb"
) as pod_xml:
def test_extract_files_from_tar(marcxml):
with open("fixtures/marc.tar.gz", "rb") as pod_tar:
files = extract_files_from_tar(pod_tar)
assert next(files).read() == pod_xml.read()
assert next(files).read() == marcxml.read()


def test_filter_files_in_bucket_with_1001_matching_file(mocked_s3):
def test_filter_files_in_bucket_with_1001_matching_file():
files = filter_files_in_bucket("a_lot_of_files", "upload/")
assert len(list(files)) == 1001


def test_filter_files_in_bucket_with_matching_file(mocked_s3):
def test_filter_files_in_bucket_with_matching_file():
files = filter_files_in_bucket("ppod", "upload/")
assert next(files) == "upload/pod.tar.gz"
assert next(files) == "upload/marc.tar.gz"


def test_filter_files_in_bucket_with_no_file(mocked_s3):
def test_filter_files_in_bucket_with_no_file():
with pytest.raises(KeyError):
files = filter_files_in_bucket("no_files", "upload/")
next(files)


def test_filter_files_in_bucket_without_matching_file(mocked_s3):
def test_filter_files_in_bucket_without_matching_file():
with pytest.raises(KeyError):
files = filter_files_in_bucket("ppod", "download/")
next(files)


def test_post_files_to_pod_success(marcxml_with_namespaces):
    """Posting to the mocked default stream returns HTTP 200."""
    upload_url = "http://example.example/organizations/ORG/uploads?stream=default"
    auth_headers = {"Authorization": "Bearer 1234abcd"}

    result = post_file_to_pod(
        upload_url, auth_headers, "pod_file", marcxml_with_namespaces
    )

    assert result.status_code == 200


def test_post_files_to_pod_bad_url_raises_error(marcxml_with_namespaces):
    """Posting to the mocked 404 stream surfaces an HTTPError."""
    missing_stream_url = (
        "http://example.example/organizations/ORG/uploads?stream=not-a-stream"
    )
    auth_headers = {"Authorization": "Bearer 1234abcd"}

    with pytest.raises(requests.exceptions.HTTPError):
        post_file_to_pod(
            missing_stream_url, auth_headers, "pod_file", marcxml_with_namespaces
        )