From 10cb7f10272e54c246c3e725d91e22ba33bb7868 Mon Sep 17 00:00:00 2001
From: Karl Holub
Date: Tue, 21 Apr 2020 23:04:20 -0500
Subject: [PATCH] strip BOM before processing records

---
Review notes (text between the "---" separator and the first "diff --git"
line is not part of the commit message):

* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch. Hunk line counts were checked against the @@ headers, but
  exact leading whitespace, the position of blank context lines, and the
  double space in the RIS tags ("TY  - ") are reconstructed; confirm against
  the upstream commit before applying.
* tests/data/example_bom.ris is expected to begin with U+FEFF directly before
  "TY" (the new test relies on it); the character is invisible, so verify it
  survived transport.
* NOTE(review): print(entries) in test_strip_bom looks like leftover debug
  output.
* NOTE(review): str.lstrip("\ufeff") removes every leading U+FEFF, not just
  the first one; presumably harmless here, confirm that is intended.

 rispy/parser.py            | 12 ++++--------
 tests/data/example_bom.ris |  4 ++++
 tests/test_parser.py       | 13 +++++++++++++
 3 files changed, 21 insertions(+), 8 deletions(-)
 create mode 100644 tests/data/example_bom.ris

diff --git a/rispy/parser.py b/rispy/parser.py
index e3d963a..8bb330c 100644
--- a/rispy/parser.py
+++ b/rispy/parser.py
@@ -199,14 +199,7 @@ def load(
     Returns:
         list: Returns list of RIS entries.
     """
-    c = file.read()
-
-    # Corrects for BOM in utf-8 encodings while keeping an 8-bit
-    # string representation
-    if len(c) > 3 and (c[0], c[1], c[2]) == ("\xef", "\xbb", "\xbf"):
-        c = c[3:]
-
-    return list(loads(c, mapping, implementation))
+    return list(loads(file.read(), mapping, implementation))
 
 
 def loads(
@@ -231,6 +224,9 @@ def loads(
         list: Returns list of RIS entries.
     """
 
+    # remove BOM if present
+    obj = obj.lstrip("\ufeff")
+
     filelines = obj.split("\n")
 
     implementation = RisImplementation(implementation)
diff --git a/tests/data/example_bom.ris b/tests/data/example_bom.ris
new file mode 100644
index 0000000..ec22ea9
--- /dev/null
+++ b/tests/data/example_bom.ris
@@ -0,0 +1,4 @@
+﻿TY  - JOUR
+DO  - 10.1186/s40981-020-0316-0
+ER  - 
+
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 2db612b..96bb144 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -141,6 +141,19 @@ def test_starting_newline():
     assert len(entries) == 1
 
 
+def test_strip_bom():
+    expected = {"type_of_reference": "JOUR", "doi": "10.1186/s40981-020-0316-0"}
+
+    filepath = DATA_DIR / "example_bom.ris"
+
+    # we properly decode the content of this file as UTF-8, but leave the BOM
+    with open(filepath, "r", encoding="utf-8") as f:
+        entries = rispy.load(f)
+
+    print(entries)
+    assert expected == entries[0]
+
+
 def test_wos_ris():
     fn = DATA_DIR / "example_wos.ris"
     with open(fn, "r") as f: