diff --git a/normalized.json b/normalized.json index c5c0420f..67c2a63e 100644 --- a/normalized.json +++ b/normalized.json @@ -187,6 +187,12 @@ ], "type": "object", "properties": { + "documentType": { + "description": "The type of the document", + "enum": [ + null, "article", "abstract", "dataset", "book", "book-chapter", "dissertation", "correction", "preprint", "source-code", "clinical-trial", "reference-entry", "monograph" + ] + }, "publisher": { "type": "object", "anyOf": [ diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py index 7e8d6c3d..d33ba6d8 100644 --- a/scrapi/base/schemas.py +++ b/scrapi/base/schemas.py @@ -15,7 +15,6 @@ dif_process_contributors ) - DOESCHEMA = { "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)), "contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)), diff --git a/scrapi/harvesters/crossref.py b/scrapi/harvesters/crossref.py index fc473512..00f64329 100644 --- a/scrapi/harvesters/crossref.py +++ b/scrapi/harvesters/crossref.py @@ -86,6 +86,7 @@ def schema(self): 'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []), 'tags': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]), 'subjects': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]), + 'documentType': ('/type', lambda x: document_type_mapping.get(x)), 'otherProperties': build_properties( ('journalTitle', '/container-title'), ('volume', '/volume'), @@ -131,3 +132,15 @@ def harvest(self, start_date=None, end_date=None): })) return doc_list + +document_type_mapping = { + "book": "book", + "book-chapter": "book-chapter", + "dataset": "dataset", + "dissertation": "dissertation", + "journal-article": "article", + "monograph": "monograph", + "proceedings-article": "article", + "reference-entry": "reference-entry", + "report": "article", +} diff --git a/scrapi/harvesters/plos.py b/scrapi/harvesters/plos.py index c4e7a836..61a694b8 100644 --- a/scrapi/harvesters/plos.py +++ b/scrapi/harvesters/plos.py @@ -106,9 +106,15 @@ def harvest(self, start_date=None, end_date=None): 'publisher': { 'name': ('//str[@name="journal"]/node()', single_result) }, + 'documentType': ('//str[@name="article_type"]/node()', lambda x: document_type_mapping.get(single_result(x).lower(), None)), 'otherProperties': build_properties( ('eissn', '//str[@name="eissn"]/node()'), ('articleType', '//str[@name="article_type"]/node()'), ('score', '//float[@name="score"]/node()') ) } + +document_type_mapping = { + 'research article': 'article', + 'correction': 'correction', +} diff --git a/scrapi/settings/defaults.py b/scrapi/settings/defaults.py index 8cadeee8..adc1faa5 100644 --- a/scrapi/settings/defaults.py +++ b/scrapi/settings/defaults.py @@ -47,6 +47,7 @@ FRONTEND_KEYS = [ "uris", "contributors", + "documentType", "providerUpdatedDateTime", "description", "title", diff --git a/scrapi/settings/local-dist.py b/scrapi/settings/local-dist.py index a82b53db..7656195b 100644 --- a/scrapi/settings/local-dist.py +++ b/scrapi/settings/local-dist.py @@ -1,3 +1,4 @@ RAW_PROCESSING = ['postgres'] NORMALIZED_PROCESSING = ['elasticsearch', 'postgres'] RESPONSE_PROCESSOR = 'postgres' +CANONICAL_PROCESSOR = 'postgres'