CenterForOpenScience · wearpants · Apr 7, 2016 · Apr 12, 2016 · Apr 14, 2016 · Apr 15, 2016
diff --git a/normalized.json b/normalized.json
@@ -187,6 +187,12 @@
     ],
     "type": "object",
     "properties": {
+        "documentType": {
+            "description": "The type of the document",
+            "enum": [
+               null, "article", "abstract", "dataset", "book", "book-chapter", "dissertation", "correction", "preprint", "source-code", "clinical-trial", "reference-entry", "monograph"
+            ]
+        },
         "publisher": {
             "type": "object",
             "anyOf": [

diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
@@ -15,7 +15,6 @@
     dif_process_contributors
 )
 
-
 DOESCHEMA = {
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
     "contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)),

diff --git a/scrapi/harvesters/crossref.py b/scrapi/harvesters/crossref.py
@@ -86,6 +86,7 @@ def schema(self):
             'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []),
             'tags': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
             'subjects': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
+            'documentType': ('/type', lambda x: document_type_mapping.get(x)),
             'otherProperties': build_properties(
                 ('journalTitle', '/container-title'),
                 ('volume', '/volume'),
@@ -131,3 +132,15 @@ def harvest(self, start_date=None, end_date=None):
                 }))
 
         return doc_list
+
+document_type_mapping = {  # Crossref '/type' value -> normalized 'documentType'; the schema uses .get(), so unlisted types become None
+    "book": "book",
+    "book-chapter": "book-chapter",
+    "dataset": "dataset",
+    "dissertation": "dissertation",
+    "journal-article": "article",
+    "monograph": "monograph",
+    "proceedings-article": "article",
+    "reference-entry": "reference-entry",
+    "report": "article",  # NOTE(review): the documentType enum has no "report" value, so reports are folded into "article" - confirm intended
+}
diff --git a/scrapi/harvesters/plos.py b/scrapi/harvesters/plos.py
@@ -106,9 +106,15 @@ def harvest(self, start_date=None, end_date=None):
         'publisher': {
             'name': ('//str[@name="journal"]/node()', single_result)
         },
+        'documentType': ('//str[@name="article_type"]/node()', lambda x: document_type_mapping.get(single_result(x).lower(), None)),
         'otherProperties': build_properties(
             ('eissn', '//str[@name="eissn"]/node()'),
             ('articleType', '//str[@name="article_type"]/node()'),
             ('score', '//float[@name="score"]/node()')
         )
     }
+
+document_type_mapping = {  # lower-cased PLOS 'article_type' -> normalized 'documentType'; the schema lookup yields None for anything else
+    'research article': 'article',
+    'correction': 'correction',
+}
diff --git a/scrapi/settings/defaults.py b/scrapi/settings/defaults.py
@@ -47,6 +47,7 @@
 FRONTEND_KEYS = [
     "uris",
     "contributors",
+    "documentType",
     "providerUpdatedDateTime",
     "description",
     "title",

diff --git a/scrapi/settings/local-dist.py b/scrapi/settings/local-dist.py
@@ -1,3 +1,4 @@
 RAW_PROCESSING = ['postgres']
 NORMALIZED_PROCESSING = ['elasticsearch', 'postgres']
 RESPONSE_PROCESSOR = 'postgres'
+CANONICAL_PROCESSOR = 'postgres'  # NOTE(review): same backend as RESPONSE_PROCESSOR; no reader of this setting is visible in this patch - confirm