HEPData · mhabedan · Apr 25, 2025 · Apr 25, 2025 · Apr 25, 2025 · Apr 25, 2025
diff --git a/hepdata/modules/records/utils/analyses.py b/hepdata/modules/records/utils/analyses.py
@@ -23,11 +23,14 @@
 # as an Intergovernmental Organization or submit itself to any jurisdiction.
 
 import logging
+import os
 
 from celery import shared_task
 from flask import current_app
 from invenio_db import db
 import requests
+import json
+import jsonschema
 
 from hepdata.ext.opensearch.api import index_record_ids
 from hepdata.modules.submission.api import get_latest_hepsubmission, is_resource_added_to_submission
@@ -40,6 +43,10 @@
 logging.basicConfig()
 log = logging.getLogger(__name__)
 
+def get_analyses_schema():
+    schema_path = os.path.join("hepdata", "templates", "analyses_schema.json")
+    with open(schema_path) as f:
+        return json.load(f)
 
 @shared_task
 def update_analyses(endpoint=None):
@@ -50,6 +57,8 @@ def update_analyses(endpoint=None):
 
     :param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "CheckMATE" or "HackAnalysis" or "Combine" or None (default) for all
     """
+    analyses_schema = get_analyses_schema()
+
     endpoints = current_app.config["ANALYSES_ENDPOINTS"]
     for analysis_endpoint in endpoints:
 
@@ -64,62 +73,133 @@ def update_analyses(endpoint=None):
 
             if response and response.status_code == 200:
 
-                analyses = response.json()
-
                 analysis_resources = DataResource.query.filter_by(file_type=analysis_endpoint).all()
 
-                # Check for missing analyses.
-                for record in analyses:
-                    submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')
-
-                    if submission:
-                        num_new_resources = 0
-
-                        for analysis in analyses[record]:
-                            _resource_url = endpoints[analysis_endpoint]["url_template"].format(analysis)
-
-                            if not is_resource_added_to_submission(submission.publication_recid, submission.version,
-                                                                   _resource_url):
-
-                                log.info('Adding {} analysis to ins{} with URL {}'.format(
-                                    analysis_endpoint, record, _resource_url)
-                                )
-                                new_resource = DataResource(
-                                    file_location=_resource_url,
-                                    file_type=analysis_endpoint)
-
-                                if "description" in endpoints[analysis_endpoint]:
-                                    new_resource.file_description = str(endpoints[analysis_endpoint]["description"])
-
-                                if "license" in endpoints[analysis_endpoint]:
-                                    resource_license = get_license(endpoints[analysis_endpoint]["license"])
-                                    new_resource.file_license = resource_license.id
-
-                                submission.resources.append(new_resource)
-                                num_new_resources += 1
-
-                            else:
-
-                                # Remove resources from 'analysis_resources' list.
-                                resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources))
-                                for resource in resources:
-                                    analysis_resources.remove(resource)
-
-                        if num_new_resources:
-
-                            try:
-                                db.session.add(submission)
-                                db.session.commit()
-                                latest_submission = get_latest_hepsubmission(inspire_id=record)
-                                if submission.version == latest_submission.version:
-                                    index_record_ids([submission.publication_recid])
-                            except Exception as e:
-                                db.session.rollback()
-                                log.error(e)
-
-                    else:
-                        log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format(
-                            analysis_endpoint, record))
+                r_json = response.json()
+                try:
+                    jsonschema.validate(instance=r_json, schema=analyses_schema)
+                    new_json = True
+                except jsonschema.ValidationError:
+                    new_json = False
+
+                if new_json:
+
+                    # Check for missing analyses.
+                    for ana in r_json["analyses"]:
+                        inspire_id = ana["inspire_id"]
+                        submission = get_latest_hepsubmission(inspire_id=str(inspire_id), overall_status='finished') # TODO: make inspire_id an int
+
+                        if submission:
+                            num_new_resources = 0
+
+                            for implementation in ana["implementations"]:
+                                ana_name = implementation["name"]
+                                ana_path = implementation["path"] if "path" in implementation else ""
+                                _resource_url = r_json["url_templates"]["main_url"]
+                                prev_url = None
+                                n_tries, max_tries = 0, 10
+                                while _resource_url!=prev_url and n_tries<max_tries:
+                                    prev_url = _resource_url
+                                    _resource_url = _resource_url.format(name=ana_name, path=ana_path)
+                                    n_tries += 1
+
+                                if not is_resource_added_to_submission(submission.publication_recid, submission.version,
+                                                                    _resource_url):
+
+                                    log.info('Adding {} analysis to ins{} with URL {}'.format(
+                                        analysis_endpoint, inspire_id, _resource_url)
+                                    )
+                                    new_resource = DataResource(
+                                        file_location=_resource_url,
+                                        file_type=analysis_endpoint,
+                                        file_description=r_json["implementations_description"]
+                                    )
+
+                                    if "license" in r_json:
+                                        resource_license = get_license(r_json["license"])
+                                        new_resource.file_license = resource_license.id
+
+                                    submission.resources.append(new_resource)
+                                    num_new_resources += 1
+
+                                else:
+
+                                    # Remove resources from 'analysis_resources' list.
+                                    resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources))
+                                    for resource in resources:
+                                        analysis_resources.remove(resource)
+
+                            if num_new_resources:
+
+                                try:
+                                    db.session.add(submission)
+                                    db.session.commit()
+                                    latest_submission = get_latest_hepsubmission(inspire_id=inspire_id)
+                                    if submission.version == latest_submission.version:
+                                        index_record_ids([submission.publication_recid])
+                                except Exception as e:
+                                    db.session.rollback()
+                                    log.error(e)
+
+                        else:
+                            log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format(
+                                analysis_endpoint, inspire_id))
+
+                else: # old JSON file
+                    analyses = r_json
+
+                    # Check for missing analyses.
+                    for record in analyses:
+                        submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')
+
+                        if submission:
+                            num_new_resources = 0
+
+                            for analysis in analyses[record]:
+                                _resource_url = endpoints[analysis_endpoint]["url_template"].format(analysis)
+
+                                if not is_resource_added_to_submission(submission.publication_recid, submission.version,
+                                                                    _resource_url):
+
+                                    log.info('Adding {} analysis to ins{} with URL {}'.format(
+                                        analysis_endpoint, record, _resource_url)
+                                    )
+                                    new_resource = DataResource(
+                                        file_location=_resource_url,
+                                        file_type=analysis_endpoint)
+
+                                    if "description" in endpoints[analysis_endpoint]:
+                                        new_resource.file_description = str(endpoints[analysis_endpoint]["description"])
+
+                                    if "license" in endpoints[analysis_endpoint]:
+                                        resource_license = get_license(endpoints[analysis_endpoint]["license"])
+                                        new_resource.file_license = resource_license.id
+
+                                    submission.resources.append(new_resource)
+                                    num_new_resources += 1
+
+                                else:
+
+                                    # Remove resources from 'analysis_resources' list.
+                                    resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources))
+                                    for resource in resources:
+                                        analysis_resources.remove(resource)
+
+                            if num_new_resources:
+
+                                try:
+                                    db.session.add(submission)
+                                    db.session.commit()
+                                    latest_submission = get_latest_hepsubmission(inspire_id=record)
+                                    if submission.version == latest_submission.version:
+                                        index_record_ids([submission.publication_recid])
+                                except Exception as e:
+                                    db.session.rollback()
+                                    log.error(e)
+
+                        else:
+                            log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format(
+                                analysis_endpoint, record))
 
                 if analysis_resources:
                     # Extra resources that were not found in the analyses JSON file.
@@ -154,10 +234,18 @@ def update_analyses(endpoint=None):
                 if "subscribe_user_id" in endpoints[analysis_endpoint]:
                     user = get_user_from_id(endpoints[analysis_endpoint]["subscribe_user_id"])
                     if user:
-                        for record in analyses:
-                            submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')
-                            if submission and not is_current_user_subscribed_to_record(submission.publication_recid, user):
-                                subscribe(submission.publication_recid, user)
+                        # Check for missing analyses.
+                        if new_json:
+                            for ana in r_json["analyses"]:
+                                submission = get_latest_hepsubmission(inspire_id=str(ana["inspire_id"]), overall_status='finished')
+                                if submission and not is_current_user_subscribed_to_record(submission.publication_recid, user):
+                                    subscribe(submission.publication_recid, user)
+
+                        else: # old JSON file
+                            for record in analyses:
+                                submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')
+                                if submission and not is_current_user_subscribed_to_record(submission.publication_recid, user):
+                                    subscribe(submission.publication_recid, user)
 
         else:
             log.debug("No endpoint url configured for {0}".format(analysis_endpoint))
diff --git a/hepdata/templates/analyses_schema.json b/hepdata/templates/analyses_schema.json
@@ -0,0 +1,141 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hepdata.net/analyses/schemas/1.0.0/analyses_schema.json",
+  "title": "HEPData analysis tool schema",
+  "description": "A JSON schema for tracking implementations of HEPData analyses in different tools",
+  "type": "object",
+  "required": ["schema_version", "tool", "version", "date_created", "implementations_description", "url_templates", "analyses"],
+
+  "properties": {
+    "schema_version": {
+      "description": "The version of the JSON schema applying to this file",
+      "const": "1.0.0"
+    },
+
+    "tool": {
+      "description": "The name of the tool used to implement the analyses",
+      "type": "string"
+    },
+
+    "version": {
+      "description": "The version of the tool used to implement the analyses",
+      "type": "string"
+    },
+
+    "date_created": {
+      "description": "The date at which the JSON file was created, formatted as RFC 3339, section 5.6 (https://json-schema.org/understanding-json-schema/reference/type#dates-and-times), e.g. 2018-11-13T20:20:39+00:00",
+      "type": "string",
+      "format": "date-time"
+    },
+
+    "implementations_description": {
+      "description": "The type of information provided for the analyses by the tool",
+      "type": "string"
+    },
+
+    "url_templates": {
+      "description": "Templates for URLs to the main repository and important other pages",
+      "type": "object",
+      "required": ["main_url"],
+
+      "properties": {
+        "main_url": {
+          "description": "The URL template for the main repository. May contain {name} and {path} placeholders, which are replaced by the implementation's name and path.",
+          "type": "string"
+        },
+        "val_url": {
+          "description": "The URL template for the validation page. Should contain e.g. a {name} placeholder for the analysis name.",
+          "type": "string"
+        }
+      }
+    },
+
+    "analyses": {
+      "description": "The analyses implemented in the tool",
+      "type": "array",
+      "items": {
+        "type": "object",
+        "$ref": "#/$defs/Analysis",
+        "minItems": 1,
+        "uniqueItems": true
+      }
+    },
+
+    "implementations_license": {
+      "description": "The license for the implementations of the analyses in the tool. Taken to be CC0 if not specified.",
+      "type": "object",
+      "required": ["name", "url"],
+      "additionalProperties": false,
+
+      "properties": {
+        "name": {
+          "description": "The name of the license",
+          "type": "string",
+          "maxLength": 256
+        },
+        "url": {
+          "description": "The URL to the license",
+          "type": "string",
+          "maxLength": 256
+        },
+        "description": {
+          "description": "A description of the license",
+          "type": "string"
+        }
+      }
+    }
+  },
+
+  "$defs": {
+
+    "Analysis": {
+      "description": "An analysis, identified by the INSPIRE ID, implemented at least once in a tool",
+      "type": "object",
+      "required": ["inspire_id", "implementations"],
+
+      "properties": {
+        "inspire_id": {
+          "description": "The INSPIRE ID of the analysis",
+          "type": "number"
+        },
+        "implementations":{
+          "description": "The implementations of the analysis in the tool",
+          "type": "array",
+          "items": {
+            "type": "object",
+            "$ref": "#/$defs/Implementation",
+            "minItems": 1,
+            "uniqueItems": true
+          }
+        },
+        "signature_type": {
+          "description": "The signature of the analysis, e.g. 'prompt', 'displaced'",
+          "type": "string"
+        },
+        "pretty_name": {
+          "description": "A pretty name for the analysis",
+          "type": "string"
+        }
+      }
+    },
+
+    "Implementation": {
+      "description": "An implementation of an analysis in a tool, giving the internal name to retrieve information",
+      "type": "object",
+      "required": ["name"],
+
+      "properties": {
+        "name": {
+          "description": "Internal name of the implementation",
+          "type": "string"
+        },
+        "path": {
+          "description": "The path to the implementation in the tool",
+          "type": "string"
+        }
+      }
+    }
+
+  }
+
+}