diff --git a/hepdata/modules/records/utils/analyses.py b/hepdata/modules/records/utils/analyses.py index 188949e0e..7aba654af 100644 --- a/hepdata/modules/records/utils/analyses.py +++ b/hepdata/modules/records/utils/analyses.py @@ -23,11 +23,14 @@ # as an Intergovernmental Organization or submit itself to any jurisdiction. import logging +import os from celery import shared_task from flask import current_app from invenio_db import db import requests +import json +import jsonschema from hepdata.ext.opensearch.api import index_record_ids from hepdata.modules.submission.api import get_latest_hepsubmission, is_resource_added_to_submission @@ -40,6 +43,10 @@ logging.basicConfig() log = logging.getLogger(__name__) +def get_analyses_schema(): + schema_path = os.path.join("hepdata", "templates", "analyses_schema.json") + with open(schema_path) as f: + return json.load(f) @shared_task def update_analyses(endpoint=None): @@ -50,6 +57,8 @@ def update_analyses(endpoint=None): :param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "CheckMATE" or "HackAnalysis" or "Combine" or None (default) for all """ + analyses_schema = get_analyses_schema() + endpoints = current_app.config["ANALYSES_ENDPOINTS"] for analysis_endpoint in endpoints: @@ -64,62 +73,133 @@ def update_analyses(endpoint=None): if response and response.status_code == 200: - analyses = response.json() - analysis_resources = DataResource.query.filter_by(file_type=analysis_endpoint).all() - # Check for missing analyses. - for record in analyses: - submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished') - - if submission: - num_new_resources = 0 - - for analysis in analyses[record]: - _resource_url = endpoints[analysis_endpoint]["url_template"].format(analysis) - - if not is_resource_added_to_submission(submission.publication_recid, submission.version, - _resource_url): - - log.info('Adding {} analysis to ins{} with URL {}'.format( - analysis_endpoint, record, _resource_url) - ) - new_resource = DataResource( - file_location=_resource_url, - file_type=analysis_endpoint) - - if "description" in endpoints[analysis_endpoint]: - new_resource.file_description = str(endpoints[analysis_endpoint]["description"]) - - if "license" in endpoints[analysis_endpoint]: - resource_license = get_license(endpoints[analysis_endpoint]["license"]) - new_resource.file_license = resource_license.id - - submission.resources.append(new_resource) - num_new_resources += 1 - - else: - - # Remove resources from 'analysis_resources' list. - resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources)) - for resource in resources: - analysis_resources.remove(resource) - - if num_new_resources: - - try: - db.session.add(submission) - db.session.commit() - latest_submission = get_latest_hepsubmission(inspire_id=record) - if submission.version == latest_submission.version: - index_record_ids([submission.publication_recid]) - except Exception as e: - db.session.rollback() - log.error(e) - - else: - log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format( - analysis_endpoint, record)) + r_json = response.json() + try: + jsonschema.validate(instance=r_json, schema=analyses_schema) + new_json = True + except jsonschema.ValidationError: + new_json = False + + if new_json: + + # Check for missing analyses. + for ana in r_json["analyses"]: + inspire_id = ana["inspire_id"] + submission = get_latest_hepsubmission(inspire_id=str(inspire_id), overall_status='finished') # TODO: make inspire_id an int + + if submission: + num_new_resources = 0 + + for implementation in ana["implementations"]: + ana_name = implementation["name"] + ana_path = implementation["path"] if "path" in implementation else "" + _resource_url = r_json["url_templates"]["main_url"] + prev_url = None + n_tries, max_tries = 0, 10 + while _resource_url!=prev_url and n_tries