Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 146 additions & 58 deletions hepdata/modules/records/utils/analyses.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@
# as an Intergovernmental Organization or submit itself to any jurisdiction.

import logging
import os

from celery import shared_task
from flask import current_app
from invenio_db import db
import requests
import json
import jsonschema

from hepdata.ext.opensearch.api import index_record_ids
from hepdata.modules.submission.api import get_latest_hepsubmission, is_resource_added_to_submission
Expand All @@ -40,6 +43,10 @@
logging.basicConfig()
log = logging.getLogger(__name__)

def get_analyses_schema():
    """
    Load the JSON schema used to validate the analyses JSON files.

    The schema is read from ``hepdata/templates/analyses_schema.json``. The path
    is relative, so it is resolved against the current working directory of the
    running process.

    :return: the parsed content of the schema file
    :raises OSError: if the schema file cannot be opened
    :raises json.JSONDecodeError: if the schema file does not contain valid JSON
    """
    schema_path = os.path.join("hepdata", "templates", "analyses_schema.json")
    # JSON text is UTF-8 by specification, so do not rely on the locale's
    # default encoding when reading the schema.
    with open(schema_path, encoding="utf-8") as f:
        return json.load(f)

@shared_task
def update_analyses(endpoint=None):
Expand All @@ -50,6 +57,8 @@ def update_analyses(endpoint=None):

:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "CheckMATE" or "HackAnalysis" or "Combine" or None (default) for all
"""
analyses_schema = get_analyses_schema()

endpoints = current_app.config["ANALYSES_ENDPOINTS"]
for analysis_endpoint in endpoints:

Expand All @@ -64,62 +73,133 @@ def update_analyses(endpoint=None):

if response and response.status_code == 200:

analyses = response.json()

analysis_resources = DataResource.query.filter_by(file_type=analysis_endpoint).all()

# Check for missing analyses.
for record in analyses:
submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')

if submission:
num_new_resources = 0

for analysis in analyses[record]:
_resource_url = endpoints[analysis_endpoint]["url_template"].format(analysis)

if not is_resource_added_to_submission(submission.publication_recid, submission.version,
_resource_url):

log.info('Adding {} analysis to ins{} with URL {}'.format(
analysis_endpoint, record, _resource_url)
)
new_resource = DataResource(
file_location=_resource_url,
file_type=analysis_endpoint)

if "description" in endpoints[analysis_endpoint]:
new_resource.file_description = str(endpoints[analysis_endpoint]["description"])

if "license" in endpoints[analysis_endpoint]:
resource_license = get_license(endpoints[analysis_endpoint]["license"])
new_resource.file_license = resource_license.id

submission.resources.append(new_resource)
num_new_resources += 1

else:

# Remove resources from 'analysis_resources' list.
resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources))
for resource in resources:
analysis_resources.remove(resource)

if num_new_resources:

try:
db.session.add(submission)
db.session.commit()
latest_submission = get_latest_hepsubmission(inspire_id=record)
if submission.version == latest_submission.version:
index_record_ids([submission.publication_recid])
except Exception as e:
db.session.rollback()
log.error(e)

else:
log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format(
analysis_endpoint, record))
r_json = response.json()
try:
jsonschema.validate(instance=r_json, schema=analyses_schema)
new_json = True
except jsonschema.ValidationError:
new_json = False

if new_json:
Comment thread
GraemeWatt marked this conversation as resolved.

# Check for missing analyses.
for ana in r_json["analyses"]:
inspire_id = ana["inspire_id"]
submission = get_latest_hepsubmission(inspire_id=str(inspire_id), overall_status='finished') # TODO: make inspire_id an int

if submission:
num_new_resources = 0

for implementation in ana["implementations"]:
ana_name = implementation["name"]
ana_path = implementation["path"] if "path" in implementation else ""
_resource_url = r_json["url_templates"]["main_url"]
prev_url = None
n_tries, max_tries = 0, 10
while _resource_url!=prev_url and n_tries<max_tries:
prev_url = _resource_url
_resource_url = _resource_url.format(name=ana_name, path=ana_path)
n_tries += 1
Comment thread
GraemeWatt marked this conversation as resolved.

if not is_resource_added_to_submission(submission.publication_recid, submission.version,
_resource_url):

log.info('Adding {} analysis to ins{} with URL {}'.format(
analysis_endpoint, inspire_id, _resource_url)
)
new_resource = DataResource(
file_location=_resource_url,
file_type=analysis_endpoint,
file_description=r_json["implementations_description"]
)

if "license" in r_json:
resource_license = get_license(r_json["license"])
new_resource.file_license = resource_license.id

submission.resources.append(new_resource)
num_new_resources += 1

else:

# Remove resources from 'analysis_resources' list.
resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources))
for resource in resources:
analysis_resources.remove(resource)

if num_new_resources:

try:
db.session.add(submission)
db.session.commit()
latest_submission = get_latest_hepsubmission(inspire_id=inspire_id)
if submission.version == latest_submission.version:
index_record_ids([submission.publication_recid])
except Exception as e:
db.session.rollback()
log.error(e)

else:
log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format(
analysis_endpoint, inspire_id))

else: # old JSON file
analyses = r_json

# Check for missing analyses.
for record in analyses:
submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')

if submission:
num_new_resources = 0

for analysis in analyses[record]:
_resource_url = endpoints[analysis_endpoint]["url_template"].format(analysis)

if not is_resource_added_to_submission(submission.publication_recid, submission.version,
_resource_url):

log.info('Adding {} analysis to ins{} with URL {}'.format(
analysis_endpoint, record, _resource_url)
)
new_resource = DataResource(
file_location=_resource_url,
file_type=analysis_endpoint)

if "description" in endpoints[analysis_endpoint]:
new_resource.file_description = str(endpoints[analysis_endpoint]["description"])

if "license" in endpoints[analysis_endpoint]:
resource_license = get_license(endpoints[analysis_endpoint]["license"])
new_resource.file_license = resource_license.id

submission.resources.append(new_resource)
num_new_resources += 1

else:

# Remove resources from 'analysis_resources' list.
resources = list(filter(lambda a: a.file_location == _resource_url, analysis_resources))
for resource in resources:
analysis_resources.remove(resource)

if num_new_resources:

try:
db.session.add(submission)
db.session.commit()
latest_submission = get_latest_hepsubmission(inspire_id=record)
if submission.version == latest_submission.version:
index_record_ids([submission.publication_recid])
except Exception as e:
db.session.rollback()
log.error(e)

else:
log.debug("An analysis is available in {0} but with no equivalent in HEPData (ins{1}).".format(
analysis_endpoint, record))

if analysis_resources:
# Extra resources that were not found in the analyses JSON file.
Expand Down Expand Up @@ -154,10 +234,18 @@ def update_analyses(endpoint=None):
if "subscribe_user_id" in endpoints[analysis_endpoint]:
user = get_user_from_id(endpoints[analysis_endpoint]["subscribe_user_id"])
if user:
for record in analyses:
submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')
if submission and not is_current_user_subscribed_to_record(submission.publication_recid, user):
subscribe(submission.publication_recid, user)
# Check for missing analyses.
if new_json:
for ana in r_json["analyses"]:
submission = get_latest_hepsubmission(inspire_id=str(ana["inspire_id"]), overall_status='finished')
if submission and not is_current_user_subscribed_to_record(submission.publication_recid, user):
subscribe(submission.publication_recid, user)

else: # old JSON file
for record in analyses:
submission = get_latest_hepsubmission(inspire_id=record, overall_status='finished')
if submission and not is_current_user_subscribed_to_record(submission.publication_recid, user):
subscribe(submission.publication_recid, user)

else:
log.debug("No endpoint url configured for {0}".format(analysis_endpoint))
141 changes: 141 additions & 0 deletions hepdata/templates/analyses_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://hepdata.net/analyses/schemas/1.0.0/analyses_schema.json",
"title": "HEPData analysis tool schema",
"description": "A JSON schema for tracking implementations of HEPData analyses in different tools",
"type": "object",
"required": ["schema_version", "tool", "version", "date_created", "implementations_description", "url_templates", "analyses"],

"properties": {
"schema_version": {
"description": "The version of the JSON schema applying to this file",
"const": "1.0.0"
},

"tool": {
"description": "The name of the tool used to implement the analyses",
"type": "string"
},

"version": {
"description": "The version of the tool used to implement the analyses",
"type": "string"
},

"date_created": {
"description": "The date at which the JSON file was created, formatted as RFC 3339, section 5.6 (https://json-schema.org/understanding-json-schema/reference/type#dates-and-times), e.g. 2018-11-13T20:20:39+00:00",
"type": "string",
"format": "date-time"
},

"implementations_description": {
"description": "The type of information provided for the analyses by the tool",
"type": "string"
},

"url_templates": {
"description": "Templates for URLs to the main repository and important other pages",
"type": "object",
"required": ["main_url"],

"properties": {
"main_url": {
"description": "The URL template for the main repository. Should contain e.g. a {name} placeholder for the analysis name.",
"type": "string"
},
"val_url": {
"description": "The URL template for the validation page. Should contain e.g. a {name} placeholder for the analysis name.",
"type": "string"
}
}
},

"analyses": {
"description": "The analyses implemented in the tool",
"type": "array",
"items": {
"type": "object",
"$ref": "#/$defs/Analysis",
"minItems": 1,
"uniqueItems": true
}
},

"implementations_license": {
"description": "The license for the implementations of the analyses in the tool. Taken to be CC0 if not specified.",
"type": "object",
"required": ["name", "url"],
"additionalProperties": false,

"properties": {
"name": {
"description": "The name of the license",
"type": "string",
"maxLength": 256
},
"url": {
"description": "The URL to the license",
"type": "string",
"maxLength": 256
},
"description": {
"description": "A description of the license",
"type": "string"
}
}
}
},

"$defs": {

"Analysis": {
"description": "An analysis, identified by the INSPIRE ID, implemented at least once in a tool",
"type": "object",
"required": ["inspire_id", "implementations"],

"properties": {
"inspire_id": {
"description": "The INSPIRE ID of the analysis",
"type": "number"
},
"implementations":{
"description": "The implementations of the analysis in the tool",
"type": "array",
"items": {
"type": "object",
"$ref": "#/$defs/Implementation",
"minItems": 1,
"uniqueItems": true
}
},
"signature_type": {
"description": "The signature of the analysis, e.g. 'prompt', 'displaced'",
"type": "string"
},
"pretty_name": {
"description": "A pretty name for the analysis",
"type": "string"
}
}
},

"Implementation": {
"description": "An implementation of an analysis in a tool, giving the internal name to retrieve information",
"type": "object",
"required": ["name"],

"properties": {
"name": {
"description": "Internal name of the implementation",
"type": "string"
},
"path": {
"description": "The path to the implementation in the tool",
"type": "string"
}
}
}

}

}
Loading
Loading