Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions fixes/add_analyses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import click
import logging

from celery import shared_task
from flask import current_app
from flask.cli import with_appcontext
from invenio_db import db

from hepdata.celery import dynamic_tasks
from hepdata.config import SIMPLEANALYSIS_FILE_TYPE, HS3_FILE_TYPE
from hepdata.cli import fix
from hepdata.ext.opensearch.api import reindex_batch
from hepdata.modules.submission.api import get_latest_hepsubmission
from hepdata.modules.submission.models import HEPSubmission
from hepdata.modules.records.utils.common import is_analysis

# Apply logging's default root configuration and create the module-level
# logger used by the command and task in this file.
logging.basicConfig()
log = logging.getLogger(__name__)

@fix.command()
@with_appcontext
@click.option('--analyses-type', '-a', type=str, help=f"e.g. '{SIMPLEANALYSIS_FILE_TYPE}' or '{HS3_FILE_TYPE}'.")
@click.option('--batch-size', '-b', type=click.IntRange(min=1), default=20,
              help='Number of hepsubmission entries to check at a time.')
@click.option('--synchronous', '-s', type=bool, default=False,
              help='Boolean value: if true, process batches in this process instead of sending them to celery.')
def add_analyses(analyses_type, batch_size, synchronous=False):
    """Check all submissions for resources with analyses_type in the description but not as the type.

    All HEPSubmission ids are fetched in ascending order and split into
    consecutive batches of at most ``batch_size`` ids. Each batch is either
    processed directly via ``_add_analyses_batch`` (``synchronous`` true) or
    queued through ``dynamic_tasks.delay``.

    :param analyses_type: must be SIMPLEANALYSIS_FILE_TYPE or HS3_FILE_TYPE;
        any other value logs an error and returns without doing anything.
    :param batch_size: number of submission ids per batch (must be >= 1).
    :param synchronous: run batches in-process rather than via celery.
    """
    if analyses_type not in (SIMPLEANALYSIS_FILE_TYPE, HS3_FILE_TYPE):
        log.error(f"analyses-type must be '{SIMPLEANALYSIS_FILE_TYPE}' or '{HS3_FILE_TYPE}'")
        return

    # Each row is a 1-tuple, so unpack to a flat list of ids up front.
    all_ids = [row[0] for row in db.session.query(HEPSubmission.id).order_by(HEPSubmission.id).all()]

    # range() with a validated positive step always terminates; a zero or
    # negative batch size previously made the manual counter loop spin forever.
    for start in range(0, len(all_ids), batch_size):
        batch_ids = all_ids[start:start + batch_size]  # slicing clamps at the end of the list
        if synchronous:
            _add_analyses_batch(analyses_type, batch_ids)
        else:
            log.info(f'Sending batch of IDs {batch_ids[0]} to {batch_ids[-1]} to celery')
            dynamic_tasks.delay('_add_analyses_batch', 'add_analyses', analyses_type, batch_ids)


@shared_task
def _add_analyses_batch(analyses_type, ids):
    """Retype matching resources for one batch of submissions and reindex.

    For every submission id in ``ids`` that exists, each resource whose
    ``file_type`` differs from ``analyses_type`` but whose description
    matches according to ``is_analysis`` has its ``file_type`` set to
    ``analyses_type`` and is committed. Submissions that were changed and are
    the latest finished version of their publication are passed to
    ``reindex_batch``.

    :param analyses_type: file type to assign (e.g. the HS3 or SimpleAnalysis type).
    :param ids: iterable of HEPSubmission ids to check.
    """
    log.info(f"Checking for {analyses_type} resources in submission ids {ids}")
    submission_ids_to_reindex = set()  # a set avoids duplicates without a later dedupe pass

    for submission_id in ids:
        hepsubmission = HEPSubmission.query.get(submission_id)
        if not hepsubmission:
            continue

        updated = False
        for resource in hepsubmission.resources:
            if resource.file_type == analyses_type:
                continue
            # A resource without a description cannot match, and passing None
            # on to is_analysis would raise and abort the rest of the batch.
            if not resource.file_description:
                continue
            if not is_analysis(analyses_type, resource.file_description):
                continue

            log.info(f"Found {analyses_type} for resource {resource.file_location}")
            # Update resource to have type analyses_type
            resource.file_type = analyses_type
            db.session.add(resource)
            db.session.commit()
            updated = True

        if updated:
            # Check if this is the latest finished submission - reindex if so.
            # Done once per submission rather than once per updated resource.
            latest_submission = get_latest_hepsubmission(
                publication_recid=hepsubmission.publication_recid, overall_status='finished')
            if latest_submission and latest_submission.version == hepsubmission.version:
                submission_ids_to_reindex.add(hepsubmission.id)

    if submission_ids_to_reindex:
        recids_to_reindex = sorted(submission_ids_to_reindex)
        log.info(f"Reindexing records: {recids_to_reindex}")
        reindex_batch(recids_to_reindex, current_app.config['OPENSEARCH_INDEX'])
69 changes: 0 additions & 69 deletions fixes/add_histfactory_analyses.py

This file was deleted.

2 changes: 2 additions & 0 deletions hepdata/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,8 @@ def _(x):
}

HISTFACTORY_FILE_TYPE = 'HistFactory'
HS3_FILE_TYPE = 'HS3'
SIMPLEANALYSIS_FILE_TYPE = 'SimpleAnalysis'
NUISANCE_FILE_TYPE = 'ProSelecta'

ADMIN_EMAIL = 'info@hepdata.net'
Expand Down
7 changes: 4 additions & 3 deletions hepdata/ext/opensearch/document_enhancers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
from dateutil.parser import parse
from flask import current_app

from hepdata.config import CFG_PUB_TYPE, CFG_DATA_TYPE, HISTFACTORY_FILE_TYPE, NUISANCE_FILE_TYPE
from hepdata.config import (CFG_PUB_TYPE, CFG_DATA_TYPE, HISTFACTORY_FILE_TYPE,
HS3_FILE_TYPE, SIMPLEANALYSIS_FILE_TYPE, NUISANCE_FILE_TYPE)
from hepdata.ext.opensearch.config.record_mapping import mapping as os_mapping
from hepdata.modules.permissions.models import SubmissionParticipant
from hepdata.modules.submission.api import get_latest_hepsubmission
Expand Down Expand Up @@ -104,12 +105,12 @@ def add_analyses(doc):
if latest_submission:
doc["analyses"] = []
for reference in latest_submission.resources:
if reference.file_type in current_app.config['ANALYSES_ENDPOINTS']:
if reference.file_type in current_app.config['ANALYSES_ENDPOINTS'] and reference.file_location.lower().startswith('http'):
doc["analyses"].append({'type': reference.file_type, 'analysis': reference.file_location})
else:
site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')
landing_page_url = f"{site_url}/record/resource/{reference.id}?landing_page=true"
if reference.file_type == HISTFACTORY_FILE_TYPE:
if reference.file_type in (HISTFACTORY_FILE_TYPE, HS3_FILE_TYPE, SIMPLEANALYSIS_FILE_TYPE):
doc["analyses"].append({'type': reference.file_type, 'analysis': landing_page_url,
'filename': os.path.basename(reference.file_location)})
elif reference.file_type == NUISANCE_FILE_TYPE:
Expand Down
2 changes: 2 additions & 0 deletions hepdata/modules/records/utils/analyses.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ def update_analyses(endpoint=None):
try:
recids_to_reindex = []
for extra_analysis_resource in analysis_resources:
if not extra_analysis_resource.file_location.lower().startswith('http'):
continue # don't delete local files from database
query = db.select([data_reference_link.columns.submission_id]).where(
data_reference_link.columns.dataresource_id == extra_analysis_resource.id)
results = db.session.execute(query)
Expand Down
25 changes: 11 additions & 14 deletions hepdata/modules/records/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
import os
from sqlalchemy.orm.exc import NoResultFound

from hepdata.config import HISTFACTORY_FILE_TYPE, NUISANCE_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD
from hepdata.config import (HISTFACTORY_FILE_TYPE, HS3_FILE_TYPE, SIMPLEANALYSIS_FILE_TYPE,
NUISANCE_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD)
from hepdata.ext.opensearch.api import get_record
from hepdata.modules.submission.models import HEPSubmission, License, DataSubmission, DataResource

Expand Down Expand Up @@ -74,9 +75,6 @@

ALLOWED_EXTENSIONS = ('.zip', '.tar', '.tar.gz', '.tgz', '.oldhepdata', '.yaml', '.yaml.gz')

HISTFACTORY_EXTENSIONS = ALLOWED_EXTENSIONS[:4] + ('.tar.xz', '.json')
HISTFACTORY_TERMS = ("histfactory", "pyhf", "likelihoods", "workspaces")


def contains_accepted_url(file):
for pattern in URL_PATTERNS:
Expand All @@ -96,17 +94,12 @@ def is_image(filename):
return False


def is_histfactory(filename, description, type=None):
if type and type.lower() == HISTFACTORY_FILE_TYPE.lower():
def is_analysis(analyses_type, description, type=None):
    """Return whether a resource counts as an analysis of ``analyses_type``.

    The match is case-insensitive and succeeds when either the explicit
    ``type`` equals ``analyses_type`` or ``analyses_type`` occurs anywhere in
    ``description``.

    :param analyses_type: analysis file type to look for (e.g. 'HS3').
    :param description: free-text resource description; may be None or empty.
    :param type: optional explicit file type given for the resource.
    :return: True if the resource matches, False otherwise.
    """
    analyses_type_lc = analyses_type.lower()

    if type and type.lower() == analyses_type_lc:
        return True

    # A missing or empty description can never mention the analysis type;
    # guarding here also avoids calling .lower() on None.
    if not description:
        return False

    return analyses_type_lc in description.lower()
Comment thread
GraemeWatt marked this conversation as resolved.


def infer_file_type(file, description, type=None):
Expand All @@ -115,7 +108,11 @@ def infer_file_type(file, description, type=None):
if result:
return pattern
else:
if is_histfactory(file, description, type):
if is_analysis(SIMPLEANALYSIS_FILE_TYPE, description, type):
return SIMPLEANALYSIS_FILE_TYPE
elif is_analysis(HS3_FILE_TYPE, description, type):
return HS3_FILE_TYPE
elif type and type.lower() == HISTFACTORY_FILE_TYPE.lower():
return HISTFACTORY_FILE_TYPE
elif type and type.lower() == NUISANCE_FILE_TYPE.lower():
return NUISANCE_FILE_TYPE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,20 @@ <h4>Other useful searches</h4>
(likelihoods in HistFactory format)
</span>
</li>
<li>
<a href='/search?q=analysis:HS3&sort_by=latest'
target="_new">analysis:HS3</a>
<span class="text-muted">
(likelihoods in HS3 format)
</span>
</li>
<li>
<a href='/search?q=analysis:SimpleAnalysis&sort_by=latest'
target="_new">analysis:SimpleAnalysis</a>
<span class="text-muted">
(code snippets in SimpleAnalysis format)
</span>
</li>
<li>
<a href='/search?q=analysis:NUISANCE&sort_by=latest'
target="_new">analysis:NUISANCE</a>
Expand Down
2 changes: 1 addition & 1 deletion hepdata/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@
and parsed by ``setup.py``.
"""

__version__ = "0.9.4dev20251013"
__version__ = "0.9.4dev20251015"
31 changes: 16 additions & 15 deletions tests/search_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def test_search(app, load_default_data, identifiers):

# Test searching of the resources field by type.
# A bunch of different types to be checked for
resource_types = ['png', 'html', 'zenodo', 'dat', 'C++', None]
resource_types = ['png', 'html', 'zenodo', 'dat', 'SimpleAnalysis', None]
for res_type in resource_types:
# Execute search for the current type
results = os_api.search(f'resources.type:{res_type}', index=index)
Expand Down Expand Up @@ -774,8 +774,9 @@ def test_add_analyses(app):
"filename": "test.tar.gz"
},
]
# This should probably be changed to use SITE_URL or some similar concept
analysis_url = "http://localhost:5000/record/resource/%s?landing_page=true"

site_url = app.config.get('SITE_URL', 'http://localhost:5000')
analysis_url = site_url + "/record/resource/%s?landing_page=true"

with app.app_context():
# Creating and submitting the test submission containing resources
Expand All @@ -802,7 +803,7 @@ def test_add_analyses(app):

# Add MadAnalysis DataResource object separately
mad_analysis_resource = DataResource(
file_location = "placeholder",
file_location = "https://placeholder",
file_type = "MadAnalysis",
file_description = "placeholder"
)
Expand Down Expand Up @@ -1072,36 +1073,36 @@ def test_reindex_batch_large_submission(app, mocker):
# Mock methods called so we can check they're called with correct parameters
mock_index_record_ids = mocker.patch('hepdata.ext.opensearch.api.index_record_ids')
mock_push_data_keywords = mocker.patch('hepdata.ext.opensearch.api.push_data_keywords')

# Mock database query to return a large number of records (250 total)
mock_db_result = [(1, i) for i in range(2, 252)] # pub_recid=1, data_recids=2-251
mocker.patch('hepdata.ext.opensearch.api.db.session.query').return_value.join.return_value.filter.return_value.all.return_value = mock_db_result

# Set up return values for batched calls
mock_index_record_ids.return_value = {'publication': [1], 'datatable': []}

# Call reindex_batch with a mock submission ID
os_api.reindex_batch([999], index)
# Should be called 3 times: 100 records, 100 records, 51 records

# Should be called 3 times: 100 records, 100 records, 51 records
assert mock_index_record_ids.call_count == 3

# Check the call arguments for batching
calls = mock_index_record_ids.call_args_list

# First batch: 100 records (1 + first 99 from 2-100)
first_batch = calls[0][0][0] # First positional argument of first call
assert len(first_batch) == 100
assert 1 in first_batch # publication record

# Second batch: 100 records (101-200)
second_batch = calls[1][0][0]
assert len(second_batch) == 100
# Third batch: 51 records (201-251)

# Third batch: 51 records (201-251)
third_batch = calls[2][0][0]
assert len(third_batch) == 51

# push_data_keywords should be called once at the end
mock_push_data_keywords.assert_called_once_with(pub_ids=[1, 1, 1]) # Called with accumulated publication IDs

Expand Down
Loading