diff --git a/fixes/add_analyses.py b/fixes/add_analyses.py
new file mode 100644
index 000000000..cc59a3217
--- /dev/null
+++ b/fixes/add_analyses.py
@@ -0,0 +1,84 @@
+import click
+import logging
+
+from celery import shared_task
+from flask import current_app
+from flask.cli import with_appcontext
+from invenio_db import db
+
+from hepdata.celery import dynamic_tasks
+from hepdata.config import SIMPLEANALYSIS_FILE_TYPE, HS3_FILE_TYPE
+from hepdata.cli import fix
+from hepdata.ext.opensearch.api import reindex_batch
+from hepdata.modules.submission.api import get_latest_hepsubmission
+from hepdata.modules.submission.models import HEPSubmission
+from hepdata.modules.records.utils.common import is_analysis
+
+logging.basicConfig()
+log = logging.getLogger(__name__)
+
+@fix.command()
+@with_appcontext
+@click.option('--analyses-type', '-a', type=str, help=f"e.g. '{SIMPLEANALYSIS_FILE_TYPE}' or '{HS3_FILE_TYPE}'.")
+@click.option('--batch-size', '-b', type=click.IntRange(min=1), default=20,
+              help='Number of hepsubmission entries to check at a time.')
+@click.option('--synchronous', '-s', type=bool, default=False)
+def add_analyses(analyses_type, batch_size, synchronous=False):
+    """Check all submissions for resources with analyses_type in the description but not as the type.
+
+    Submission IDs are processed in batches of ``batch_size``: each batch is run
+    inline when ``synchronous`` is true, otherwise it is sent to celery.
+    """
+
+    if analyses_type not in (SIMPLEANALYSIS_FILE_TYPE, HS3_FILE_TYPE):
+        log.error(f"analyses-type must be '{SIMPLEANALYSIS_FILE_TYPE}' or '{HS3_FILE_TYPE}'")
+        return
+
+    all_ids = [row[0] for row in db.session.query(HEPSubmission.id).order_by(HEPSubmission.id).all()]
+
+    # click.IntRange(min=1) guarantees a positive step, so this loop always terminates.
+    for start in range(0, len(all_ids), batch_size):
+        batch_ids = all_ids[start:start + batch_size]
+        if synchronous:
+            _add_analyses_batch(analyses_type, batch_ids)
+        else:
+            log.info(f'Sending batch of IDs {batch_ids[0]} to {batch_ids[-1]} to celery')
+            dynamic_tasks.delay('_add_analyses_batch', 'add_analyses', analyses_type, batch_ids)
+
+
+@shared_task
+def _add_analyses_batch(analyses_type, ids):
+    """Retype resources of the given submissions whose description matches analyses_type.
+
+    A resource is updated when its file_type differs from ``analyses_type`` and
+    ``is_analysis`` accepts its description. Submissions that are the latest
+    finished version of their record are reindexed once all ids are processed.
+    """
+    log.info(f"Checking for {analyses_type} resources in submission ids {ids}")
+    recids_to_reindex = set()  # set: a submission with several matches is reindexed once
+    for submission_id in ids:
+        hepsubmission = HEPSubmission.query.get(submission_id)
+        if not hepsubmission:
+            continue
+
+        for resource in hepsubmission.resources:
+            # is_analysis() lowercases the description, so never hand it None
+            description = resource.file_description or ''
+            if resource.file_type == analyses_type or not is_analysis(analyses_type, description):
+                continue
+
+            log.info(f"Found {analyses_type} for resource {resource.file_location}")
+            # Update resource to have type analyses_type
+            resource.file_type = analyses_type
+            db.session.add(resource)
+            db.session.commit()
+
+            # Check if this is the latest finished submission - reindex if so
+            latest_submission = get_latest_hepsubmission(publication_recid=hepsubmission.publication_recid, overall_status='finished')
+            if latest_submission and latest_submission.version == hepsubmission.version:
+                recids_to_reindex.add(hepsubmission.id)
+
+    if recids_to_reindex:
+        recids_to_reindex = sorted(recids_to_reindex)  # deterministic order for the log and the indexer
+        log.info(f"Reindexing records: {recids_to_reindex}")
+        reindex_batch(recids_to_reindex, current_app.config['OPENSEARCH_INDEX'])
diff --git a/fixes/add_histfactory_analyses.py b/fixes/add_histfactory_analyses.py
deleted file mode 100644
index 06c88b4e7..000000000
--- a/fixes/add_histfactory_analyses.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import click
-import logging
-
-from celery import shared_task
-from flask import current_app
-from flask.cli import with_appcontext
-from invenio_db import db
-
-from hepdata.celery import dynamic_tasks
-from hepdata.config import HISTFACTORY_FILE_TYPE
-from hepdata.cli import fix
-from hepdata.ext.opensearch.api import reindex_batch
-from hepdata.modules.records.utils.common import is_histfactory
-from hepdata.modules.submission.api import get_latest_hepsubmission
-from hepdata.modules.submission.models import HEPSubmission
-from hepdata.modules.records.utils.doi_minter import create_resource_doi
-
-logging.basicConfig()
-log = logging.getLogger(__name__)
-
-@fix.command()
-@with_appcontext -@click.option('--batch-size', '-b', type=int, default=20, - help='Number of hepsubmission entries to check at a time.') -@click.option('--synchronous', '-s', type=bool, default=False) -def add_histfactory_analyses(batch_size, synchronous=False): - all_ids = db.session.query(HEPSubmission.id).order_by(HEPSubmission.id).all() - - count = 0 - total = len(all_ids) - while count < total: - batch_ids = [i[0] for i in all_ids[count:min(count + batch_size, total)]] - if synchronous: - _add_histfactory_analyses_batch(batch_ids) - else: - log.info('Sending batch of IDs {0} to {1} to celery'.format(batch_ids[0], batch_ids[-1])) - dynamic_tasks.delay('_add_histfactory_analyses_batch', 'add_histfactory_analyses', batch_ids) - count += batch_size - - -@shared_task -def _add_histfactory_analyses_batch(ids): - log.info(f"Checking for HistFactory resources in submission ids {ids}") - recids_to_reindex = [] - for id in ids: - hepsubmission = HEPSubmission.query.get(id) - - if hepsubmission: - for resource in hepsubmission.resources: - if resource.file_type != HISTFACTORY_FILE_TYPE and \ - is_histfactory(resource.file_location, resource.file_description): - log.info(f"Found histfactory for resource {resource.file_location}") - # Update resource to have type histfactory - resource.file_type = HISTFACTORY_FILE_TYPE - db.session.add(resource) - db.session.commit() - - # Check if this is the latest finished submission - reindex if so - latest_submission = get_latest_hepsubmission(publication_recid=hepsubmission.publication_recid, overall_status='finished') - if latest_submission and latest_submission.version == hepsubmission.version: - recids_to_reindex.append(hepsubmission.id) - - if hepsubmission.overall_status == 'finished': - site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net') - create_resource_doi.delay(hepsubmission.id, resource.id, site_url) - - if recids_to_reindex: - log.info(f"Reindexing records: {recids_to_reindex}") - 
reindex_batch(recids_to_reindex, current_app.config['OPENSEARCH_INDEX']) diff --git a/hepdata/config.py b/hepdata/config.py index 426e73e34..853d761bd 100644 --- a/hepdata/config.py +++ b/hepdata/config.py @@ -369,6 +369,8 @@ def _(x): } HISTFACTORY_FILE_TYPE = 'HistFactory' +HS3_FILE_TYPE = 'HS3' +SIMPLEANALYSIS_FILE_TYPE = 'SimpleAnalysis' NUISANCE_FILE_TYPE = 'ProSelecta' ADMIN_EMAIL = 'info@hepdata.net' diff --git a/hepdata/ext/opensearch/document_enhancers.py b/hepdata/ext/opensearch/document_enhancers.py index fbba8ae90..238b497ed 100644 --- a/hepdata/ext/opensearch/document_enhancers.py +++ b/hepdata/ext/opensearch/document_enhancers.py @@ -30,7 +30,8 @@ from dateutil.parser import parse from flask import current_app -from hepdata.config import CFG_PUB_TYPE, CFG_DATA_TYPE, HISTFACTORY_FILE_TYPE, NUISANCE_FILE_TYPE +from hepdata.config import (CFG_PUB_TYPE, CFG_DATA_TYPE, HISTFACTORY_FILE_TYPE, + HS3_FILE_TYPE, SIMPLEANALYSIS_FILE_TYPE, NUISANCE_FILE_TYPE) from hepdata.ext.opensearch.config.record_mapping import mapping as os_mapping from hepdata.modules.permissions.models import SubmissionParticipant from hepdata.modules.submission.api import get_latest_hepsubmission @@ -104,12 +105,12 @@ def add_analyses(doc): if latest_submission: doc["analyses"] = [] for reference in latest_submission.resources: - if reference.file_type in current_app.config['ANALYSES_ENDPOINTS']: + if reference.file_type in current_app.config['ANALYSES_ENDPOINTS'] and reference.file_location.lower().startswith('http'): doc["analyses"].append({'type': reference.file_type, 'analysis': reference.file_location}) else: site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net') landing_page_url = f"{site_url}/record/resource/{reference.id}?landing_page=true" - if reference.file_type == HISTFACTORY_FILE_TYPE: + if reference.file_type in (HISTFACTORY_FILE_TYPE, HS3_FILE_TYPE, SIMPLEANALYSIS_FILE_TYPE): doc["analyses"].append({'type': reference.file_type, 'analysis': 
landing_page_url, 'filename': os.path.basename(reference.file_location)}) elif reference.file_type == NUISANCE_FILE_TYPE: diff --git a/hepdata/modules/records/utils/analyses.py b/hepdata/modules/records/utils/analyses.py index 409e489e7..802b6434e 100644 --- a/hepdata/modules/records/utils/analyses.py +++ b/hepdata/modules/records/utils/analyses.py @@ -205,6 +205,8 @@ def update_analyses(endpoint=None): try: recids_to_reindex = [] for extra_analysis_resource in analysis_resources: + if not extra_analysis_resource.file_location.lower().startswith('http'): + continue # don't delete local files from database query = db.select([data_reference_link.columns.submission_id]).where( data_reference_link.columns.dataresource_id == extra_analysis_resource.id) results = db.session.execute(query) diff --git a/hepdata/modules/records/utils/common.py b/hepdata/modules/records/utils/common.py index fdd0b1d33..16cb34eaf 100644 --- a/hepdata/modules/records/utils/common.py +++ b/hepdata/modules/records/utils/common.py @@ -30,7 +30,8 @@ import os from sqlalchemy.orm.exc import NoResultFound -from hepdata.config import HISTFACTORY_FILE_TYPE, NUISANCE_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD +from hepdata.config import (HISTFACTORY_FILE_TYPE, HS3_FILE_TYPE, SIMPLEANALYSIS_FILE_TYPE, + NUISANCE_FILE_TYPE, SIZE_LOAD_CHECK_THRESHOLD) from hepdata.ext.opensearch.api import get_record from hepdata.modules.submission.models import HEPSubmission, License, DataSubmission, DataResource @@ -74,9 +75,6 @@ ALLOWED_EXTENSIONS = ('.zip', '.tar', '.tar.gz', '.tgz', '.oldhepdata', '.yaml', '.yaml.gz') -HISTFACTORY_EXTENSIONS = ALLOWED_EXTENSIONS[:4] + ('.tar.xz', '.json') -HISTFACTORY_TERMS = ("histfactory", "pyhf", "likelihoods", "workspaces") - def contains_accepted_url(file): for pattern in URL_PATTERNS: @@ -96,17 +94,12 @@ def is_image(filename): return False -def is_histfactory(filename, description, type=None): - if type and type.lower() == HISTFACTORY_FILE_TYPE.lower(): +def 
is_analysis(analyses_type, description, type=None): + if type and type.lower() == analyses_type.lower(): return True - if filename.endswith(HISTFACTORY_EXTENSIONS): - description_lc = description.lower() - for term in HISTFACTORY_TERMS: - if term in description_lc: - return True - - return False + description_lc = description.lower() + return True if analyses_type.lower() in description_lc else False def infer_file_type(file, description, type=None): @@ -115,7 +108,11 @@ def infer_file_type(file, description, type=None): if result: return pattern else: - if is_histfactory(file, description, type): + if is_analysis(SIMPLEANALYSIS_FILE_TYPE, description, type): + return SIMPLEANALYSIS_FILE_TYPE + elif is_analysis(HS3_FILE_TYPE, description, type): + return HS3_FILE_TYPE + elif type and type.lower() == HISTFACTORY_FILE_TYPE.lower(): return HISTFACTORY_FILE_TYPE elif type and type.lower() == NUISANCE_FILE_TYPE.lower(): return NUISANCE_FILE_TYPE diff --git a/hepdata/modules/search/templates/hepdata_search/modals/search_help.html b/hepdata/modules/search/templates/hepdata_search/modals/search_help.html index 5f3a3b12f..f0c5e1069 100644 --- a/hepdata/modules/search/templates/hepdata_search/modals/search_help.html +++ b/hepdata/modules/search/templates/hepdata_search/modals/search_help.html @@ -277,6 +277,20 @@

Other useful searches

(likelihoods in HistFactory format) +
  • + analysis:HS3 + + (likelihoods in HS3 format) + +
  • +
  • + analysis:SimpleAnalysis + + (code snippets in SimpleAnalysis format) + +
  • analysis:NUISANCE diff --git a/hepdata/version.py b/hepdata/version.py index b1820f88e..e289c04fe 100644 --- a/hepdata/version.py +++ b/hepdata/version.py @@ -28,4 +28,4 @@ and parsed by ``setup.py``. """ -__version__ = "0.9.4dev20251013" +__version__ = "0.9.4dev20251015" diff --git a/tests/search_test.py b/tests/search_test.py index b3b0d6a5d..4293606d9 100644 --- a/tests/search_test.py +++ b/tests/search_test.py @@ -335,7 +335,7 @@ def test_search(app, load_default_data, identifiers): # Test searching of the resources field by type. # A bunch of different types to be checked for - resource_types = ['png', 'html', 'zenodo', 'dat', 'C++', None] + resource_types = ['png', 'html', 'zenodo', 'dat', 'SimpleAnalysis', None] for res_type in resource_types: # Execute search for the current type results = os_api.search(f'resources.type:{res_type}', index=index) @@ -774,8 +774,9 @@ def test_add_analyses(app): "filename": "test.tar.gz" }, ] - # This should probably be changed to use SITE_URL or some similar concept - analysis_url = "http://localhost:5000/record/resource/%s?landing_page=true" + + site_url = app.config.get('SITE_URL', 'http://localhost:5000') + analysis_url = site_url + "/record/resource/%s?landing_page=true" with app.app_context(): # Creating and submitting the test submission containing resources @@ -802,7 +803,7 @@ def test_add_analyses(app): # Add MadAnalysis DataResource object separately mad_analysis_resource = DataResource( - file_location = "placeholder", + file_location = "https://placeholder", file_type = "MadAnalysis", file_description = "placeholder" ) @@ -1072,36 +1073,36 @@ def test_reindex_batch_large_submission(app, mocker): # Mock methods called so we can check they're called with correct parameters mock_index_record_ids = mocker.patch('hepdata.ext.opensearch.api.index_record_ids') mock_push_data_keywords = mocker.patch('hepdata.ext.opensearch.api.push_data_keywords') - + # Mock database query to return a large number of records (250 
total) mock_db_result = [(1, i) for i in range(2, 252)] # pub_recid=1, data_recids=2-251 mocker.patch('hepdata.ext.opensearch.api.db.session.query').return_value.join.return_value.filter.return_value.all.return_value = mock_db_result - + # Set up return values for batched calls mock_index_record_ids.return_value = {'publication': [1], 'datatable': []} - + # Call reindex_batch with a mock submission ID os_api.reindex_batch([999], index) - - # Should be called 3 times: 100 records, 100 records, 51 records + + # Should be called 3 times: 100 records, 100 records, 51 records assert mock_index_record_ids.call_count == 3 - + # Check the call arguments for batching calls = mock_index_record_ids.call_args_list - + # First batch: 100 records (1 + first 99 from 2-100) first_batch = calls[0][0][0] # First positional argument of first call assert len(first_batch) == 100 assert 1 in first_batch # publication record - + # Second batch: 100 records (101-200) second_batch = calls[1][0][0] assert len(second_batch) == 100 - - # Third batch: 51 records (201-251) + + # Third batch: 51 records (201-251) third_batch = calls[2][0][0] assert len(third_batch) == 51 - + # push_data_keywords should be called once at the end mock_push_data_keywords.assert_called_once_with(pub_ids=[1, 1, 1]) # Called with accumulated publication IDs diff --git a/tests/submission_test.py b/tests/submission_test.py index afe72134c..c0db6a381 100644 --- a/tests/submission_test.py +++ b/tests/submission_test.py @@ -49,7 +49,7 @@ process_saved_file, get_commit_message ) from hepdata.modules.records.utils.common import infer_file_type, contains_accepted_url, allowed_file, record_exists, \ - get_record_contents, is_histfactory, get_record_by_id + get_record_contents, is_analysis, get_record_by_id from hepdata.modules.records.utils.data_files import get_data_path_for_record from hepdata.modules.records.utils.submission import process_submission_directory, do_finalise, unload_submission, \ cleanup_data_related_recid @@ 
-94,20 +94,16 @@ def test_url_pattern(): assert (url_group["exp_result"] == url_type) -@pytest.mark.parametrize("filename,description,type,expected", +@pytest.mark.parametrize("analyses_type,description,type,expected", [ - ("pyhf.tar.gz", "PyHF", None, True), - ("pyhf.tgz", "File containing likelihoods", None, True), - ("pyhf.zip", "HistFactory JSON file", None, True), - ("test.zip", "Some sort of file", "HistFactory", True), - ("test.zip", "Some sort of file", "histfactory", True), - ("pyhf.tar.gz", "A file", None, False), - ("pyhf.json", "HistFactory JSON file", None, True), - ("test.zip", "Some sort of file", "json", False), + ("SimpleAnalysis", "SimpleAnalysis code snippet", None, True), + ("SimpleAnalysis", "ComplicatedAnalysis code snippet", None, False), + ("SimpleAnalysis", "Code snippet", "SimpleAnalysis", True), + ("SimpleAnalysis", "Code snippet", "ComplicatedAnalysis", False), ] ) -def test_is_histfactory(filename, description, type, expected): - assert is_histfactory(filename, description, type) == expected +def test_is_analysis(analyses_type, description, type, expected): + assert is_analysis(analyses_type, description, type) == expected @pytest.mark.parametrize("filename,description,type,expected", @@ -121,8 +117,11 @@ def test_is_histfactory(filename, description, type, expected): ("test.root", "", None, "ROOT"), ("test.docx", "", None, "docx"), ("test", "", None, "resource"), - ("pyhf.tgz", "File containing likelihoods", None, "HistFactory"), + ("pyhf.tgz", "File containing likelihoods", None, "tgz"), ("test.zip", "Some sort of file", "HistFactory", "HistFactory"), + ("test.zip", "Some sort of file", "HS3", "HS3"), + ("snippet.cxx", "SimpleAnalysis code snippet", None, "SimpleAnalysis"), + ("snippet.cxx", "Code snippet", "SimpleAnalysis", "SimpleAnalysis"), ("snippet.cxx", "ProSelecta analysis", "ProSelecta", "ProSelecta") ] ) @@ -919,7 +918,7 @@ def test_do_finalise_async_indexing(app, admin_idx, mocker): """ # Mock the reindex_batch.delay 
function mock_reindex_batch_delay = mocker.patch('hepdata.modules.records.utils.submission.reindex_batch.delay') - + with app.app_context(): admin_idx.recreate_index() # Create test submission/record @@ -944,6 +943,6 @@ def test_do_finalise_async_indexing(app, admin_idx, mocker): # Verify that reindex_batch.delay was called with correct parameters mock_reindex_batch_delay.assert_called_once_with( - [hepdata_submission.id], + [hepdata_submission.id], app.config['OPENSEARCH_INDEX'] )