diff --git a/doc/release-notes/8097-indexall-performance.md b/doc/release-notes/8097-indexall-performance.md new file mode 100644 index 00000000000..b027c21cbce --- /dev/null +++ b/doc/release-notes/8097-indexall-performance.md @@ -0,0 +1,6 @@ +### Indexing performance on datasets with large numbers of files + +We discovered that whenever a full reindexing needs to be performed, datasets with large numbers of files take an exceptionally long time to index (for example, in the IQSS repository it takes several hours for a dataset that has 25,000 files). In situations where the Solr index needs to be erased and rebuilt from scratch (such as a Solr version upgrade, or a corrupt index, etc.) this can significantly delay the repopulation of the search catalog. + +We are still investigating the reasons behind this performance issue. For now, even though some improvements have been made, a dataset with thousands of files is still going to take a long time to index. But we've made a simple change to the reindexing process, to index any such datasets at the very end of the batch, after all the datasets with fewer files have been reindexed. This does not improve the overall reindexing time, but will repopulate the bulk of the search index much faster for the users of the installation. + diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 288575d5462..8ebdc4745e6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -225,6 +225,51 @@ public List findAllOrSubset(long numPartitions, long partitionId, boolean return typedQuery.getResultList(); } + /** + * For docs, see the equivalent method on the DataverseServiceBean. 
+ * Orders results by the number of files owned, ascending, so that + * datasets with the most files are indexed last. + * @param skipIndexed if true, skip datasets that already have an index time + * @return a list of dataset ids + * @see DataverseServiceBean#findAllOrSubset(long, long, boolean) + */ + public List findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) { + /* + Disregards deleted or replaced files when determining 'size' of dataset. + Could possibly make more efficient by getting file metadata counts + of latest published/draft version. + Also disregards partitioning which is no longer supported. + SEK - 11/09/2021 + */ + + String skipClause = skipIndexed ? "AND o.indexTime is null " : ""; + Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " + + "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' " + + skipClause + + " group by o.id " + + "ORDER BY count(f.id) asc, o.id"); + + List queryResults; + queryResults = query.getResultList(); + + List retVal = new ArrayList(); + for (Object[] result : queryResults) { + Long dsId; + if (result[0] != null) { + try { + dsId = Long.parseLong(result[0].toString()) ; + } catch (Exception ex) { + dsId = null; + } + if (dsId == null) { + continue; + } + retVal.add(dsId); + } + } + return retVal; + } + /** * Merges the passed dataset to the persistence context. * @param ds the dataset whose new state we want to persist. 
diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index 54a6799016b..6262b6204f4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -532,7 +532,7 @@ public boolean contentEquals(FileMetadata other) { public boolean compareContent(FileMetadata other){ FileVersionDifference diffObj = new FileVersionDifference(this, other, false); - return diffObj.compareMetadata(this, other); + return diffObj.isSame(); } @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index 8c5a549f619..e0dea739edc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -10,7 +10,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; -import java.util.ResourceBundle; /** * @@ -21,6 +20,9 @@ public final class FileVersionDifference { private FileMetadata newFileMetadata; private FileMetadata originalFileMetadata; private boolean details = false; + private boolean same = false; + + private List differenceSummaryGroups = new ArrayList<>(); private List differenceDetailItems = new ArrayList<>(); @@ -37,7 +39,7 @@ public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata original this.originalFileMetadata = originalFileMetadata; this.details = details; - compareMetadata(newFileMetadata, originalFileMetadata); + this.same = compareMetadata(newFileMetadata, originalFileMetadata); //Compare versions - File Metadata first } @@ -50,7 +52,7 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin and it updates the FileVersionDifference object which is used to display the differences on the dataset versions tab. 
The return value is used by the index service bean tomark whether a file needs to be re-indexed in the context of a dataset update. When there are changes (after v4.19)to the file metadata data model this method must be updated. - retVal of True means metadatas are equal. + retVal of True means metadatas are equal. */ boolean retVal = true; @@ -68,6 +70,7 @@ When there are changes (after v4.19)to the file metadata data model this method if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null ){ //File Added + if (!details) return false; retVal = false; updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); } @@ -75,6 +78,7 @@ When there are changes (after v4.19)to the file metadata data model this method //Check to see if File replaced if (originalFileMetadata != null && newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null &&!this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())){ + if (!details) return false; updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1); retVal = false; } @@ -83,6 +87,8 @@ When there are changes (after v4.19)to the file metadata data model this method if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel())); + } else{ + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), 0, 1, 0, 0); @@ -97,6 +103,8 @@ When there are changes (after v4.19)to the file metadata data model this method && 
!newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), newFileMetadata.getDescription())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 1, 0, 0); @@ -107,6 +115,8 @@ When there are changes (after v4.19)to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), "", newFileMetadata.getDescription())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 1, 0, 0, 0); @@ -117,6 +127,8 @@ When there are changes (after v4.19)to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "" )); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0); @@ -130,6 +142,8 @@ When there are changes (after v4.19)to the file metadata data model this method && !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), 
newFileMetadata.getProvFreeForm())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 1, 0, 0); @@ -140,6 +154,8 @@ When there are changes (after v4.19)to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), "", newFileMetadata.getProvFreeForm())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 1, 0, 0, 0); @@ -150,6 +166,8 @@ When there are changes (after v4.19)to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "" )); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 0, 1, 0); @@ -170,7 +188,7 @@ When there are changes (after v4.19)to the file metadata data model this method } if (!value1.equals(value2)) { - + if (!details) return false; int added = 0; int deleted = 0; @@ -254,6 +272,14 @@ public void setOriginalFileMetadata(FileMetadata originalFileMetadata) { this.originalFileMetadata = originalFileMetadata; } + public boolean isSame() { + return same; + } + + public void setSame(boolean same) { + this.same = same; + } + public List getDifferenceSummaryGroups() { return differenceSummaryGroups; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java 
b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index 5171b1a864a..34c145fa6e8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -1,12 +1,15 @@ package edu.harvard.iq.dataverse.search; +import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.util.SystemConfig; import java.io.IOException; +import java.sql.Timestamp; import java.util.ArrayList; +import java.util.Date; import java.util.List; import java.util.concurrent.Future; import java.util.logging.Level; @@ -200,7 +203,7 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo int datasetIndexCount = 0; int datasetFailureCount = 0; - List datasetIds = datasetService.findAllOrSubset(numPartitions, partitionId, skipIndexed); + List datasetIds = datasetService.findAllOrSubsetOrderByFilesOwned(skipIndexed); for (Long id : datasetIds) { try { datasetIndexCount++; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 8e3968279aa..e4844156271 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -2,6 +2,7 @@ import edu.harvard.iq.dataverse.ControlledVocabularyValue; import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.DataFileServiceBean; import edu.harvard.iq.dataverse.DataFileTag; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; @@ -42,6 +43,7 @@ import java.util.Calendar; import java.util.Collection; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; 
import java.util.LinkedHashMap; import java.util.List; @@ -119,6 +121,8 @@ public class IndexServiceBean { SettingsServiceBean settingsService; @EJB SolrClientService solrClientService; + @EJB + DataFileServiceBean dataFileService; @EJB VariableServiceBean variableService; @@ -330,6 +334,15 @@ public Future indexDatasetInNewTransaction(Long datasetId) throws SolrS dataset = null; return ret; } + + @TransactionAttribute(REQUIRES_NEW) + public Future indexDatasetObjectInNewTransaction(Dataset dataset) throws SolrServerException, IOException{ //Dataset dataset) { + boolean doNormalSolrDocCleanUp = false; + // return indexDataset(dataset, doNormalSolrDocCleanUp); + Future ret = indexDataset(dataset, doNormalSolrDocCleanUp); + dataset = null; + return ret; + } @Asynchronous public Future asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) throws SolrServerException, IOException { @@ -626,16 +639,18 @@ public Future indexDataset(Dataset dataset, boolean doNormalSolrDocClean for (FileMetadata fm : latestVersion.getFileMetadatas()) { datafilesInDraftVersion.add(fm.getDataFile().getId()); } - String indexDraftResult = addOrUpdateDataset(indexableDraftVersion); - results.append("The latest version is a working copy (latestVersionState: ") - .append(latestVersionStateString).append(") and will be indexed as ") - .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n"); + desiredCards.put(DatasetVersion.VersionState.RELEASED, true); IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion); String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion, datafilesInDraftVersion); results.append("There is a published version we will attempt to index. 
Result: ").append(indexReleasedVersionResult).append("\n"); + String indexDraftResult = addOrUpdateDataset(indexableDraftVersion); + results.append("The latest version is a working copy (latestVersionState: ") + .append(latestVersionStateString).append(") and will be indexed as ") + .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n"); + desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); if (doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); @@ -712,7 +727,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d Dataset dataset = indexableDataset.getDatasetVersion().getDataset(); logger.fine("adding or updating Solr document for dataset id " + dataset.getId()); Collection docs = new ArrayList<>(); - SolrInputDocument solrInputDocument = new SolrInputDocument(); String datasetSolrDocId = indexableDataset.getSolrDocId(); solrInputDocument.addField(SearchFields.ID, datasetSolrDocId); @@ -936,9 +950,15 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d List filesIndexed = new ArrayList<>(); if (datasetVersion != null) { List fileMetadatas = datasetVersion.getFileMetadatas(); + List releasedFileMetadatas = new ArrayList<>(); + Map fileMap = new HashMap<>(); boolean checkForDuplicateMetadata = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { checkForDuplicateMetadata = true; + releasedFileMetadatas = dataset.getReleasedVersion().getFileMetadatas(); + for(FileMetadata released: releasedFileMetadatas){ + fileMap.put(released.getDataFile().getId(), released); + } logger.fine( "We are indexing a draft version of a dataset that has a released version. 
We'll be checking file metadatas if they are exact clones of the released versions."); } @@ -955,37 +975,24 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d } boolean indexThisMetadata = true; - if (checkForDuplicateMetadata) { - + if (checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { logger.fine("Checking if this file metadata is a duplicate."); - for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) { - if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) { - /* - * Duplicate if metadata matches and, for full text indexing and the - * SearchFields.ACCESS field, if the restricted status of the file hasn't - * changed. To address the case where full text indexing was on when a file was - * not restricted and it is now restricted and full text indexing has been shut - * off, we need to check for the change in restricted status regardless of - * whether full text indexing is on now. 
- */ - if ((fileMetadata.getDataFile().isRestricted() == releasedFileMetadata.getDataFile().isRestricted())) { - if (fileMetadata.contentEquals(releasedFileMetadata) - && variableMetadataUtil.compareVariableMetadata(releasedFileMetadata,fileMetadata) - ) { - indexThisMetadata = false; - logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); - } else { - logger.fine("This file metadata has changed since the released version; we want to index it!"); - } + FileMetadata getFromMap = fileMap.get(fileMetadata.getDataFile().getId()); + if (getFromMap != null) { + if ((fileMetadata.getDataFile().isRestricted() == getFromMap.getDataFile().isRestricted())) { + if (fileMetadata.contentEquals(getFromMap) + && variableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { + indexThisMetadata = false; + logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); } else { - logger.fine("This file's restricted status has changed since the released version; we want to index it!"); + logger.fine("This file metadata has changed since the released version; we want to index it!"); } - break; + } else { + logger.fine("This file's restricted status has changed since the released version; we want to index it!"); } } - } + } if (indexThisMetadata) { - SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); Long fileEntityId = fileMetadata.getDataFile().getId(); @@ -1272,7 +1279,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d solrInputDocument.addField(SearchFields.EMBARGO_END_DATE, embargoEndDate.toEpochDay()); } } - + try { solrClientService.getSolrClient().add(docs); solrClientService.getSolrClient().commit();