diff --git a/doc/release-notes/7398-saved-search-performance.md b/doc/release-notes/7398-saved-search-performance.md new file mode 100644 index 00000000000..4986524ed4f --- /dev/null +++ b/doc/release-notes/7398-saved-search-performance.md @@ -0,0 +1,30 @@ +## Release Highlights + +### Saved Search Performance Improvements + +A refactoring has greatly improved Saved Search performance in the application. If your installation has multiple, potentially long-running Saved Searches in place, this greatly improves the probability that those search jobs will complete without timing out. + +## Notes for Dataverse Installation Administrators + +### DB Cleanup for Saved Searches + +A previous version of dataverse changed the indexing logic so that when a user links a dataverse, its children are also indexed as linked. This means that the children do not need to be separately linked, and in this version we removed the logic that creates a saved search to create those links when a dataverse is linked. + +We recommend cleaning up the db to a) remove these saved searches and b) remove the links for the objects. We can do this via a few queries, which are available in the folder here: + +https://github.com/IQSS/dataverse/raw/develop/scripts/issues/7398/ + +There are four sets of queries available, and they should be run in this order: + +- ss_for_deletion.txt to identify the Saved Searches to be deleted +- delete_ss.txt to delete the Saved Searches identified in the previous query +- dld_for_deletion.txt to identify the linked datasets and dataverses to be deleted +- delete_dld.txt to delete the linked datasets and dataverses identified in the previous queries + +Note: removing these saved searches and links should not affect what users will see as linked due to the aforementioned indexing change. Similarly, not removing these saved searches and links should not affect anything, but is a cleanup of unnecessary rows in the database. + +## Additional Upgrade Instructions + +X\. (Optional, but recommended) DB Cleanup + +Perform the DB Cleanup for Saved Searches and Linked Objects, summarized in the "Notes for Dataverse Installation Administrators" section above. diff --git a/scripts/issues/7398/delete_dld.txt b/scripts/issues/7398/delete_dld.txt new file mode 100644 index 00000000000..30edf42774b --- /dev/null +++ b/scripts/issues/7398/delete_dld.txt @@ -0,0 +1,21 @@ +-- these queries will delete linked objects identified using the query in dld_for_deletion + +begin; + +delete from datasetlinkingdataverse where id in ( +select dld.id +from datasetlinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld +where dld.dataset_id = dvo.id +and dld.linkingdataverse_id = dvld.linkingdataverse_id +and dvo.owner_id = dvld.dataverse_id +); + +delete from dataverselinkingdataverse where id in ( +select dld.id +from dataverselinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld +where dld.dataverse_id = dvo.id +and dld.linkingdataverse_id = dvld.linkingdataverse_id +and dvo.owner_id = dvld.dataverse_id +); + +commit; \ No newline at end of file diff --git a/scripts/issues/7398/delete_ss.txt b/scripts/issues/7398/delete_ss.txt new file mode 100644 index 00000000000..3cf053ce5af --- /dev/null +++ b/scripts/issues/7398/delete_ss.txt @@ -0,0 +1,18 @@ +-- these queries will delete the saved searches identified using the ss_for_deletion query + +begin; + +create temporary table delete_ss on commit drop as ( +Select ss.id +from savedsearch ss, savedsearchfilterquery ssfq, dataverselinkingdataverse dld +where ss.id = ssfq.savedsearch_id +and ss.definitionpoint_id = dld.linkingdataverse_id +and dld.dataverse_id = rtrim(reverse(split_part(reverse(ssfq.filterquery),'/',1)),'"')::integer +and ss.query='*' +and ssfq.filterquery like 'subtreePaths%' +); + +delete from savedsearchfilterquery where savedsearch_id in (select id from delete_ss); +delete from savedsearch where id in (select id from delete_ss); + +commit; diff --git a/scripts/issues/7398/dld_for_deletion.txt b/scripts/issues/7398/dld_for_deletion.txt new file mode 100644 index 00000000000..51aa9ce1458 --- /dev/null +++ b/scripts/issues/7398/dld_for_deletion.txt @@ -0,0 +1,19 @@ +-- these queries will show you the linked objects that will get deleted + +-- datasets +select dld.dataset_id, dvo.owner_id, dld.linkingdataverse_id, +dvld.dataverse_id, dvld.linkingdataverse_id +from datasetlinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld +where dld.dataset_id = dvo.id +and dld.linkingdataverse_id = dvld.linkingdataverse_id +and dvo.owner_id = dvld.dataverse_id +order by dld.linkingdataverse_id; + +-- dataverses +select dld.dataverse_id, dvo.owner_id, dld.linkingdataverse_id, +dvld.dataverse_id, dvld.linkingdataverse_id +from dataverselinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld +where dld.dataverse_id = dvo.id +and dld.linkingdataverse_id = dvld.linkingdataverse_id +and dvo.owner_id = dvld.dataverse_id +order by dld.linkingdataverse_id; diff --git a/scripts/issues/7398/ss_for_deletion.txt b/scripts/issues/7398/ss_for_deletion.txt new file mode 100644 index 00000000000..11ad8bbc49a --- /dev/null +++ b/scripts/issues/7398/ss_for_deletion.txt @@ -0,0 +1,10 @@ +-- this query will show you the saved searches that will get deleted + +select ss.id, ss.definitionpoint_id, dld.dataverse_id, ssfq.filterquery +from savedsearch ss, savedsearchfilterquery ssfq, dataverselinkingdataverse dld +where ss.id = ssfq.savedsearch_id +and ss.definitionpoint_id = dld.linkingdataverse_id +and dld.dataverse_id = rtrim(reverse(split_part(reverse(ssfq.filterquery),'/',1)),'"')::integer +and ss.query='*' +and ssfq.filterquery like 'subtreePaths%' +order by ss.definitionpoint_id; \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/DataversePage.java b/src/main/java/edu/harvard/iq/dataverse/DataversePage.java index 165c1759b5e..34efc928ab9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataversePage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataversePage.java @@ -787,18 +787,9 @@ public String saveLinkedDataverse() { return ""; } - AuthenticatedUser savedSearchCreator = getAuthenticatedUser(); - if (savedSearchCreator == null) { - String msg = BundleUtil.getStringFromBundle("dataverse.link.user"); - logger.severe(msg); - JsfHelper.addErrorMessage(msg); - return returnRedirect(); - } - linkingDataverse = dataverseService.find(linkingDataverseId); LinkDataverseCommand cmd = new LinkDataverseCommand(dvRequestService.getDataverseRequest(), linkingDataverse, dataverse); - //LinkDvObjectCommand cmd = new LinkDvObjectCommand (session.getUser(), linkingDataverse, dataverse); try { commandEngine.submit(cmd); } catch (CommandException ex) { @@ -808,32 +799,9 @@ public String saveLinkedDataverse() { JsfHelper.addErrorMessage(msg); return returnRedirect(); } - - SavedSearch savedSearchOfChildren = createSavedSearchForChildren(savedSearchCreator); - - boolean createLinksAndIndexRightNow = false; - if (createLinksAndIndexRightNow) { - try { - // create links (does indexing) right now (might be expensive) - boolean debug = false; - DataverseRequest dataverseRequest = new DataverseRequest(savedSearchCreator, SavedSearchServiceBean.getHttpServletRequest()); - savedSearchService.makeLinksForSingleSavedSearch(dataverseRequest, savedSearchOfChildren, debug); - JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("dataverse.linked.success", getSuccessMessageArguments())); - return returnRedirect(); - } catch (SearchException | CommandException ex) { - // error: solr is down, etc. can't link children right now - JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("dataverse.linked.internalerror", getSuccessMessageArguments())); - String msg = dataverse.getDisplayName() + " has been successfully linked to " + linkingDataverse.getDisplayName() + " but contents will not appear until an internal error has been fixed."; - logger.log(Level.SEVERE, "{0} {1}", new Object[]{msg, ex}); - //JsfHelper.addErrorMessage(msg); - return returnRedirect(); - } - } else { - // defer: please wait for the next timer/cron job - //JsfHelper.addSuccessMessage(dataverse.getDisplayName() + " has been successfully linked to " + linkingDataverse.getDisplayName() + ". Please wait for its contents to appear."); - JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("dataverse.linked.success.wait", getSuccessMessageArguments())); - return returnRedirect(); - } + + JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("dataverse.linked.success.wait", getSuccessMessageArguments())); + return returnRedirect(); } private List getSuccessMessageArguments() { diff --git a/src/main/java/edu/harvard/iq/dataverse/search/savedsearch/SavedSearchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/savedsearch/SavedSearchServiceBean.java index 20cb1a8629a..a495842e40d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/savedsearch/SavedSearchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/savedsearch/SavedSearchServiceBean.java @@ -8,6 +8,7 @@ import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; +import edu.harvard.iq.dataverse.authorization.users.GuestUser; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.search.SearchServiceBean; import edu.harvard.iq.dataverse.search.SolrQueryResponse; @@ -20,6 +21,7 @@ import edu.harvard.iq.dataverse.search.SortBy; import edu.harvard.iq.dataverse.util.SystemConfig; import java.util.ArrayList; +import java.util.Collections; import java.util.Date; import java.util.List; import java.util.logging.Level; @@ -171,35 +173,40 @@ public JsonObjectBuilder makeLinksForSingleSavedSearch(DataverseRequest dvReq, S JsonArrayBuilder savedSearchArrayBuilder = Json.createArrayBuilder(); JsonArrayBuilder infoPerHit = Json.createArrayBuilder(); SolrQueryResponse queryResponse = findHits(savedSearch); + + List skipList = new ArrayList(); // a list for the definition point itself and already linked objects + skipList.add(savedSearch.getDefinitionPoint().getId()); - // get linked objects and add to a list TypedQuery typedQuery = em.createNamedQuery("DataverseLinkingDataverse.findIdsByLinkingDataverseId", Long.class) .setParameter("linkingDataverseId", savedSearch.getDefinitionPoint().getId()); - List alreadyLinkedObjectIds = typedQuery.getResultList(); + skipList.addAll(typedQuery.getResultList()); typedQuery = em.createNamedQuery("DatasetLinkingDataverse.findIdsByLinkingDataverseId", Long.class) .setParameter("linkingDataverseId", savedSearch.getDefinitionPoint().getId()); - alreadyLinkedObjectIds.addAll(typedQuery.getResultList()); - + skipList.addAll(typedQuery.getResultList()); + for (SolrSearchResult solrSearchResult : queryResponse.getSolrSearchResults()) { JsonObjectBuilder hitInfo = Json.createObjectBuilder(); hitInfo.add("name", solrSearchResult.getNameSort()); hitInfo.add("dvObjectId", solrSearchResult.getEntityId()); + + if (skipList.contains(solrSearchResult.getEntityId())) { + hitInfo.add(resultString, "Skipping because would link to itself or an already linked entity."); + infoPerHit.add(hitInfo); + continue; + } DvObject dvObjectThatDefinitionPointWillLinkTo = dvObjectService.findDvObject(solrSearchResult.getEntityId()); if (dvObjectThatDefinitionPointWillLinkTo == null) { hitInfo.add(resultString, "Could not find DvObject with id " + solrSearchResult.getEntityId()); infoPerHit.add(hitInfo); - break; - } + continue; + } + if (dvObjectThatDefinitionPointWillLinkTo.isInstanceofDataverse()) { Dataverse dataverseToLinkTo = (Dataverse) dvObjectThatDefinitionPointWillLinkTo; - if (wouldResultInLinkingToItself(savedSearch.getDefinitionPoint(), dataverseToLinkTo)) { - hitInfo.add(resultString, "Skipping because dataverse id " + dataverseToLinkTo.getId() + " would link to itself."); - } else if (alreadyLinkedToTheDataverse(alreadyLinkedObjectIds, dataverseToLinkTo)) { - hitInfo.add(resultString, "Skipping because dataverse " + savedSearch.getDefinitionPoint().getId() + " already links to dataverse " + dataverseToLinkTo.getId() + "."); - } else if (dataverseToLinkToIsAlreadyPartOfTheSubtree(savedSearch.getDefinitionPoint(), dataverseToLinkTo)) { + if (dataverseToLinkToIsAlreadyPartOfTheSubtree(savedSearch.getDefinitionPoint(), dataverseToLinkTo)) { hitInfo.add(resultString, "Skipping because " + dataverseToLinkTo + " is already part of the subtree for " + savedSearch.getDefinitionPoint()); } else { DataverseLinkingDataverse link = commandEngine.submitInNewTransaction(new LinkDataverseCommand(dvReq, savedSearch.getDefinitionPoint(), dataverseToLinkTo)); @@ -207,29 +214,22 @@ public JsonObjectBuilder makeLinksForSingleSavedSearch(DataverseRequest dvReq, S } } else if (dvObjectThatDefinitionPointWillLinkTo.isInstanceofDataset()) { Dataset datasetToLinkTo = (Dataset) dvObjectThatDefinitionPointWillLinkTo; - if (alreadyLinkedToTheDataset(alreadyLinkedObjectIds, datasetToLinkTo)) { - hitInfo.add(resultString, "Skipping because dataverse " + savedSearch.getDefinitionPoint() + " already links to dataset " + datasetToLinkTo + "."); - } else if (datasetToLinkToIsAlreadyPartOfTheSubtree(savedSearch.getDefinitionPoint(), datasetToLinkTo)) { + if (datasetToLinkToIsAlreadyPartOfTheSubtree(savedSearch.getDefinitionPoint(), datasetToLinkTo)) { // already there from normal search/browse hitInfo.add(resultString, "Skipping because dataset " + datasetToLinkTo.getId() + " is already part of the subtree for " + savedSearch.getDefinitionPoint().getAlias()); } else if (datasetAncestorAlreadyLinked(savedSearch.getDefinitionPoint(), datasetToLinkTo)) { hitInfo.add(resultString, "FIXME: implement this?"); - } else if (!datasetToLinkTo.isReleased()) { - hitInfo.add(resultString, "Skipping because dataset " + datasetToLinkTo.getId() + " is not released " ); } else { DatasetLinkingDataverse link = commandEngine.submitInNewTransaction(new LinkDatasetCommand(dvReq, savedSearch.getDefinitionPoint(), datasetToLinkTo)); - alreadyLinkedObjectIds.add(datasetToLinkTo.getId()); // because search results could produce two hits (published and draft) hitInfo.add(resultString, "Persisted DatasetLinkingDataverse id " + link.getId() + " link of " + link.getDataset() + " to " + link.getLinkingDataverse()); } - } else if (dvObjectThatDefinitionPointWillLinkTo.isInstanceofDataFile()) { - hitInfo.add(resultString, "Skipping because the search matched a file. The matched file id was " + dvObjectThatDefinitionPointWillLinkTo.getId() + "."); } else { hitInfo.add(resultString, "Unexpected DvObject type."); } infoPerHit.add(hitInfo); } - + JsonObjectBuilder info = getInfo(savedSearch, infoPerHit); if (debugFlag) { info.add("debug", getDebugInfo(savedSearch)); @@ -237,13 +237,12 @@ public JsonObjectBuilder makeLinksForSingleSavedSearch(DataverseRequest dvReq, S savedSearchArrayBuilder.add(info); response.add("hits for saved search id " + savedSearch.getId(), savedSearchArrayBuilder); - Date end = new Date(); - logger.info("SAVED SEARCH (" + savedSearch.getId() + ") total time in ms: " + (end.getTime() - start.getTime())); + logger.info("SAVED SEARCH (" + savedSearch.getId() + ") total time in ms: " + (new Date().getTime() - start.getTime())); return response; } private SolrQueryResponse findHits(SavedSearch savedSearch) throws SearchException { - String sortField = SearchFields.RELEVANCE; + String sortField = SearchFields.TYPE; // first return dataverses, then datasets String sortOrder = SortBy.DESCENDING; SortBy sortBy = new SortBy(sortField, sortOrder); int paginationStart = 0; @@ -251,16 +250,23 @@ private SolrQueryResponse findHits(SavedSearch savedSearch) throws SearchExcepti int numResultsPerPage = Integer.MAX_VALUE; List dataverses = new ArrayList<>(); dataverses.add(savedSearch.getDefinitionPoint()); + + // since saved search can only link Dataverses and Datasets, we can limit our search + List searchFilterQueries = savedSearch.getFilterQueriesAsStrings(); + searchFilterQueries.add("dvObjectType:(dataverses OR datasets)"); + + // run the search as GuestUser to only link published objects SolrQueryResponse solrQueryResponse = searchService.search( - new DataverseRequest(savedSearch.getCreator(), getHttpServletRequest()), + new DataverseRequest(GuestUser.get(), getHttpServletRequest()), dataverses, savedSearch.getQuery(), - savedSearch.getFilterQueriesAsStrings(), + searchFilterQueries, sortBy.getField(), sortBy.getOrder(), paginationStart, dataRelatedToMe, - numResultsPerPage + numResultsPerPage, + false // do not retrieve entities ); return solrQueryResponse; } @@ -289,18 +295,6 @@ private JsonArrayBuilder getFilterQueries(SavedSearch savedSearch) { return filterQueriesArrayBuilder; } - private boolean alreadyLinkedToTheDataverse(List alreadyLinkedObjectIds, Dataverse dataverseToLinkTo) { - return alreadyLinkedObjectIds.contains(dataverseToLinkTo.getId()); - } - - private boolean alreadyLinkedToTheDataset(List alreadyLinkedObjectIds, Dataset linkToThisDataset) { - return alreadyLinkedObjectIds.contains(linkToThisDataset.getId()); - } - - private static boolean wouldResultInLinkingToItself(Dataverse savedSearchDefinitionPoint, Dataverse dataverseToLinkTo) { - return savedSearchDefinitionPoint.equals(dataverseToLinkTo); - } - private boolean datasetToLinkToIsAlreadyPartOfTheSubtree(Dataverse definitionPoint, Dataset datasetWeMayLinkTo) { Dataverse ancestor = datasetWeMayLinkTo.getOwner(); while (ancestor != null) { @@ -351,14 +345,12 @@ public static HttpServletRequest getHttpServletRequest() { * Saved Search was created. The default IP address in the * DataverseRequest constructor is used instead, which as of this * writing is 0.0.0.0 to mean "undefined". Is this a feature or a bug? - * What is the expected interplay between Saved Search and IP Groups? - * Users might be surprised to see certain DvObjects in the results of - * their query when creating the Saved Search and later find that those - * DvObjects, which are only visible due to an IP Groups membership, are - * not found by Saved Search when executed by cron, for example. As of - * this writing Saved Search is a superuser-only feature so perhaps IP - * Groups are irrelevant because all DvObjects are discoverable to - * superusers. + * This is not an issue for the search itself, since it is now run as the + * Guest User, but would present a problem if the user does not have + * permission to create links and could only create the saved search due to + * a granted permission from the IP Group. + * As of this writing Saved Search is a superuser-only feature; so IP + * Groups are irrelevant because all superusers can create links. */ return null; }