Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
8053aa5
early mods of the new export data provider framework #11405
landreev Jun 5, 2025
ebbde8b
... and had to change it back temporarily
landreev Jun 5, 2025
dcae3fc
re-checking in the updated data provider interface. #11405
landreev Jun 12, 2025
788dc4d
Merge branch 'develop' into 11405-refactor-exports
landreev Aug 25, 2025
be3603c
intermediate state of the export data provider implementation using t…
landreev Aug 26, 2025
90757da
removed some experimental code. #11405
landreev Aug 26, 2025
fbd4998
work in progress
landreev Aug 27, 2025
3cdb98f
work in progress (this is working for me, since I have dataverse-spi …
landreev Sep 2, 2025
de62287
alternative, batch implementation #11405
landreev Sep 2, 2025
4fd6a0e
logging statements for debugging #11405
landreev Sep 4, 2025
a8a7ac9
A quick fix for the batch-based datatable metadata processing #11405
landreev Sep 5, 2025
b1ed132
implemented a dedicated method in the VersionFileService for looking up
landreev Sep 8, 2025
a769133
Merge branch 'develop' into 11405-refactor-exports
landreev Sep 8, 2025
933f6a7
experimental attempts to build with snapshot releases of dataverse-sp…
landreev Sep 9, 2025
a5ca53d
fixing the typo in the repo url :( #11405
landreev Sep 9, 2025
11cd77c
Adding the option of re-exporting select formats only to the API #11405
landreev Sep 16, 2025
f9751e1
Merge branch 'develop' into 11405-refactor-exports
landreev Nov 5, 2025
7dac667
Made the batch-sizing math smarter (processing datavariable metadata …
landreev Nov 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions modules/dataverse-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -419,19 +419,25 @@
<id>unidata-all</id>
<name>Unidata All</name>
<url>https://artifacts.unidata.ucar.edu/repository/unidata-all/</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<!-- Uncomment when using snapshot releases from Maven Central -->
<!--
<repository>
<id>oss-sonatype</id>
<name>oss-sonatype</name>
<id>central-portal-snapshots</id>
<name>Central Portal Snapshots</name>
<url>
https://oss.sonatype.org/content/repositories/snapshots/
https://central.sonatype.com/repository/maven-snapshots/
</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<!--
<repository>
<id>s01-oss-sonatype</id>
<name>s01-oss-sonatype</name>
Expand All @@ -441,8 +447,8 @@
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
-->
</repository>
-->
</repositories>

<profiles>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ public interface ExportDataProvider {
* formatting it, since there can be a very large number of
* files in a dataset.
*/

JsonObject getDatasetJson(ExportDataContext... context);


/**
*
* @return - dataset metadata in the JSON-LD based OAI_ORE format used in
Expand Down Expand Up @@ -73,6 +75,7 @@ public interface ExportDataProvider {
* @throws ExportException
*/
JsonArray getTabularDataDetails(ExportDataContext ... context) throws ExportException;


/**
*
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@
<dependency>
<groupId>io.gdcc</groupId>
<artifactId>dataverse-spi</artifactId>
<version>2.0.0</version>
<version>2.1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>javax.cache</groupId>
Expand Down
15 changes: 10 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ public void exportAllDatasets(boolean forceReExport) {
|| dataset.getLastExportTime().before(publicationDate)))) {
countAll++;
try {
recordService.exportAllFormatsInNewTransaction(dataset);
recordService.exportFormatsInNewTransaction(dataset, null);
exportLogger.info("Success exporting dataset: " + dataset.getDisplayName() + " " + dataset.getGlobalId().asString());
countSuccess++;
} catch (Exception ex) {
Expand All @@ -785,13 +785,18 @@ public void exportAllDatasets(boolean forceReExport) {

@Asynchronous
public void reExportDatasetAsync(Dataset dataset) {
    // Re-export the dataset in ALL available formats (a null format list
    // means "no filter" downstream).
    exportDataset(dataset, true, null);
}

@Asynchronous
public void reExportDatasetAsync(Dataset dataset, List<String> formatNames) {
    // Re-export the dataset in the selected formats only.
    exportDataset(dataset, true, formatNames);
}

public void exportDataset(Dataset dataset, boolean forceReExport) {
private void exportDataset(Dataset dataset, boolean forceReExport, List<String> formatNames) {
if (dataset != null) {
// Note that the logic for handling a dataset is similar to what is implemented in exportAllDatasets,
// but when only one dataset is exported we do not log in a separate export logging file
// but when only one dataset is exported we do not use a dedicated log file
if (dataset.isReleased() && dataset.getReleasedVersion() != null && !dataset.isDeaccessioned()) {

// can't trust dataset.getPublicationDate(), no.
Expand All @@ -800,7 +805,7 @@ public void exportDataset(Dataset dataset, boolean forceReExport) {
&& (dataset.getLastExportTime() == null
|| dataset.getLastExportTime().before(publicationDate)))) {
try {
recordService.exportAllFormatsInNewTransaction(dataset);
recordService.exportFormatsInNewTransaction(dataset, formatNames);
logger.info("Success exporting dataset: " + dataset.getDisplayName() + " " + dataset.getGlobalId().asString());
} catch (Exception ex) {
logger.log(Level.INFO, "Error exporting dataset: " + dataset.getDisplayName() + " " + dataset.getGlobalId().asString() + "; " + ex.getMessage(), ex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,55 @@ public List<FileMetadata> getFileMetadatas(DatasetVersion datasetVersion, Intege
}
return typedQuery.getResultList();
}

/**
 * Similar to getFileMetadatas above, but dedicated to retrieving FileMetadatas
 * of only the tabular datafiles in the specified DatasetVersion. Used in the
 * metadata export subsystem.
 *
 * @param datasetVersion the DatasetVersion to access
 * @param limit for pagination, can be null
 * @param offset for pagination, can be null
 * @param publicFilesOnly when true, skip restricted, embargoed etc. files
 * @return a FileMetadata list from the specified DatasetVersion, ordered by label
 */
public List<FileMetadata> getTabularDataFileMetadatas(DatasetVersion datasetVersion, Integer limit, Integer offset, boolean publicFilesOnly) {
    CriteriaBuilder criteriaBuilder = em.getCriteriaBuilder();
    CriteriaQuery<FileMetadata> criteriaQuery = criteriaBuilder.createQuery(FileMetadata.class);

    Root<FileMetadata> fileMetadataRoot = criteriaQuery.from(FileMetadata.class);
    Predicate basePredicate = criteriaBuilder.equal(fileMetadataRoot.get("datasetVersion").<String>get("id"), datasetVersion.getId());

    // Restrict to tabular files by requiring a matching DataTable row
    // (cross-product root + equality predicate acts as an inner join):
    Root<DataTable> dataTableRoot = criteriaQuery.from(DataTable.class);
    Predicate tabularPredicate = criteriaBuilder.equal(dataTableRoot.get("dataFile"), fileMetadataRoot.get("dataFile"));

    Predicate combinedPredicate;

    // BUG FIX: the original branches were inverted - the Public access-status
    // restriction must be applied when publicFilesOnly is requested, and
    // omitted otherwise.
    if (publicFilesOnly) {
        combinedPredicate = criteriaBuilder.and(basePredicate,
                tabularPredicate,
                createSearchCriteriaAccessStatusPredicate(FileSearchCriteria.FileAccessStatus.Public,
                        criteriaBuilder,
                        fileMetadataRoot));
    } else {
        combinedPredicate = criteriaBuilder.and(basePredicate, tabularPredicate);
    }

    criteriaQuery
            .select(fileMetadataRoot)
            .where(combinedPredicate)
            .orderBy(criteriaBuilder.asc(fileMetadataRoot.get("label")));

    TypedQuery<FileMetadata> typedQuery = em.createQuery(criteriaQuery);
    if (limit != null) {
        typedQuery.setMaxResults(limit);
    }
    if (offset != null) {
        typedQuery.setFirstResult(offset);
    }

    return typedQuery.getResultList();
}

/**
* Returns the total download size of all files for a particular DatasetVersion
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/edu/harvard/iq/dataverse/api/Files.java
Original file line number Diff line number Diff line change
Expand Up @@ -1064,7 +1064,7 @@ public Response getFileDataTables(@Context ContainerRequestContext crc, @PathPar
if (!dataFile.isTabularData()) {
return badRequest(BundleUtil.getStringFromBundle("files.api.only.tabular.supported"));
}
return ok(jsonDT(dataFile.getDataTables()));
return ok(jsonDT(dataFile.getDataTables(), true));
}

@POST
Expand Down
11 changes: 9 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/api/Metadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

import edu.harvard.iq.dataverse.harvest.server.OAISetServiceBean;
import edu.harvard.iq.dataverse.harvest.server.OAISet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
*
Expand Down Expand Up @@ -64,10 +67,14 @@ public Response reExportAll() {

@GET
@Path("{id}/reExportDataset")
public Response indexDatasetByPersistentId(@PathParam("id") String id) {
public Response indexDatasetByPersistentId(@PathParam("id") String id, @QueryParam("formats") String formats) {
try {
Dataset dataset = findDatasetOrDie(id);
datasetService.reExportDatasetAsync(dataset);
List<String> formatNames = null;
if (formats != null) {
formatNames = new ArrayList<>(Arrays.asList(formats.split(",")));
}
datasetService.reExportDatasetAsync(dataset, formatNames);
return ok("export started");
} catch (WrappedResponse wr) {
return wr.getResponse();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public class DataFileDTO {
private String md5;
private String description;
private String pidURL;
private List<String> tabularTags;

public String getPidURL() {
return pidURL;
Expand Down Expand Up @@ -119,5 +120,11 @@ public void setDescription(String description) {
this.description = description;
}

/** @return the tabular tags assigned to this data file, or null if none were set */
public List<String> getTabularTags() {
    return this.tabularTags;
}

/** @param tabularTags the tabular tags to assign to this data file */
public void setTabularTags(List<String> tabularTags) {
    this.tabularTags = tabularTags;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import io.gdcc.spi.export.Exporter;
import io.gdcc.spi.export.XMLExporter;
import edu.harvard.iq.dataverse.util.BundleUtil;
import io.gdcc.spi.export.ExportDataContext;
import java.io.OutputStream;
import java.util.Locale;
import java.util.Optional;
Expand Down Expand Up @@ -38,7 +39,7 @@ public String getDisplayName(Locale locale) {
@Override
public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException {
try {
DublinCoreExportUtil.datasetJson2dublincore(dataProvider.getDatasetJson(), outputStream, DublinCoreExportUtil.DC_FLAVOR_DCTERMS);
DublinCoreExportUtil.datasetJson2dublincore(dataProvider.getDatasetJson(ExportDataContext.context().withDatasetMetadataOnly()), outputStream, DublinCoreExportUtil.DC_FLAVOR_DCTERMS);
} catch (XMLStreamException xse) {
throw new ExportException("Caught XMLStreamException performing DCTERMS export", xse);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public String getDisplayName(Locale locale) {
@Override
public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException {
try {
DdiExportUtil.datasetJson2ddi(dataProvider.getDatasetJson(), dataProvider.getDatasetFileDetails(),
DdiExportUtil.datasetJson2ddi(dataProvider.getDatasetJson(), dataProvider,
outputStream);
} catch (XMLStreamException xse) {
throw new ExportException("Caught XMLStreamException performing DDI export", xse);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import edu.harvard.iq.dataverse.export.dublincore.DublinCoreExportUtil;
import io.gdcc.spi.export.ExportDataProvider;
import io.gdcc.spi.export.ExportException;
import io.gdcc.spi.export.ExportDataContext;
import io.gdcc.spi.export.Exporter;
import io.gdcc.spi.export.XMLExporter;
import edu.harvard.iq.dataverse.util.BundleUtil;
Expand Down Expand Up @@ -38,7 +39,7 @@ public String getDisplayName(Locale locale) {
@Override
public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException {
try {
DublinCoreExportUtil.datasetJson2dublincore(dataProvider.getDatasetJson(), outputStream,
DublinCoreExportUtil.datasetJson2dublincore(dataProvider.getDatasetJson(ExportDataContext.context().withDatasetMetadataOnly()), outputStream,
DublinCoreExportUtil.DC_FLAVOR_OAI);
} catch (XMLStreamException xse) {
throw new ExportException("Caught XMLStreamException performing DC export", xse);
Expand Down
66 changes: 61 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/export/ExportService.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public class ExportService {
private Map<String, Exporter> exporterMap = new HashMap<>();

private static final Logger logger = Logger.getLogger(ExportService.class.getCanonicalName());

private ExportService() {
/*
* Step 1 - find the EXPORTERS dir and add all jar files there to a class loader
Expand Down Expand Up @@ -319,16 +319,72 @@ public void exportAllFormats(Dataset dataset) throws ExportException {
}

}

public void exportFormats(Dataset dataset, List<String> formatNames) throws ExportException {
try {
if (formatNames == null) {
clearAllCachedFormats(dataset);
} else {
clearCachedFormats(dataset, formatNames);
}
} catch (IOException ex) {
Logger.getLogger(ExportService.class.getName()).log(Level.SEVERE, null, ex);
}

public void clearAllCachedFormats(Dataset dataset) throws IOException {
try {
DatasetVersion releasedVersion = dataset.getReleasedVersion();
if (releasedVersion == null) {
throw new ExportException("No released version for dataset " + dataset.getGlobalId().toString());
}
InternalExportDataProvider dataProvider = new InternalExportDataProvider(releasedVersion);

for (Exporter e : exporterMap.values()) {
String formatName = e.getFormatName();
clearCachedExport(dataset, formatName);
if (formatNames == null || formatNames.contains(formatName)) {
if (e.getPrerequisiteFormatName().isPresent()) {
String prereqFormatName = e.getPrerequisiteFormatName().get();
try (InputStream preReqStream = getExport(dataset.getReleasedVersion(), prereqFormatName)) {
dataProvider.setPrerequisiteInputStream(preReqStream);
cacheExport(dataset, dataProvider, formatName, e);
dataProvider.setPrerequisiteInputStream(null);
} catch (IOException ioe) {
throw new ExportException("Could not get prerequisite " + e.getPrerequisiteFormatName() + " to create " + formatName + "export for dataset " + dataset.getId(), ioe);
}
} else {
cacheExport(dataset, dataProvider, formatName, e);
}
}
}
// Finally, if we have been able to successfully export in all available
// formats, we'll increment the "last exported" time stamp:
dataset.setLastExportTime(new Timestamp(new Date().getTime()));

} catch (ServiceConfigurationError serviceError) {
throw new ExportException("Service configuration error during export. " + serviceError.getMessage());
} catch (RuntimeException e) {
logger.log(Level.FINE, e.getMessage(), e);
throw new ExportException(
"Unknown runtime exception exporting metadata. " + (e.getMessage() == null ? "" : e.getMessage()));
}
}

dataset.setLastExportTime(null);
/**
 * Deletes the cached export files in ALL available formats for the dataset
 * and resets its "last export time" stamp.
 *
 * @param dataset the dataset whose cached exports should be removed
 * @throws IOException if deleting the cached export files fails
 */
public void clearAllCachedFormats(Dataset dataset) throws IOException {
    List<String> formatNames = new ArrayList<>();

    for (Exporter e : exporterMap.values()) {
        formatNames.add(e.getFormatName());
    }
    // Clear everything in one pass; the original also called
    // clearCachedExport() inside the loop, clearing each format twice.
    clearCachedFormats(dataset, formatNames);
    dataset.setLastExportTime(null);
}

public void clearCachedFormats(Dataset dataset, List<String> formatNames) throws IOException {
try {
for (String formatName : formatNames) {
clearCachedExport(dataset, formatName);
}
} catch (IOException ex) {
// not fatal
}
Expand Down Expand Up @@ -379,7 +435,7 @@ public void exportFormat(Dataset dataset, String formatName) throws ExportExcept
}

}

public Exporter getExporter(String formatName) throws ExportException {
Exporter e = exporterMap.get(formatName);
if (e != null) {
Expand Down
Loading