Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions doc/release-notes/5323-saved-original-size
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Starting with release 4.10, the size of the saved original file (for an
ingested tabular datafile) is stored in the database. We provide the
following API that retrieves and permanently stores the sizes for any
already existing saved originals:
/api/admin/datafiles/integrity/fixmissingoriginalsizes (see the
documentation note in the Native API guide, under "Datafile
Integrity").

While it's not strictly necessary to have these sizes in the database,
having them makes certain operations more efficient (a primary example
is a user downloading the saved originals for multiple files or an
entire dataset, etc.). Also, if present in the database, the size will
be added to the file information displayed in the output of
/api/datasets, which can be useful for some users.

9 changes: 9 additions & 0 deletions doc/sphinx-guides/source/api/native-api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,15 @@ Delete Provenance JSON for an uploaded file::

DELETE http://$SERVER/api/files/{id}/prov-json?key=$apiKey

Datafile Integrity
~~~~~~~~~~~~~~~~~~

Starting with release 4.10, the size of the saved original file (for an ingested tabular datafile) is stored in the database. The following API will retrieve and permanently store the sizes for any already existing saved originals::

GET http://$SERVER/api/admin/datafiles/integrity/fixmissingoriginalsizes{?limit=N}

Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise, it will do so for the first N such datafiles.

Builtin Users
-------------

Expand Down
4 changes: 3 additions & 1 deletion scripts/database/upgrades/upgrade_v4.9.4_to_v4.10.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ INSERT INTO setting(
ALTER TABLE actionlogrecord ALTER COLUMN info TYPE text;


ALTER TABLE dataverse ALTER COLUMN defaultcontributorrole_id DROP NOT NULL;
ALTER TABLE dataverse ALTER COLUMN defaultcontributorrole_id DROP NOT NULL;

ALTER TABLE datatable ADD COLUMN originalfilesize BIGINT;
10 changes: 10 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,16 @@ public String getOriginalFileFormat() {
}
return null;
}

/**
 * Returns the size, in bytes, of the saved original file for an ingested
 * tabular datafile.
 *
 * @return the stored original size, or {@code null} if this file is not
 *         tabular, has no associated {@code DataTable}, or the size was
 *         never recorded in the database (legacy ingests).
 */
public Long getOriginalFileSize() {
    if (!isTabularData()) {
        return null;
    }
    DataTable table = getDataTable();
    return (table == null) ? null : table.getOriginalFileSize();
}

@Override
public boolean isAncestorOf( DvObject other ) {
Expand Down
20 changes: 16 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ public DataFile findCheapAndEasy(Long id) {
if (MIME_TYPE_TSV.equalsIgnoreCase(contentType)) {
Object[] dtResult;
try {
dtResult = (Object[]) em.createNativeQuery("SELECT ID, UNF, CASEQUANTITY, VARQUANTITY, ORIGINALFILEFORMAT FROM dataTable WHERE DATAFILE_ID = " + id).getSingleResult();
dtResult = (Object[]) em.createNativeQuery("SELECT ID, UNF, CASEQUANTITY, VARQUANTITY, ORIGINALFILEFORMAT, ORIGINALFILESIZE FROM dataTable WHERE DATAFILE_ID = " + id).getSingleResult();
} catch (Exception ex) {
dtResult = null;
}
Expand All @@ -509,6 +509,8 @@ public DataFile findCheapAndEasy(Long id) {

dataTable.setOriginalFileFormat((String)dtResult[4]);

dataTable.setOriginalFileSize((Long)dtResult[5]);

dataTable.setDataFile(dataFile);
dataFile.setDataTable(dataTable);

Expand Down Expand Up @@ -567,7 +569,7 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

int i = 0;

List<Object[]> dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList();
List<Object[]> dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList();

for (Object[] result : dataTableResults) {
DataTable dataTable = new DataTable();
Expand All @@ -583,6 +585,8 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

dataTable.setOriginalFileFormat((String)result[5]);

dataTable.setOriginalFileSize((Long)result[6]);

dataTables.add(dataTable);
datatableMap.put(fileId, i++);

Expand Down Expand Up @@ -1444,7 +1448,7 @@ public boolean isReplacementFile(DataFile df) {
} // end: isReplacementFile

public List<Long> selectFilesWithMissingOriginalTypes() {
Query query = em.createNativeQuery("SELECT f.id FROM datafile f, datatable t where t.datafile_id = f.id AND t.originalfileformat='" + MIME_TYPE_TSV + "' ORDER BY f.id");
Query query = em.createNativeQuery("SELECT f.id FROM datafile f, datatable t where t.datafile_id = f.id AND (t.originalfileformat='" + MIME_TYPE_TSV + "' OR t.originalfileformat IS NULL) ORDER BY f.id");

try {
return query.getResultList();
Expand All @@ -1453,7 +1457,15 @@ public List<Long> selectFilesWithMissingOriginalTypes() {
}
}


/**
 * Finds the ids of all tabular datafiles whose datatable has an original
 * file format recorded but no original file size yet (i.e., saved
 * originals ingested before the size column existed).
 * Used by the "fixmissingoriginalsizes" admin API.
 *
 * @return the matching datafile ids, ordered by id; an empty list if the
 *         lookup fails for any reason (best-effort query).
 */
public List<Long> selectFilesWithMissingOriginalSizes() {
    final String sql = "SELECT f.id FROM datafile f, datatable t where t.datafile_id = f.id AND (t.originalfilesize IS NULL ) AND (t.originalfileformat IS NOT NULL) ORDER BY f.id";
    try {
        return em.createNativeQuery(sql).getResultList();
    } catch (Exception ex) {
        // Defensive: treat any lookup failure as "nothing to fix".
        return new ArrayList<>();
    }
}

public String generateDataFileIdentifier(DataFile datafile, GlobalIdServiceBean idServiceBean) {
String doiIdentifierType = settingsService.getValueForKey(SettingsServiceBean.Key.IdentifierGenerationStyle, "randomString");
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DataTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ public DataTable() {
*/
private String originalFormatVersion;

/*
* Size of the original file:
*/

private Long originalFileSize;

/*
* Getter and Setter methods:
*/
Expand Down Expand Up @@ -172,6 +178,14 @@ public void setOriginalFileFormat(String originalFileType) {
this.originalFileFormat = originalFileType;
}

/**
 * @return the size, in bytes, of the saved original file, or {@code null}
 *         when it was never recorded (ingests predating this column).
 */
public Long getOriginalFileSize() {
    return this.originalFileSize;
}

/**
 * Records the size, in bytes, of the saved original file.
 */
public void setOriginalFileSize(Long size) {
    this.originalFileSize = size;
}


public String getOriginalFormatVersion() {
return originalFormatVersion;
Expand Down
34 changes: 18 additions & 16 deletions src/main/java/edu/harvard/iq/dataverse/api/Access.java
Original file line number Diff line number Diff line change
Expand Up @@ -578,23 +578,25 @@ public void write(OutputStream os) throws IOException,
//without doing a large deal of rewriting or architecture redo.
//The previous size checks for non-original download is still quick.
//-MAD 4.9.2
DataAccessRequest daReq = new DataAccessRequest();
StorageIO<DataFile> accessObject = DataAccess.getStorageIO(file, daReq);

if (accessObject != null) {
Boolean gotOriginal = false;
StoredOriginalFile sof = new StoredOriginalFile();
StorageIO<DataFile> tempAccessObject = sof.retreive(accessObject);
if(null != tempAccessObject) { //If there is an original, use it
gotOriginal = true;
accessObject = tempAccessObject;
}
if(!gotOriginal) { //if we didn't get this from sof.retreive we have to open it
accessObject.open();
}
size = accessObject.getSize();
// OK, here's the better solution: we now store the size of the original file in
// the database (in DataTable), so we get it for free.
// However, there may still be legacy datatables for which the size is not saved.
// so the "inefficient" code is kept, below, as a fallback solution.
// -- L.A., 4.10

if (file.getDataTable().getOriginalFileSize() != null) {
size = file.getDataTable().getOriginalFileSize();
} else {
DataAccessRequest daReq = new DataAccessRequest();
StorageIO<DataFile> storageIO = DataAccess.getStorageIO(file, daReq);
storageIO.open();
size = storageIO.getAuxObjectSize(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);

// save it permanently:
file.getDataTable().setOriginalFileSize(size);
fileService.saveDataTable(file.getDataTable());
}
if(size == 0L){
if (size == 0L){
throw new IOException("Invalid file size or accessObject when checking limits of zip file");
}
} else {
Expand Down
28 changes: 28 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Admin.java
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,34 @@ public Response fixMissingOriginalTypes() {

return ok(info);
}

@Path("datafiles/integrity/fixmissingoriginalsizes")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@landreev can you please document this new API endpoint in the guides?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have documented it in the Native API guide, with the other file-related API calls. Also added the release notes specific to the issue (doc/release-notes/5323-saved-original-size).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(thanks :)

@GET
/**
 * Admin API: finds all tabular datafiles whose saved-original size is
 * missing from the database and kicks off an async background job to
 * look the sizes up from storage and save them permanently.
 *
 * @param limit optional cap: when present and smaller than the number of
 *              affected files, only the first {@code limit} files
 *              (ordered by id) are repaired.
 * @return an OK response whose "message" describes what was scheduled.
 */
public Response fixMissingOriginalSizes(@QueryParam("limit") Integer limit) {
    JsonObjectBuilder info = Json.createObjectBuilder();

    List<Long> affectedFileIds = fileService.selectFilesWithMissingOriginalSizes();

    if (affectedFileIds.isEmpty()) {
        info.add("message",
                "All the tabular files in the database already have the original sizes set correctly; exiting.");
    } else {

        int howmany = affectedFileIds.size();
        String message = "Found " + howmany + " tabular files with missing original sizes. ";

        if (limit == null || howmany <= limit) {
            message = message.concat(" Kicking off an async job that will repair the files in the background.");
        } else {
            // Keep only the first "limit" ids. subList's toIndex is
            // EXCLUSIVE, so the correct upper bound is howmany; the
            // previous (limit, howmany - 1) left the last id in the list
            // and repaired limit + 1 files instead of limit.
            affectedFileIds.subList(limit, howmany).clear();
            message = message.concat(" Kicking off an async job that will repair the " + limit + " files in the background.");
        }
        info.add("message", message);
    }

    // Async: returns immediately; the repair runs in the background.
    ingestService.fixMissingOriginalSizes(affectedFileIds);
    return ok(info);
}

/**
* This method is used in API tests, called from UtilIt.java.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ public static StorageIO<DataFile> retreive(StorageIO<DataFile> storageIO) {
try {
storageIO.open();
Channel storedOriginalChannel = storageIO.openAuxChannel(SAVED_ORIGINAL_FILENAME_EXTENSION);
storedOriginalSize = storageIO.getAuxObjectSize(SAVED_ORIGINAL_FILENAME_EXTENSION);
storedOriginalSize = dataFile.getDataTable().getOriginalFileSize() != null ?
dataFile.getDataTable().getOriginalFileSize() :
storageIO.getAuxObjectSize(SAVED_ORIGINAL_FILENAME_EXTENSION);
inputStreamIO = new InputStreamIO(Channels.newInputStream((ReadableByteChannel) storedOriginalChannel), storedOriginalSize);
logger.fine("Opened stored original file as Aux "+SAVED_ORIGINAL_FILENAME_EXTENSION);
} catch (IOException ioEx) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,7 @@ public boolean ingestAsTabular(Long datafile_id) {
} else {
tabDataIngest.getDataTable().setOriginalFileFormat(originalContentType);
}
tabDataIngest.getDataTable().setOriginalFileSize(originalFileSize);

dataFile.setDataTable(tabDataIngest.getDataTable());
tabDataIngest.getDataTable().setDataFile(dataFile);
Expand Down Expand Up @@ -1675,6 +1676,22 @@ public void fixMissingOriginalTypes(List<Long> datafileIds) {
logger.info("Finished repairing tabular data files that were missing the original file format labels.");
}

// This method takes a list of file ids and tries to fix the size of the saved
// original, if present.
// Note the @Asynchronous attribute - this allows us to just kick off and run this
// (potentially large) job in the background.
// The method is called by the "fixmissingoriginalsizes" /admin api call.
@Asynchronous
public void fixMissingOriginalSizes(List<Long> datafileIds) {
    for (Long fileId : datafileIds) {
        fixMissingOriginalSize(fileId);
        // Throttle: pause between files so this background job doesn't
        // monopolize the database/storage.
        try {
            Thread.sleep(1000);
        } catch (InterruptedException ex) {
            // Restore the interrupt status and stop the job, instead of
            // silently swallowing the interruption (which would leave the
            // flag cleared and the job unstoppable).
            Thread.currentThread().interrupt();
            break;
        }
    }
    logger.info("Finished repairing tabular data files that were missing the original file sizes.");
}

// This method fixes a datatable object that's missing the format type of
// the ingested original. It will check the saved original file to
// determine the type.
Expand Down Expand Up @@ -1745,6 +1762,8 @@ private void fixMissingOriginalType(long fileId) {
logger.warning("Caught exception trying to determine original file type (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ioex.getMessage());
}

Long savedOriginalFileSize = savedOriginalFile.length();

// If we had to create a temp file, delete it now:
if (tempFileRequired) {
savedOriginalFile.delete();
Expand All @@ -1769,6 +1788,7 @@ private void fixMissingOriginalType(long fileId) {

// save permanently in the database:
dataFile.getDataTable().setOriginalFileFormat(fileTypeDetermined);
dataFile.getDataTable().setOriginalFileSize(savedOriginalFileSize);
fileService.saveDataTable(dataFile.getDataTable());

} else {
Expand All @@ -1779,6 +1799,46 @@ private void fixMissingOriginalType(long fileId) {
}
}

// This method fixes a datatable object that's missing the size of the
// ingested original.
// This method fixes a datatable object that's missing the size of the
// ingested original. The size is looked up on the physical saved original
// (the "*.orig" aux object) and then stored permanently in the database.
private void fixMissingOriginalSize(long fileId) {
    DataFile dataFile = fileService.find(fileId);

    if (dataFile == null) {
        logger.warning("DataFile id=" + fileId + ": No such DataFile!");
        return;
    }
    if (!dataFile.isTabularData()) {
        // Only ingested tabular files have saved originals; the previous
        // version logged "No such DataFile!" here, which was misleading.
        logger.warning("DataFile id=" + fileId + " is not tabular; no saved original size to fix.");
        return;
    }

    Long savedOriginalFileSize = dataFile.getDataTable().getOriginalFileSize();
    Long datatableId = dataFile.getDataTable().getId();

    if (savedOriginalFileSize != null) {
        // Nothing to do - the size is already recorded.
        logger.info("DataFile id=" + fileId + "; original file size already present: " + savedOriginalFileSize);
        return;
    }

    StorageIO<DataFile> storageIO;

    try {
        storageIO = dataFile.getStorageIO();
        storageIO.open();
        // Size of the saved-original aux object in storage:
        savedOriginalFileSize = storageIO.getAuxObjectSize(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);

    } catch (Exception ex) {
        logger.warning("Exception "+ex.getClass()+" caught trying to look up the size of the saved original; (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ex.getMessage());
        return;
    }

    if (savedOriginalFileSize == null) {
        logger.warning("Failed to look up the size of the saved original file! (datafile id=" + fileId + ", datatable id=" + datatableId + ")");
        return;
    }

    // save permanently in the database:
    dataFile.getDataTable().setOriginalFileSize(savedOriginalFileSize);
    fileService.saveDataTable(dataFile.getDataTable());
}

public static void main(String[] args) {

String file = args[0];
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ public static String getFacetFileType(DataFile dataFile) {
}

public static String getUserFriendlyOriginalType(DataFile dataFile) {
if (!dataFile.isTabularData()) {
return null;
}

String fileType = dataFile.getOriginalFileFormat();

if (fileType != null && !fileType.equals("")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,7 @@ public static JsonObjectBuilder json(DataFile df, FileMetadata fileMetadata) {
.add("storageIdentifier", df.getStorageIdentifier())
.add("originalFileFormat", df.getOriginalFileFormat())
.add("originalFormatLabel", df.getOriginalFormatLabel())
.add ("originalFileSize", df.getOriginalFileSize())
.add("UNF", df.getUnf())
//---------------------------------------------
// For file replace: rootDataFileId, previousDataFileId
Expand All @@ -586,7 +587,7 @@ public static JsonObjectBuilder json(DataFile df, FileMetadata fileMetadata) {
//---------------------------------------------
// Checksum
// * @todo Should we deprecate "md5" now that it's under
// * "checksum" (which may also be a SHA-1 rather than an MD5)?
// * "checksum" (which may also be a SHA-1 rather than an MD5)? - YES!
//---------------------------------------------
.add("md5", getMd5IfItExists(df.getChecksumType(), df.getChecksumValue()))
.add("checksum", getChecksumTypeAndValue(df.getChecksumType(), df.getChecksumValue()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ public void testGetFileCategories() {
assertEquals("Data", jsonObject.getJsonArray("categories").getString(0));
assertEquals("", jsonObject.getJsonObject("dataFile").getString("filename"));
assertEquals(-1, jsonObject.getJsonObject("dataFile").getInt("filesize"));
assertEquals("UNKNOWN", jsonObject.getJsonObject("dataFile").getString("originalFormatLabel"));
assertEquals(-1, jsonObject.getJsonObject("dataFile").getInt("rootDataFileId"));
assertEquals("Survey", jsonObject.getJsonObject("dataFile").getJsonArray("tabularTags").getString(0));
}
Expand Down