From e82771a967dc8db0a6d39b289938cabf9995f796 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 3 Sep 2021 16:53:33 -0400 Subject: [PATCH 01/26] support HTTP "Range" header #6937 Support is not full. As noted in the docs, only a single range is supported and the "If-Range" header is not supported. --- doc/sphinx-guides/source/api/dataaccess.rst | 16 +++ .../dataverse/api/DownloadInstanceWriter.java | 114 ++++++++++++++++- .../iq/dataverse/dataaccess/FileAccessIO.java | 1 + .../iq/dataverse/dataaccess/Range.java | 30 +++++ .../iq/dataverse/dataaccess/S3AccessIO.java | 2 + .../iq/dataverse/dataaccess/StorageIO.java | 14 +++ .../api/DownloadInstanceWriterTest.java | 116 ++++++++++++++++++ .../edu/harvard/iq/dataverse/api/FilesIT.java | 49 +++++++- .../edu/harvard/iq/dataverse/api/UtilIT.java | 24 ++-- .../dataaccess/FileAccessIOTest.java | 24 ++++ 10 files changed, 380 insertions(+), 10 deletions(-) create mode 100644 src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java create mode 100644 src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java diff --git a/doc/sphinx-guides/source/api/dataaccess.rst b/doc/sphinx-guides/source/api/dataaccess.rst index 55f7f021887..3973428a5a3 100755 --- a/doc/sphinx-guides/source/api/dataaccess.rst +++ b/doc/sphinx-guides/source/api/dataaccess.rst @@ -131,6 +131,22 @@ true Generates a thumbnail image by rescaling to the default thumbnai ``N`` Rescales the image to ``N`` pixels wide. ``imageThumb=true`` and ``imageThumb=64`` are equivalent. ============== =========== +Headers: +~~~~~~~~ + +============== =========== +Header Description +============== =========== +Range Download a specified byte range. Examples: + + - ``bytes=0-9`` gets the first 10 bytes. + - ``bytes=10-19`` gets 20 bytes from the middle. + - ``bytes=-10`` gets the last 10 bytes. + - ``bytes=9-`` gets all bytes except the first 10. + + Only a single range is supported. The "If-Range" header is not supported. For more on the "Range" header, see https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests +============== =========== + Multiple File ("bundle") download --------------------------------- diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 2621a5e0b09..c8c9de30215 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -80,7 +80,24 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] StorageIO storageIO = DataAccess.getStorageIO(dataFile, daReq); if (storageIO != null) { + List ranges = new ArrayList<>(); + long fileSize = storageIO.getDataFile().getFilesize(); try { + String range = di.getRequestHttpHeaders().getHeaderString("Range"); + long offset = 0; + try { + ranges = getRanges(range, fileSize); + } catch (Exception ex) { + logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage()); + // The message starts with "Datafile" because otherwise the message is not passed + // to the user due to how WebApplicationExceptionHandler works. + throw new NotFoundException("Datafile download error due to Range header: " + ex.getLocalizedMessage()); + } + if (!ranges.isEmpty()) { + // For now we only support a single range. + offset = ranges.get(0).getStart(); + } + storageIO.setOffset(offset); storageIO.open(); } catch (IOException ioex) { //throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE); @@ -403,9 +420,14 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] long contentSize; boolean useChunkedTransfer = false; //if ((contentSize = getFileSize(di, storageIO.getVarHeader())) > 0) { - if ((contentSize = getContentSize(storageIO)) > 0) { + if ((contentSize = getContentSize(storageIO)) > 0 && ranges.isEmpty()) { logger.fine("Content size (retrieved from the AccessObject): " + contentSize); httpHeaders.add("Content-Length", contentSize); + } else if (ranges.isEmpty()) { + // For now we only support a single range. + long rangeContentSize = ranges.get(0).getLength(); + logger.fine("Content size (Range header in use): " + rangeContentSize); + httpHeaders.add("Content-Length", rangeContentSize); } else { //httpHeaders.add("Transfer-encoding", "chunked"); //useChunkedTransfer = true; @@ -433,12 +455,26 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] } } + // Default to reading the whole file. We'll count down as we write. + long leftToRead = fileSize; + if (!ranges.isEmpty()) { + // Read a range of bytes instead of the whole file. + // For now we only support a single range. + leftToRead = ranges.get(0).getLength(); + } while ((bufsize = instream.read(bffr)) != -1) { if (useChunkedTransfer) { String chunkSizeLine = String.format("%x\r\n", bufsize); outstream.write(chunkSizeLine.getBytes()); } - outstream.write(bffr, 0, bufsize); + if ((leftToRead -= bufsize) > 0) { + // Just do a normal write. Potentially lots to go. Don't break. + outstream.write(bffr, 0, bufsize); + } else { + // Get those last bytes or bytes equal to bufsize. Last one. Then break. + outstream.write(bffr, 0, (int) leftToRead + bufsize); + break; + } if (useChunkedTransfer) { outstream.write(chunkClose); } @@ -585,4 +621,78 @@ private long getFileSize(DownloadInstance di, String extraHeader) { } return -1; } + + /** + * @param range "bytes 0-10" for example. Found in the "Range" HTTP header. + * @param fileSize File size in bytes. + * @throws RunTimeException on any problems processing the Range header. + */ + public List getRanges(String range, long fileSize) { + // Inspired by https://gist.github.com/davinkevin/b97e39d7ce89198774b4 + // via https://stackoverflow.com/questions/28427339/how-to-implement-http-byte-range-requests-in-spring-mvc/28479001#28479001 + List ranges = new ArrayList<>(); + + if (range != null) { + logger.fine("Range header supplied: " + range); + + // Technically this regex supports multiple ranges. + // Below we have a check to enforce a single range. + if (!range.matches("^bytes=\\d*-\\d*(,\\d*-\\d*)*$")) { + throw new RuntimeException("The format is bytes=- where start and end are optional."); + } + + if (ranges.isEmpty()) { + // The 6 is to remove "bytes=" + + for (String part : range.substring(6).split(",")) { + + long start = getRangeStart(part); + long end = getRangeEnd(part); + + if (start == -1) { + // start does not exist. Base start off of how many bytes from end. + start = fileSize - end; + end = fileSize - 1; + } else if (end == -1 || end > fileSize - 1) { + // Set end when it doesn't exist. + // Also, automatically set end to size of file if end is beyond + // the file size (rather than throwing an error). + end = fileSize - 1; + } + + if (start > end) { + throw new RuntimeException("Start is larger than end."); + } + + if (ranges.size() < 1) { + ranges.add(new Range(start, end)); + } else { + throw new RuntimeException("Only one range is allowed."); + } + + } + } + } + + return ranges; + } + + /** + * @return Return a positive long or -1 if start does not exist. + */ + public static long getRangeStart(String part) { + // Get everything before the "-". + String start = part.substring(0, part.indexOf("-")); + return (start.length() > 0) ? Long.parseLong(start) : -1; + } + + /** + * @return Return a positive long or -1 if end does not exist. + */ + public static long getRangeEnd(String part) { + // Get everything after the "-". + String end = part.substring(part.indexOf("-") + 1, part.length()); + return (end.length() > 0) ? Long.parseLong(end) : -1; + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java index bd0549622f0..9c55caa3360 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java @@ -509,6 +509,7 @@ public FileInputStream openLocalFileAsInputStream () { try { in = new FileInputStream(getFileSystemPath().toFile()); + in.skip(this.getOffset()); } catch (IOException ex) { // We don't particularly care what the reason why we have // failed to access the file was. diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java new file mode 100644 index 00000000000..e07d6aaae94 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java @@ -0,0 +1,30 @@ +package edu.harvard.iq.dataverse.dataaccess; + +public class Range { + + // Used to set the offset, how far to skip into the file. + private final long start; + // Used to calculate the length. + private final long end; + // Used to determine when to stop reading. + private final long length; + + public Range(long start, long end) { + this.start = start; + this.end = end; + this.length = end - start + 1; + } + + public long getStart() { + return start; + } + + public long getEnd() { + return end; + } + + public long getLength() { + return length; + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 80fbbd103e7..52bb2cd5895 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -282,6 +282,8 @@ public InputStream getInputStream() throws IOException { setChannel(Channels.newChannel(super.getInputStream())); + super.getInputStream().skip(super.getOffset()); + return super.getInputStream(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java index 2f66eec5f4c..0387561a075 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java @@ -191,6 +191,12 @@ public boolean canWrite() { /*private int status;*/ private long size; + + /** + * Where in the file to seek to when reading (default is zero bytes, the + * start of the file). + */ + private long offset; private String mimeType; private String fileName; @@ -272,6 +278,10 @@ public long getSize() { return size; } + public long getOffset() { + return offset; + } + public InputStream getInputStream() throws IOException { return in; } @@ -381,6 +391,10 @@ public void setSize(long s) { size = s; } + public void setOffset(long offset) { + this.offset = offset; + } + public void setInputStream(InputStream is) { in = is; } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java b/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java new file mode 100644 index 00000000000..a59bc497c8b --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java @@ -0,0 +1,116 @@ +package edu.harvard.iq.dataverse.api; + +import edu.harvard.iq.dataverse.dataaccess.Range; +import java.util.List; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import org.junit.Before; +import org.junit.Test; + +public class DownloadInstanceWriterTest { + + DownloadInstanceWriter diw; + + @Before + public void setUpClass() { + diw = new DownloadInstanceWriter(); + } + + // Get first 10 bytes. + @Test + public void testGetRange0to9of100() { + List ranges = diw.getRanges("bytes=0-9", 100); + assertEquals(0, ranges.get(0).getStart()); + assertEquals(9, ranges.get(0).getEnd()); + assertEquals(10, ranges.get(0).getLength()); + } + + // Don't start reading until 90th byte, get rest. + @Test + public void testGetRange90toNullof100() { + List ranges = diw.getRanges("bytes=90-", 100); + assertEquals(90, ranges.get(0).getStart()); + assertEquals(99, ranges.get(0).getEnd()); + assertEquals(10, ranges.get(0).getLength()); + } + + // Get last 10 bytes. + @Test + public void testGetRangeNullto5of10() { + List ranges = diw.getRanges("bytes=-10", 100); + assertEquals(90, ranges.get(0).getStart()); + assertEquals(99, ranges.get(0).getEnd()); + assertEquals(10, ranges.get(0).getLength()); + } + + // Get last byte + @Test + public void testGetRange100toNullof101() { + List ranges = diw.getRanges("bytes=100-", 101); + assertEquals(100, ranges.get(0).getStart()); + assertEquals(100, ranges.get(0).getEnd()); + assertEquals(1, ranges.get(0).getLength()); + } + + // When you request a range beyond the size of the file we just + // give you what we can (rather than throwing an error). + @Test + public void testGetRangeBeyondFilesize() { + List ranges = diw.getRanges("bytes=100-149", 120); + assertEquals(100, ranges.get(0).getStart()); + assertEquals(119, ranges.get(0).getEnd()); + assertEquals(20, ranges.get(0).getLength()); + } + + // Attempt to get invalid range (start larger than end). + @Test + public void testGetRangeInvalidStartLargerThanEnd() { + Exception expectedException = null; + try { + List ranges = diw.getRanges("bytes=20-10", 100); + } catch (Exception ex) { + System.out.println("exeption: " + ex); + expectedException = ex; + } + assertNotNull(expectedException); + } + + // Test "junk" instead of "bytes=0-10" + @Test + public void testGetRangeInvalidJunk() { + Exception expectedException = null; + try { + List ranges = diw.getRanges("junk", 100); + } catch (Exception ex) { + System.out.println("exeption: " + ex); + expectedException = ex; + } + assertNotNull(expectedException); + } + + // Get first 10 bytes and last 10 bytes. Not currently supported. + @Test + public void testGetRanges0to0and90toNull() { + Exception expectedException = null; + try { + List ranges = diw.getRanges("bytes=0-9,-10", 100); + // These asserts on start, end, etc. don't actually + // run because we throw an expection that multiple + // ranges are not supported. + // + // first range + assertEquals(0, ranges.get(0).getStart()); + assertEquals(9, ranges.get(0).getEnd()); + assertEquals(10, ranges.get(0).getLength()); + // second range + assertEquals(90, ranges.get(1).getStart()); + assertEquals(99, ranges.get(1).getEnd()); + assertEquals(10, ranges.get(1).getLength()); + } catch (Exception ex) { + System.out.println("exeption: " + ex); + expectedException = ex; + } + assertNotNull(expectedException); + } + +} diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index 9fa06e28a0d..c824fbf7182 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -38,6 +38,7 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; +import org.junit.Ignore; public class FilesIT { @@ -1579,5 +1580,51 @@ private void dashes(){ private void msgt(String m){ dashes(); msg(m); dashes(); } - + + @Ignore + @Test + public void test2toNull() { + Integer localFile = 15; + Integer s3file = 16; + Integer fileId = 0; + fileId = localFile; + String nullApiToken = null; + String byteRange = null; + byteRange = "2-"; + // java.lang.IndexOutOfBoundsException: Range [0, 0 + -2) out of bounds for length 32768 + Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullApiToken); + downloadFile.prettyPrint(); + } + + @Ignore + @Test + public void test0to100() { + Integer localFile = 16; + Integer s3file = 11; + Integer fileId = 0; + fileId = localFile; + fileId = s3file; + String nullApiToken = null; + String byteRange = null; + byteRange = "0-9"; // first ten + byteRange = "-9"; // last ten + byteRange = "9-"; // last ten + Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullApiToken); + downloadFile.prettyPrint(); + } + + @Ignore + @Test + public void testMultipleRanges() { + Integer localFile = 16; + Integer s3file = 11; + Integer fileId = 0; + fileId = localFile; +// fileId = s3file; + String nullApiToken = null; + String byteRange = null; + byteRange = "0-9,90-99"; + Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullApiToken); + downloadFile.prettyPrint(); + } } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index d29d3761f22..a040c3506c3 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -720,13 +720,23 @@ static Response downloadFile(Integer fileId) { } static Response downloadFile(Integer fileId, String apiToken) { - return given() - /** - * Data Access API does not support X-Dataverse-key header - - * https://github.com/IQSS/dataverse/issues/2662 - */ - //.header(API_TOKEN_HTTP_HEADER, apiToken) - .get("/api/access/datafile/" + fileId + "?key=" + apiToken); + String nullByteRange = null; + return downloadFile(fileId, nullByteRange, apiToken); + } + + static Response downloadFile(Integer fileId, String byteRange, String apiToken) { + RequestSpecification requestSpecification = given(); + if (byteRange != null) { + requestSpecification.header("Range", "bytes=" + byteRange); + } + /** + * Data Access API does not support X-Dataverse-key header - + * https://github.com/IQSS/dataverse/issues/2662 + * + * Actually, these days it does. We could switch. + */ + //.header(API_TOKEN_HTTP_HEADER, apiToken) + return requestSpecification.get("/api/access/datafile/" + fileId + "?key=" + apiToken); } static Response downloadTabularFile(Integer fileId) { diff --git a/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java index 286fe8a6595..ad31e4adee3 100644 --- a/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java @@ -269,6 +269,30 @@ public void testOpenLocalFileAsInputStream() throws IOException { assertNotNull(datasetAccess.openLocalFileAsOutputStream()); } + /** + * Test of openLocalFileAsInputStream (with an offset) and + * openLocalFileAsOutputStream method, of class FileAccessIO. + * + * @throws java.io.IOException if test is broken + */ + @Test + public void testOpenLocalFileAsInputStreamWithOffset() throws IOException { + // Start reading a few bytes in. + datasetAccess.setOffset(11); + BufferedReader br = new BufferedReader(new InputStreamReader(datasetAccess.openLocalFileAsInputStream())); + StringBuilder sb = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append('\n'); + } + // Put offset back. + datasetAccess.setOffset(0); + // Instead of "This is a test string" + assertEquals("est string\n", sb.toString()); + assertNotNull(datasetAccess.openLocalFileAsOutputStream()); + } + /** * Test of getAuxFileAsInputStream method, of class FileAccessIO. * From 875d08de3583b63a63bb908059c94b838121f907 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 8 Sep 2021 14:03:17 -0400 Subject: [PATCH 02/26] avoid NPE when HttpHeaders is null #6937 This was null for aux files, for example. --- .../harvard/iq/dataverse/api/DownloadInstanceWriter.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index c8c9de30215..0f5ccfadb0a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -43,6 +43,7 @@ import javax.ws.rs.NotFoundException; import javax.ws.rs.RedirectionException; import javax.ws.rs.ServiceUnavailableException; +import javax.ws.rs.core.HttpHeaders; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; @@ -83,7 +84,11 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] List ranges = new ArrayList<>(); long fileSize = storageIO.getDataFile().getFilesize(); try { - String range = di.getRequestHttpHeaders().getHeaderString("Range"); + String range = null; + HttpHeaders headers = di.getRequestHttpHeaders(); + if (headers != null) { + range = headers.getHeaderString("Range"); + } long offset = 0; try { ranges = getRanges(range, fileSize); From fb8d5597f68cf0219b9111bfb23f2c784f1903af Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 8 Sep 2021 15:17:40 -0400 Subject: [PATCH 03/26] set file size properly for original tabular files #6937 --- .../edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 0f5ccfadb0a..5537693c670 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -293,6 +293,8 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if ("original".equals(di.getConversionParamValue())) { logger.fine("stored original of an ingested file requested"); + // Get orginal file size before storageIO is reassigned to prevent an NPE. + fileSize = storageIO.getDataFile().getOriginalFileSize(); storageIO = StoredOriginalFile.retreive(storageIO); } else { // Other format conversions: From 809d20b7a15aaa31d9e332d07f54c13894038fc5 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 8 Sep 2021 15:51:19 -0400 Subject: [PATCH 04/26] send content size from range #6937 --- .../edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 5537693c670..c3b3350a355 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -430,7 +430,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if ((contentSize = getContentSize(storageIO)) > 0 && ranges.isEmpty()) { logger.fine("Content size (retrieved from the AccessObject): " + contentSize); httpHeaders.add("Content-Length", contentSize); - } else if (ranges.isEmpty()) { + } else if (!ranges.isEmpty()) { // For now we only support a single range. long rangeContentSize = ranges.get(0).getLength(); logger.fine("Content size (Range header in use): " + rangeContentSize); From 29ea2564a96bf68790c2282108922b12971ae897 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 30 Sep 2021 16:52:30 -0400 Subject: [PATCH 05/26] move range check down so storageIO.getSize() works #6937 Also start writing real integration tests. A FIXME now notes that ranges don't currently work well with tabular files, due to how we store the header. --- .../dataverse/api/DownloadInstanceWriter.java | 109 +++++++++------- .../edu/harvard/iq/dataverse/api/FilesIT.java | 116 ++++++++++++++++-- .../edu/harvard/iq/dataverse/api/UtilIT.java | 16 ++- 3 files changed, 187 insertions(+), 54 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index c3b3350a355..c7b22f0e980 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -81,28 +81,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] StorageIO storageIO = DataAccess.getStorageIO(dataFile, daReq); if (storageIO != null) { - List ranges = new ArrayList<>(); - long fileSize = storageIO.getDataFile().getFilesize(); try { - String range = null; - HttpHeaders headers = di.getRequestHttpHeaders(); - if (headers != null) { - range = headers.getHeaderString("Range"); - } - long offset = 0; - try { - ranges = getRanges(range, fileSize); - } catch (Exception ex) { - logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage()); - // The message starts with "Datafile" because otherwise the message is not passed - // to the user due to how WebApplicationExceptionHandler works. - throw new NotFoundException("Datafile download error due to Range header: " + ex.getLocalizedMessage()); - } - if (!ranges.isEmpty()) { - // For now we only support a single range. - offset = ranges.get(0).getStart(); - } - storageIO.setOffset(offset); storageIO.open(); } catch (IOException ioex) { //throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE); @@ -293,8 +272,6 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if ("original".equals(di.getConversionParamValue())) { logger.fine("stored original of an ingested file requested"); - // Get orginal file size before storageIO is reassigned to prevent an NPE. - fileSize = storageIO.getDataFile().getOriginalFileSize(); storageIO = StoredOriginalFile.retreive(storageIO); } else { // Other format conversions: @@ -425,7 +402,32 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] httpHeaders.add("Content-Type", mimeType + "; name=\"" + finalFileName + "\""); long contentSize; + // This is hard-coded to false and could probably be removed some day, + // but for now it's illustrative. boolean useChunkedTransfer = false; + + // User may have requested a range of bytes. + List ranges = new ArrayList<>(); + long fileSize = storageIO.getSize(); + String range = null; + HttpHeaders headers = di.getRequestHttpHeaders(); + if (headers != null) { + range = headers.getHeaderString("Range"); + } + long offset = 0; + try { + ranges = getRanges(range, fileSize); + } catch (Exception ex) { + logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage()); + // The message starts with "Datafile" because otherwise the message is not passed + // to the user due to how WebApplicationExceptionHandler works. + throw new NotFoundException("Datafile download error due to Range header: " + ex.getLocalizedMessage()); + } + if (!ranges.isEmpty()) { + // For now we only support a single range. + offset = ranges.get(0).getStart(); + } + storageIO.setOffset(offset); //if ((contentSize = getFileSize(di, storageIO.getVarHeader())) > 0) { if ((contentSize = getContentSize(storageIO)) > 0 && ranges.isEmpty()) { logger.fine("Content size (retrieved from the AccessObject): " + contentSize); @@ -436,6 +438,10 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] logger.fine("Content size (Range header in use): " + rangeContentSize); httpHeaders.add("Content-Length", rangeContentSize); } else { + // Neither a range request, nor do we know the size. Must be dynamically + // generated, such as a subsetting request. + // Note the chunk lines below are commented out because Payara will see + // that the header is not present and switch into chunk mode. //httpHeaders.add("Transfer-encoding", "chunked"); //useChunkedTransfer = true; } @@ -450,11 +456,15 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // any extra content, such as the variable header for the // subsettable files: if (storageIO.getVarHeader() != null) { + logger.fine("storageIO.getVarHeader().getBytes().length: " + storageIO.getVarHeader().getBytes().length); if (storageIO.getVarHeader().getBytes().length > 0) { if (useChunkedTransfer) { String chunkSizeLine = String.format("%x\r\n", storageIO.getVarHeader().getBytes().length); outstream.write(chunkSizeLine.getBytes()); } + // FIXME: check if ranges.isEmpty() here. These bytes, the header, is being written + // even if we are only requesting the last bytes of a file. So you see all or part + // of the header, even if you don't request it. outstream.write(storageIO.getVarHeader().getBytes()); if (useChunkedTransfer) { outstream.write(chunkClose); @@ -462,29 +472,42 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] } } - // Default to reading the whole file. We'll count down as we write. - long leftToRead = fileSize; - if (!ranges.isEmpty()) { - // Read a range of bytes instead of the whole file. - // For now we only support a single range. - leftToRead = ranges.get(0).getLength(); - } - while ((bufsize = instream.read(bffr)) != -1) { - if (useChunkedTransfer) { - String chunkSizeLine = String.format("%x\r\n", bufsize); - outstream.write(chunkSizeLine.getBytes()); - } - if ((leftToRead -= bufsize) > 0) { - // Just do a normal write. Potentially lots to go. Don't break. + // Dynamic streams, etc. Normal operation. No leftToRead. + if (ranges.isEmpty()) { + logger.fine("Normal, non-range request of file id " + dataFile.getId()); + while ((bufsize = instream.read(bffr)) != -1) { + if (useChunkedTransfer) { + String chunkSizeLine = String.format("%x\r\n", bufsize); + outstream.write(chunkSizeLine.getBytes()); + } outstream.write(bffr, 0, bufsize); - } else { - // Get those last bytes or bytes equal to bufsize. Last one. Then break. - outstream.write(bffr, 0, (int) leftToRead + bufsize); - break; + if (useChunkedTransfer) { + outstream.write(chunkClose); + } } - if (useChunkedTransfer) { - outstream.write(chunkClose); + } else { + logger.fine("Range request of file id " + dataFile.getId()); + // Read a range of bytes instead of the whole file. We'll count down as we write. + // For now we only support a single range. + long leftToRead = ranges.get(0).getLength(); + while ((bufsize = instream.read(bffr)) != -1) { + if (useChunkedTransfer) { + String chunkSizeLine = String.format("%x\r\n", bufsize); + outstream.write(chunkSizeLine.getBytes()); + } + if ((leftToRead -= bufsize) > 0) { + // Just do a normal write. Potentially lots to go. Don't break. + outstream.write(bffr, 0, bufsize); + } else { + // Get those last bytes or bytes equal to bufsize. Last one. Then break. + outstream.write(bffr, 0, (int) leftToRead + bufsize); + break; + } + if (useChunkedTransfer) { + outstream.write(chunkClose); + } } + } if (useChunkedTransfer) { diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index c824fbf7182..1d13559985b 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -11,8 +11,12 @@ import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.SystemConfig; +import java.io.File; +import java.io.IOException; import static java.lang.Thread.sleep; import java.math.BigDecimal; +import java.nio.file.Path; +import java.nio.file.Paths; import java.text.MessageFormat; import java.util.Arrays; import java.util.Collections; @@ -1589,30 +1593,40 @@ public void test2toNull() { Integer fileId = 0; fileId = localFile; String nullApiToken = null; + String nullFormat = null; + String nullThumbnail = null; String byteRange = null; byteRange = "2-"; // java.lang.IndexOutOfBoundsException: Range [0, 0 + -2) out of bounds for length 32768 - Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullApiToken); + Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullFormat, nullThumbnail, nullApiToken); downloadFile.prettyPrint(); } - @Ignore +// @Ignore @Test public void test0to100() { Integer localFile = 16; + Integer tabularLocalFile = 156; Integer s3file = 11; Integer fileId = 0; fileId = localFile; - fileId = s3file; +// fileId = s3file; + fileId = tabularLocalFile; String nullApiToken = null; + String format = null; + format = "original"; + String nullThumbnail = null; String byteRange = null; - byteRange = "0-9"; // first ten - byteRange = "-9"; // last ten - byteRange = "9-"; // last ten - Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullApiToken); +// byteRange = "0-9"; // first ten + byteRange = "0-150"; +// byteRange = "-9"; // last ten +// byteRange = "9-"; // last ten + Response downloadFile = UtilIT.downloadFile(fileId, byteRange, format, nullThumbnail, nullApiToken); downloadFile.prettyPrint(); } + // test Range header with thumbnails, etc. + @Ignore @Test public void testMultipleRanges() { @@ -1622,9 +1636,95 @@ public void testMultipleRanges() { fileId = localFile; // fileId = s3file; String nullApiToken = null; + String nullFormat = null; + String nullThumbnail = null; String byteRange = null; byteRange = "0-9,90-99"; - Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullApiToken); + Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullFormat, nullThumbnail, nullApiToken); downloadFile.prettyPrint(); } + + @Test + public void testRange() throws IOException { + + Response createUser = UtilIT.createRandomUser(); + createUser.prettyPrint(); + String authorUsername = UtilIT.getUsernameFromResponse(createUser); + String authorApiToken = UtilIT.getApiTokenFromResponse(createUser); + + Response createDataverse = UtilIT.createRandomDataverse(authorApiToken); + createDataverse.prettyPrint(); + createDataverse.then().assertThat() + .statusCode(CREATED.getStatusCode()); + String dataverseAlias = UtilIT.getAliasFromResponse(createDataverse); + + Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, authorApiToken); + createDataset.prettyPrint(); + createDataset.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset); + String datasetPid = JsonPath.from(createDataset.asString()).getString("data.persistentId"); + + Path pathToCsv = Paths.get(java.nio.file.Files.createTempDirectory(null) + File.separator + "data.csv"); + String contentOfCsv = "" + + "name,pounds,species\n" + + "Marshall,40,dog\n" + + "Tiger,17,cat\n" + + "Panther,21,cat\n"; + java.nio.file.Files.write(pathToCsv, contentOfCsv.getBytes()); + + Response uploadFileCsv = UtilIT.uploadFileViaNative(datasetId.toString(), pathToCsv.toString(), authorApiToken); + uploadFileCsv.prettyPrint(); + uploadFileCsv.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.files[0].label", equalTo("data.csv")); + + Integer fileIdCsv = JsonPath.from(uploadFileCsv.body().asString()).getInt("data.files[0].dataFile.id"); + + assertTrue("Failed test if Ingest Lock exceeds max duration " + pathToCsv, UtilIT.sleepForLock(datasetId.longValue(), "Ingest", authorApiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION)); + + + String byteRange = "0-9"; // first ten +// byteRange = "0-29"; // first thirty +// byteRange = "-40"; + String format = null; +// format = "original"; +// String nullApiToken = null; + Response downloadFile = UtilIT.downloadFile(fileIdCsv, byteRange, format, null, authorApiToken); + downloadFile.prettyPrint(); + + if (true) return; + Response uploadFile = UtilIT.uploadFile(datasetPid, "trees.zip", authorApiToken); + uploadFile.prettyPrint(); + +// Response uploadFile = UtilIT.uploadFileViaNative(datasetPid, apiToken, authorApiToken); + + Response getDatasetJson1 = UtilIT.nativeGetUsingPersistentId(datasetPid, authorApiToken); + getDatasetJson1.prettyPrint(); + Long fileIdPng = JsonPath.from(getDatasetJson1.getBody().asString()).getLong("data.latestVersion.files[1].dataFile.id"); + System.out.println("datafileId: " + fileIdPng); + getDatasetJson1.then().assertThat() + .statusCode(200); + + String trueOrWidthInPixels = "true"; + Response getFileThumbnailImageA = UtilIT.getFileThumbnail(fileIdPng.toString(), trueOrWidthInPixels, authorApiToken); + getFileThumbnailImageA.then().assertThat() + .contentType("image/png") + .statusCode(OK.getStatusCode()); + + Response publishDataverse = UtilIT.publishDataverseViaNativeApi(dataverseAlias, authorApiToken); + publishDataverse.then().assertThat().statusCode(OK.getStatusCode()); + Response publishDataset = UtilIT.publishDatasetViaNativeApi(datasetPid, "major", authorApiToken); + publishDataset.then().assertThat().statusCode(OK.getStatusCode()); + + // Yes, you can get a range of bytes from a thumbnail. + String imageThumbPixels = "true"; + Response downloadThumbnail = UtilIT.downloadFile(fileIdPng.intValue(), "0-149", null, imageThumbPixels, authorApiToken); + downloadThumbnail.prettyPrint(); + downloadThumbnail.then().assertThat().statusCode(OK.getStatusCode()); + + + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index a040c3506c3..94b47f250f3 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -721,14 +721,24 @@ static Response downloadFile(Integer fileId) { static Response downloadFile(Integer fileId, String apiToken) { String nullByteRange = null; - return downloadFile(fileId, nullByteRange, apiToken); + String nullFormat = null; + String nullImageThumb = null; + return downloadFile(fileId, nullByteRange, nullFormat, nullImageThumb, apiToken); } - static Response downloadFile(Integer fileId, String byteRange, String apiToken) { + static Response downloadFile(Integer fileId, String byteRange, String format, String imageThumb, String apiToken) { RequestSpecification requestSpecification = given(); if (byteRange != null) { requestSpecification.header("Range", "bytes=" + byteRange); } + String optionalFormat = ""; + if (format != null) { + optionalFormat = "&format=" + format; + } + String optionalImageThumb = ""; + if (format != null) { + optionalImageThumb = "&imageThumb=" + imageThumb; + } /** * Data Access API does not support X-Dataverse-key header - * https://github.com/IQSS/dataverse/issues/2662 @@ -736,7 +746,7 @@ static Response downloadFile(Integer fileId, String byteRange, String apiToken) * Actually, these days it does. We could switch. */ //.header(API_TOKEN_HTTP_HEADER, apiToken) - return requestSpecification.get("/api/access/datafile/" + fileId + "?key=" + apiToken); + return requestSpecification.get("/api/access/datafile/" + fileId + "?key=" + apiToken + optionalFormat + optionalImageThumb); } static Response downloadTabularFile(Integer fileId) { From adab5546bb88be7920bd49520995d4bf8784db40 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 4 Oct 2021 16:06:02 -0400 Subject: [PATCH 06/26] vary variable header writes by range request #6937 If a range is not requested, write the whole variable header line. Otherwise, make a reasonable effort to write what we can. --- .../dataverse/api/DownloadInstanceWriter.java | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index c7b22f0e980..f13da0bce1f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -36,6 +36,7 @@ import java.net.URISyntaxException; import java.net.URLEncoder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; @@ -462,10 +463,34 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] String chunkSizeLine = String.format("%x\r\n", storageIO.getVarHeader().getBytes().length); outstream.write(chunkSizeLine.getBytes()); } - // FIXME: check if ranges.isEmpty() here. These bytes, the header, is being written - // even if we are only requesting the last bytes of a file. So you see all or part - // of the header, even if you don't request it. - outstream.write(storageIO.getVarHeader().getBytes()); + // If a range is not being requested, let's call that the normal case. + // Write the entire line of variable headers. Later, the rest of the file + // will be written. + if (ranges.isEmpty()) { + outstream.write(storageIO.getVarHeader().getBytes()); + } else { + // Range requested. We should only write the header if the range specifies the + // start of the file or if the range includes some of the end of the header. + // For now we only support a single range. + Range range1 = ranges.get(0); + if (range1.getStart() == 0 & range1.getLength() >= storageIO.getVarHeader().getBytes().length) { + // Does the range requested include the whole length of the variable headers? + // If so, write those headers, if the range is the start of the file (index 0). + logger.fine("Writing entire variable header line."); + outstream.write(storageIO.getVarHeader().getBytes()); + } else if (range1.getStart() == 0 & range1.getLength() < storageIO.getVarHeader().getBytes().length) { + // Is the user requesting a small range near the front, so small that we can't + // even write the full line of variable headers? Ok, we'll just write as much + // as we can, within the range. + logger.fine("Writing this many bytes of the variable header line: " + range1.getLength()); + outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), 0, (int) range1.getLength())); + } else { + // TODO: Consider supporting weird scenarios: + // - The user is requesting the end or middle of the file and should end up with part or all of the variable headers. + // - ?? + logger.fine("A range was requested but didn't start with 0. Skip printing of variable headers."); + } + } if (useChunkedTransfer) { outstream.write(chunkClose); } From 4724e116a4f04bfd39707141c1d4565dc2502f28 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 5 Oct 2021 16:07:57 -0400 Subject: [PATCH 07/26] get offset working again #6937 Offset stopped working in 29ea256. Before that commit we were relying on setOffset() being called after open(). Now we can (and do) call setOffset() after open(). --- .../iq/dataverse/dataaccess/FileAccessIO.java | 1 - .../harvard/iq/dataverse/dataaccess/S3AccessIO.java | 2 -- .../harvard/iq/dataverse/dataaccess/StorageIO.java | 12 ++++++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java index 9c55caa3360..bd0549622f0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java @@ -509,7 +509,6 @@ public FileInputStream openLocalFileAsInputStream () { try { in = new FileInputStream(getFileSystemPath().toFile()); - in.skip(this.getOffset()); } catch (IOException ex) { // We don't particularly care what the reason why we have // failed to access the file was. diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 52bb2cd5895..80fbbd103e7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -282,8 +282,6 @@ public InputStream getInputStream() throws IOException { setChannel(Channels.newChannel(super.getInputStream())); - super.getInputStream().skip(super.getOffset()); - return super.getInputStream(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java index 0387561a075..b0e9648285c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java @@ -391,8 +391,16 @@ public void setSize(long s) { size = s; } - public void setOffset(long offset) { - this.offset = offset; + // open() has already been called. Now we can skip, if need be. + public void setOffset(long offset) throws IOException { + InputStream inputStream = getInputStream(); + if (inputStream != null) { + inputStream.skip(offset); + // The skip has already been done. Why not record it. + this.offset = offset; + } else { + throw new IOException("Could not skip into InputStream because it is null"); + } } public void setInputStream(InputStream is) { From f0f08776f14c79862cce6850835122f29154a88a Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 5 Oct 2021 16:23:46 -0400 Subject: [PATCH 08/26] delete old setOffset test (was for pre 4724e1) #6937 --- .../dataaccess/FileAccessIOTest.java | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java index ad31e4adee3..286fe8a6595 100644 --- a/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIOTest.java @@ -269,30 +269,6 @@ public void testOpenLocalFileAsInputStream() throws IOException { assertNotNull(datasetAccess.openLocalFileAsOutputStream()); } - /** - * Test of openLocalFileAsInputStream (with an offset) and - * openLocalFileAsOutputStream method, of class FileAccessIO. - * - * @throws java.io.IOException if test is broken - */ - @Test - public void testOpenLocalFileAsInputStreamWithOffset() throws IOException { - // Start reading a few bytes in. - datasetAccess.setOffset(11); - BufferedReader br = new BufferedReader(new InputStreamReader(datasetAccess.openLocalFileAsInputStream())); - StringBuilder sb = new StringBuilder(); - String line; - while ((line = br.readLine()) != null) { - sb.append(line); - sb.append('\n'); - } - // Put offset back. - datasetAccess.setOffset(0); - // Instead of "This is a test string" - assertEquals("est string\n", sb.toString()); - assertNotNull(datasetAccess.openLocalFileAsOutputStream()); - } - /** * Test of getAuxFileAsInputStream method, of class FileAccessIO. * From b02209d168e464049efb2989b487ceb4d5a5ea9a Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 5 Oct 2021 16:25:08 -0400 Subject: [PATCH 09/26] add more offset/range API tests #6937 --- .../edu/harvard/iq/dataverse/api/FilesIT.java | 130 +++++++++++++----- 1 file changed, 97 insertions(+), 33 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index 1d13559985b..ab40ed6118c 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -1602,7 +1602,7 @@ public void test2toNull() { downloadFile.prettyPrint(); } -// @Ignore + @Ignore @Test public void test0to100() { Integer localFile = 16; @@ -1648,24 +1648,73 @@ public void testMultipleRanges() { public void testRange() throws IOException { Response createUser = UtilIT.createRandomUser(); - createUser.prettyPrint(); +// createUser.prettyPrint(); String authorUsername = UtilIT.getUsernameFromResponse(createUser); String authorApiToken = UtilIT.getApiTokenFromResponse(createUser); Response createDataverse = UtilIT.createRandomDataverse(authorApiToken); - createDataverse.prettyPrint(); +// createDataverse.prettyPrint(); createDataverse.then().assertThat() .statusCode(CREATED.getStatusCode()); String dataverseAlias = UtilIT.getAliasFromResponse(createDataverse); Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, authorApiToken); - createDataset.prettyPrint(); +// createDataset.prettyPrint(); createDataset.then().assertThat() .statusCode(CREATED.getStatusCode()); Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset); String datasetPid = JsonPath.from(createDataset.asString()).getString("data.persistentId"); + Path pathToTxt = Paths.get(java.nio.file.Files.createTempDirectory(null) + File.separator + "file.txt"); + String contentOfTxt = "" + + "first is the worst\n" + + "second is the best\n" + + "third is the one with the hairy chest\n"; + java.nio.file.Files.write(pathToTxt, contentOfTxt.getBytes()); + + Response uploadFileTxt = UtilIT.uploadFileViaNative(datasetId.toString(), pathToTxt.toString(), authorApiToken); +// uploadFileTxt.prettyPrint(); + uploadFileTxt.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.files[0].label", equalTo("file.txt")); + + Integer fileIdTxt = JsonPath.from(uploadFileTxt.body().asString()).getInt("data.files[0].dataFile.id"); + + // Download the whole file. + Response downloadTxtNoArgs = UtilIT.downloadFile(fileIdTxt, null, null, null, authorApiToken); + downloadTxtNoArgs.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("first is the worst\n" + + "second is the best\n" + + "third is the one with the hairy chest\n")); + + // Download the first 10 bytes. + Response downloadTxtFirst10 = UtilIT.downloadFile(fileIdTxt, "0-9", null, null, authorApiToken); + downloadTxtFirst10.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("first is t")); + + // Download the last 6 bytes. + Response downloadTxtLast6 = UtilIT.downloadFile(fileIdTxt, "-6", null, null, authorApiToken); + downloadTxtLast6.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("chest\n")); + + // Download some bytes from the middle. + Response downloadTxtMiddle = UtilIT.downloadFile(fileIdTxt, "09-19", null, null, authorApiToken); + downloadTxtMiddle.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("the worst\ns")); + + // Skip the first 10 bytes and download the rest. + Response downloadTxtSkipFirst10 = UtilIT.downloadFile(fileIdTxt, "9-", null, null, authorApiToken); + downloadTxtSkipFirst10.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("the worst\n" + + "second is the best\n" + + "third is the one with the hairy chest\n")); + Path pathToCsv = Paths.get(java.nio.file.Files.createTempDirectory(null) + File.separator + "data.csv"); String contentOfCsv = "" + "name,pounds,species\n" @@ -1675,7 +1724,7 @@ public void testRange() throws IOException { java.nio.file.Files.write(pathToCsv, contentOfCsv.getBytes()); Response uploadFileCsv = UtilIT.uploadFileViaNative(datasetId.toString(), pathToCsv.toString(), authorApiToken); - uploadFileCsv.prettyPrint(); +// uploadFileCsv.prettyPrint(); uploadFileCsv.then().assertThat() .statusCode(OK.getStatusCode()) .body("data.files[0].label", equalTo("data.csv")); @@ -1684,28 +1733,44 @@ public void testRange() throws IOException { assertTrue("Failed test if Ingest Lock exceeds max duration " + pathToCsv, UtilIT.sleepForLock(datasetId.longValue(), "Ingest", authorApiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION)); - - String byteRange = "0-9"; // first ten -// byteRange = "0-29"; // first thirty -// byteRange = "-40"; - String format = null; -// format = "original"; -// String nullApiToken = null; - Response downloadFile = UtilIT.downloadFile(fileIdCsv, byteRange, format, null, authorApiToken); - downloadFile.prettyPrint(); + // Just the tabular file, not the original, no byte range. Vanilla. + Response downloadFileNoArgs = UtilIT.downloadFile(fileIdCsv, null, null, null, authorApiToken); + downloadFileNoArgs.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("name pounds species\n" + + "\"Marshall\" 40 \"dog\"\n" + + "\"Tiger\" 17 \"cat\"\n" + + "\"Panther\" 21 \"cat\"\n")); + + // Original version of tabular file (CSV in this case). + Response downloadFileOrig = UtilIT.downloadFile(fileIdCsv, null, "original", null, authorApiToken); + downloadFileOrig.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("name,pounds,species\n" + + "Marshall,40,dog\n" + + "Tiger,17,cat\n" + + "Panther,21,cat\n")); + + // first ten bytes + Response downloadFirstTen = UtilIT.downloadFile(fileIdCsv, "0-9", "original", null, authorApiToken); + downloadFirstTen.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("name,pound")); - if (true) return; - Response uploadFile = UtilIT.uploadFile(datasetPid, "trees.zip", authorApiToken); - uploadFile.prettyPrint(); - -// Response uploadFile = UtilIT.uploadFileViaNative(datasetPid, apiToken, authorApiToken); + // last ten bytes + Response downloadLastTen = UtilIT.downloadFile(fileIdCsv, "-10", "original", null, authorApiToken); + downloadLastTen.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("er,21,cat\n")); + + String pathToZipWithImage = "scripts/search/data/binary/trees.zip"; + Response uploadFileZipWithImage = UtilIT.uploadFileViaNative(datasetId.toString(), pathToZipWithImage, authorApiToken); +// uploadFileZipWithImage.prettyPrint(); + uploadFileZipWithImage.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.files[0].label", equalTo("trees.png")); - Response getDatasetJson1 = UtilIT.nativeGetUsingPersistentId(datasetPid, authorApiToken); - getDatasetJson1.prettyPrint(); - Long fileIdPng = JsonPath.from(getDatasetJson1.getBody().asString()).getLong("data.latestVersion.files[1].dataFile.id"); - System.out.println("datafileId: " + fileIdPng); - getDatasetJson1.then().assertThat() - .statusCode(200); + Integer fileIdPng = JsonPath.from(uploadFileZipWithImage.body().asString()).getInt("data.files[0].dataFile.id"); String trueOrWidthInPixels = "true"; Response getFileThumbnailImageA = UtilIT.getFileThumbnail(fileIdPng.toString(), trueOrWidthInPixels, authorApiToken); @@ -1713,17 +1778,16 @@ public void testRange() throws IOException { .contentType("image/png") .statusCode(OK.getStatusCode()); - Response publishDataverse = UtilIT.publishDataverseViaNativeApi(dataverseAlias, authorApiToken); - publishDataverse.then().assertThat().statusCode(OK.getStatusCode()); - Response publishDataset = UtilIT.publishDatasetViaNativeApi(datasetPid, "major", authorApiToken); - publishDataset.then().assertThat().statusCode(OK.getStatusCode()); - // Yes, you can get a range of bytes from a thumbnail. String imageThumbPixels = "true"; - Response downloadThumbnail = UtilIT.downloadFile(fileIdPng.intValue(), "0-149", null, imageThumbPixels, authorApiToken); - downloadThumbnail.prettyPrint(); + Response downloadThumbnail = UtilIT.downloadFile(fileIdPng, "0-149", null, imageThumbPixels, authorApiToken); +// downloadThumbnail.prettyPrint(); downloadThumbnail.then().assertThat().statusCode(OK.getStatusCode()); - + +// Response publishDataverse = UtilIT.publishDataverseViaNativeApi(dataverseAlias, authorApiToken); +// publishDataverse.then().assertThat().statusCode(OK.getStatusCode()); +// Response publishDataset = UtilIT.publishDatasetViaNativeApi(datasetPid, "major", authorApiToken); +// publishDataset.then().assertThat().statusCode(OK.getStatusCode()); } From 3d237a803e52d662130436f91bc8f7bf30360417 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 5 Oct 2021 16:35:32 -0400 Subject: [PATCH 10/26] add release note for HTTP Range header #6937 --- doc/release-notes/6937-range.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 doc/release-notes/6937-range.md diff --git a/doc/release-notes/6937-range.md b/doc/release-notes/6937-range.md new file mode 100644 index 00000000000..2ce3e30b9aa --- /dev/null +++ b/doc/release-notes/6937-range.md @@ -0,0 +1,10 @@ +### Support for HTTP "Range" Header for Partial File Downloads + +Dataverse now supports the HTTP "Range" header, which allows users to download parts of a file. Here are some examples: + +- `bytes=0-9` gets the first 10 bytes. +- `bytes=10-19` gets 20 bytes from the middle. +- `bytes=-10` gets the last 10 bytes. +- `bytes=9-` gets all bytes except the first 10. + +Only a single range is supported. For more information, see the [Data Access API](https://guides.dataverse.org/en/latest/api/dataaccess.html) section of the API Guide. From a57e6298efc71bd246e0a50809e90debdb4636b8 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 6 Oct 2021 10:07:26 -0400 Subject: [PATCH 11/26] try to make reviewdog happy by replacing tabs with \t #6937 --- src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index ab40ed6118c..fc9ae2b4fdc 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -1737,10 +1737,10 @@ public void testRange() throws IOException { Response downloadFileNoArgs = UtilIT.downloadFile(fileIdCsv, null, null, null, authorApiToken); downloadFileNoArgs.then().assertThat() .statusCode(OK.getStatusCode()) - .body(equalTo("name pounds species\n" - + "\"Marshall\" 40 \"dog\"\n" - + "\"Tiger\" 17 \"cat\"\n" - + "\"Panther\" 21 \"cat\"\n")); + .body(equalTo("name\tpounds\tspecies\n" + + "\"Marshall\"\t40\t\"dog\"\n" + + "\"Tiger\"\t17\t\"cat\"\n" + + "\"Panther\"\t21\t\"cat\"\n")); // Original version of tabular file (CSV in this case). Response downloadFileOrig = UtilIT.downloadFile(fileIdCsv, null, "original", null, authorApiToken); From e6233d8814f8905ebd08f5e604d0181e0766e546 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 6 Oct 2021 16:13:05 -0400 Subject: [PATCH 12/26] remove redundant ranges.isEmpty() check #6937 Just above `ranges` is initialized to `new ArrayList<>()`. --- .../dataverse/api/DownloadInstanceWriter.java | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index f13da0bce1f..ee27d14e409 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -696,36 +696,33 @@ public List getRanges(String range, long fileSize) { throw new RuntimeException("The format is bytes=- where start and end are optional."); } - if (ranges.isEmpty()) { - // The 6 is to remove "bytes=" - - for (String part : range.substring(6).split(",")) { - - long start = getRangeStart(part); - long end = getRangeEnd(part); - - if (start == -1) { - // start does not exist. Base start off of how many bytes from end. - start = fileSize - end; - end = fileSize - 1; - } else if (end == -1 || end > fileSize - 1) { - // Set end when it doesn't exist. - // Also, automatically set end to size of file if end is beyond - // the file size (rather than throwing an error). - end = fileSize - 1; - } - - if (start > end) { - throw new RuntimeException("Start is larger than end."); - } + // The 6 is to remove "bytes=" + for (String part : range.substring(6).split(",")) { + + long start = getRangeStart(part); + long end = getRangeEnd(part); + + if (start == -1) { + // start does not exist. Base start off of how many bytes from end. + start = fileSize - end; + end = fileSize - 1; + } else if (end == -1 || end > fileSize - 1) { + // Set end when it doesn't exist. + // Also, automatically set end to size of file if end is beyond + // the file size (rather than throwing an error). + end = fileSize - 1; + } - if (ranges.size() < 1) { - ranges.add(new Range(start, end)); - } else { - throw new RuntimeException("Only one range is allowed."); - } + if (start > end) { + throw new RuntimeException("Start is larger than end."); + } + if (ranges.size() < 1) { + ranges.add(new Range(start, end)); + } else { + throw new RuntimeException("Only one range is allowed."); } + } } From ed4ec86ad3ba3ecfd86a47f4a1209f0b38bc6041 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 7 Oct 2021 12:23:24 -0400 Subject: [PATCH 13/26] rearranged the handling of the "range requst on a tabular file" special case. (#6937) --- .../dataverse/api/DownloadInstanceWriter.java | 85 +++++++++++++------ 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index ee27d14e409..0ea67d5bccf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -416,6 +416,10 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] range = headers.getHeaderString("Range"); } long offset = 0; + long leftToRead = -1L; + // Moving the "left to read" var. here; - since we may need + // to start counting our range bytes outside the main .write() + // loop, if it's a tabular file with a header. try { ranges = getRanges(range, fileSize); } catch (Exception ex) { @@ -427,9 +431,9 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (!ranges.isEmpty()) { // For now we only support a single range. offset = ranges.get(0).getStart(); + leftToRead = ranges.get(0).getLength(); } - storageIO.setOffset(offset); - //if ((contentSize = getFileSize(di, storageIO.getVarHeader())) > 0) { + if ((contentSize = getContentSize(storageIO)) > 0 && ranges.isEmpty()) { logger.fine("Content size (retrieved from the AccessObject): " + contentSize); httpHeaders.add("Content-Length", contentSize); @@ -452,6 +456,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] int bufsize; byte[] bffr = new byte[4 * 8192]; byte[] chunkClose = "\r\n".getBytes(); + // before writing out any bytes from the input stream, flush // any extra content, such as the variable header for the @@ -459,36 +464,52 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (storageIO.getVarHeader() != null) { logger.fine("storageIO.getVarHeader().getBytes().length: " + storageIO.getVarHeader().getBytes().length); if (storageIO.getVarHeader().getBytes().length > 0) { - if (useChunkedTransfer) { - String chunkSizeLine = String.format("%x\r\n", storageIO.getVarHeader().getBytes().length); - outstream.write(chunkSizeLine.getBytes()); - } + //if (useChunkedTransfer) { + // String chunkSizeLine = String.format("%x\r\n", storageIO.getVarHeader().getBytes().length); + // outstream.write(chunkSizeLine.getBytes()); + //} // If a range is not being requested, let's call that the normal case. // Write the entire line of variable headers. Later, the rest of the file // will be written. if (ranges.isEmpty()) { + logger.info("writing the entire variable header"); outstream.write(storageIO.getVarHeader().getBytes()); } else { - // Range requested. We should only write the header if the range specifies the - // start of the file or if the range includes some of the end of the header. - // For now we only support a single range. - Range range1 = ranges.get(0); - if (range1.getStart() == 0 & range1.getLength() >= storageIO.getVarHeader().getBytes().length) { - // Does the range requested include the whole length of the variable headers? - // If so, write those headers, if the range is the start of the file (index 0). - logger.fine("Writing entire variable header line."); - outstream.write(storageIO.getVarHeader().getBytes()); - } else if (range1.getStart() == 0 & range1.getLength() < storageIO.getVarHeader().getBytes().length) { - // Is the user requesting a small range near the front, so small that we can't - // even write the full line of variable headers? Ok, we'll just write as much - // as we can, within the range. - logger.fine("Writing this many bytes of the variable header line: " + range1.getLength()); - outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), 0, (int) range1.getLength())); + // Range requested. Since the output stream of a + // tabular file is made up of the varHeader and the body of + // the physical file, we should assume that the requested + // range may span any portion of the combined stream. + // Thus we may or may not to write the header, or a + // portion thereof. + logger.info("Skipping the variable header completely."); + int headerLength = storageIO.getVarHeader().getBytes().length; + if (offset >= headerLength) { + // We can skip the entire header. + // All we need to do is adjust the byte offset + // in the physical file; the number of bytes + // left to write stays unchanged, since we haven't + // written anything. + offset -= storageIO.getVarHeader().getBytes().length; } else { - // TODO: Consider supporting weird scenarios: - // - The user is requesting the end or middle of the file and should end up with part or all of the variable headers. - // - ?? - logger.fine("A range was requested but didn't start with 0. Skip printing of variable headers."); + // We need to write some portion of the header; + // Once we are done, we may or may not still have + // some bytes left to write from the main physical file. + if (offset + leftToRead <= headerLength) { + // This is a more straightforward case - we just need to + // write a portion of the header, and then we are done! + logger.info("Writing this many bytes of the variable header line: " + leftToRead); + outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), (int)offset, (int)offset + (int)leftToRead)); + // set "left to read" to zero, indicating that we are done: + leftToRead = 0; + } else { + // write the requested portion of the header: + logger.info("Writing this many bytes of the variable header line: " + (headerLength - offset)); + outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), (int)offset, headerLength)); + // and adjust the file offset and remaining number of bytes accordingly: + leftToRead -= (headerLength - offset); + offset = 0; + } + } } if (useChunkedTransfer) { @@ -510,11 +531,19 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] outstream.write(chunkClose); } } - } else { - logger.fine("Range request of file id " + dataFile.getId()); + } else if (leftToRead > 0) { + // This is a range request, and we still have bytes to read + // (for a tabular file, we may have already written enough + // bytes from the variable header!) + storageIO.setOffset(offset); + // Thinking about it, we could just do instream.skip(offset) + // here... But I would like to have this offset functionality + // in StorageIO, for any future cases where we may not + // be able to do that on the stream directly -- L.A. + logger.info("Range request of file id " + dataFile.getId()); // Read a range of bytes instead of the whole file. We'll count down as we write. // For now we only support a single range. - long leftToRead = ranges.get(0).getLength(); + //long leftToRead = ranges.get(0).getLength(); while ((bufsize = instream.read(bffr)) != -1) { if (useChunkedTransfer) { String chunkSizeLine = String.format("%x\r\n", bufsize); From 3e8b193b3fa2971380dfd4be91e0cee33d639e4e Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 7 Oct 2021 12:41:26 -0400 Subject: [PATCH 14/26] minor/cosmetic change (#6937) --- .../edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 0ea67d5bccf..92bf78ae03e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -489,7 +489,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // in the physical file; the number of bytes // left to write stays unchanged, since we haven't // written anything. - offset -= storageIO.getVarHeader().getBytes().length; + offset -= headerLength; } else { // We need to write some portion of the header; // Once we are done, we may or may not still have @@ -539,7 +539,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // Thinking about it, we could just do instream.skip(offset) // here... But I would like to have this offset functionality // in StorageIO, for any future cases where we may not - // be able to do that on the stream directly -- L.A. + // be able to do that on the stream directly (?) -- L.A. logger.info("Range request of file id " + dataFile.getId()); // Read a range of bytes instead of the whole file. We'll count down as we write. // For now we only support a single range. From 12900b739cdc470dbc66b40a74072dd04d841e90 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 7 Oct 2021 12:45:23 -0400 Subject: [PATCH 15/26] couple more cosmetic changes in comments/logging (the .info() logging will need to be muted of course) (#6937) --- .../edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 92bf78ae03e..3baeb099c40 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -479,9 +479,8 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // tabular file is made up of the varHeader and the body of // the physical file, we should assume that the requested // range may span any portion of the combined stream. - // Thus we may or may not to write the header, or a + // Thus we may or may not have to write the header, or a // portion thereof. - logger.info("Skipping the variable header completely."); int headerLength = storageIO.getVarHeader().getBytes().length; if (offset >= headerLength) { // We can skip the entire header. @@ -489,6 +488,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // in the physical file; the number of bytes // left to write stays unchanged, since we haven't // written anything. + logger.info("Skipping the variable header completely."); offset -= headerLength; } else { // We need to write some portion of the header; From f6b4a06097f5840a3693a6f1ad39825208a0d03d Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 7 Oct 2021 13:38:32 -0400 Subject: [PATCH 16/26] add more range API tests #6937 downloadTabLast10 is commented out because it doesn't yet pass. --- .../edu/harvard/iq/dataverse/api/FilesIT.java | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index fc9ae2b4fdc..c951c5667c5 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -1742,9 +1742,41 @@ public void testRange() throws IOException { + "\"Tiger\"\t17\t\"cat\"\n" + "\"Panther\"\t21\t\"cat\"\n")); + // first 10 bytes of tabular format + Response downloadTabFirstTen = UtilIT.downloadFile(fileIdCsv, "0-9", null, null, authorApiToken); + downloadTabFirstTen.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("name\tpound")); + + // first 30 bytes of tabular format + Response downloadTabFirst30 = UtilIT.downloadFile(fileIdCsv, "0-29", null, null, authorApiToken); + downloadTabFirst30.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("name\tpounds\tspecies\n" + + "\"Marshall\"")); + +// // FIXME: This should be something like "er 21 cat" (the last line) like downloadOrigLastTen +// // FIXME: but right now it's showing "17 cat" (the second to last line). +// // last 10 bytes of tabular format +// Response downloadTabLast10 = UtilIT.downloadFile(fileIdCsv, "-10", null, null, authorApiToken); +// downloadTabLast10.then().assertThat() +// .statusCode(OK.getStatusCode()) +// .body(equalTo("er\"\t21\t\"cat\"\n")); + + Response downloadTabMiddleBytesHeader = UtilIT.downloadFile(fileIdCsv, "1-7", null, null, authorApiToken); + downloadTabMiddleBytesHeader.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("ame\tpou")); + + Response downloadTabMiddleBytesBody = UtilIT.downloadFile(fileIdCsv, "31-43", null, null, authorApiToken); + downloadTabMiddleBytesBody.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("40\t\"dog\"\n" + + "\"Tig")); + // Original version of tabular file (CSV in this case). - Response downloadFileOrig = UtilIT.downloadFile(fileIdCsv, null, "original", null, authorApiToken); - downloadFileOrig.then().assertThat() + Response downloadOrig = UtilIT.downloadFile(fileIdCsv, null, "original", null, authorApiToken); + downloadOrig.then().assertThat() .statusCode(OK.getStatusCode()) .body(equalTo("name,pounds,species\n" + "Marshall,40,dog\n" @@ -1752,17 +1784,23 @@ public void testRange() throws IOException { + "Panther,21,cat\n")); // first ten bytes - Response downloadFirstTen = UtilIT.downloadFile(fileIdCsv, "0-9", "original", null, authorApiToken); - downloadFirstTen.then().assertThat() + Response downloadOrigFirstTen = UtilIT.downloadFile(fileIdCsv, "0-9", "original", null, authorApiToken); + downloadOrigFirstTen.then().assertThat() .statusCode(OK.getStatusCode()) .body(equalTo("name,pound")); // last ten bytes - Response downloadLastTen = UtilIT.downloadFile(fileIdCsv, "-10", "original", null, authorApiToken); - downloadLastTen.then().assertThat() + Response downloadOrigLastTen = UtilIT.downloadFile(fileIdCsv, "-10", "original", null, authorApiToken); + downloadOrigLastTen.then().assertThat() .statusCode(OK.getStatusCode()) .body(equalTo("er,21,cat\n")); + // middle bytes + Response downloadOrigMiddle = UtilIT.downloadFile(fileIdCsv, "29-39", "original", null, authorApiToken); + downloadOrigMiddle.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("40,dog\nTige")); + String pathToZipWithImage = "scripts/search/data/binary/trees.zip"; Response uploadFileZipWithImage = UtilIT.uploadFileViaNative(datasetId.toString(), pathToZipWithImage, authorApiToken); // uploadFileZipWithImage.prettyPrint(); From 90786ce41cfecfadc730ef1b2a04aa67805bb377 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 7 Oct 2021 14:05:04 -0400 Subject: [PATCH 17/26] fix byte count in docs and release note #6937 --- doc/release-notes/6937-range.md | 2 +- doc/sphinx-guides/source/api/dataaccess.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/release-notes/6937-range.md b/doc/release-notes/6937-range.md index 2ce3e30b9aa..2c592d0c251 100644 --- a/doc/release-notes/6937-range.md +++ b/doc/release-notes/6937-range.md @@ -3,7 +3,7 @@ Dataverse now supports the HTTP "Range" header, which allows users to download parts of a file. Here are some examples: - `bytes=0-9` gets the first 10 bytes. -- `bytes=10-19` gets 20 bytes from the middle. +- `bytes=10-19` gets 10 bytes from the middle. - `bytes=-10` gets the last 10 bytes. - `bytes=9-` gets all bytes except the first 10. diff --git a/doc/sphinx-guides/source/api/dataaccess.rst b/doc/sphinx-guides/source/api/dataaccess.rst index 3973428a5a3..3f99740f267 100755 --- a/doc/sphinx-guides/source/api/dataaccess.rst +++ b/doc/sphinx-guides/source/api/dataaccess.rst @@ -140,7 +140,7 @@ Header Description Range Download a specified byte range. Examples: - ``bytes=0-9`` gets the first 10 bytes. - - ``bytes=10-19`` gets 20 bytes from the middle. + - ``bytes=10-19`` gets 10 bytes from the middle. - ``bytes=-10`` gets the last 10 bytes. - ``bytes=9-`` gets all bytes except the first 10. From a62da5dcfe740630b1060ba84697734505f41e55 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 7 Oct 2021 23:00:11 -0400 Subject: [PATCH 18/26] as agreed - some cleanup: removing the old, unused legacy code for creating chunked-encoded stream. (#6937) --- .../dataverse/api/DownloadInstanceWriter.java | 37 +------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 3baeb099c40..47a17e475d9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -403,10 +403,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] httpHeaders.add("Content-Type", mimeType + "; name=\"" + finalFileName + "\""); long contentSize; - // This is hard-coded to false and could probably be removed some day, - // but for now it's illustrative. - boolean useChunkedTransfer = false; - + // User may have requested a range of bytes. List ranges = new ArrayList<>(); long fileSize = storageIO.getSize(); @@ -445,18 +442,12 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] } else { // Neither a range request, nor do we know the size. Must be dynamically // generated, such as a subsetting request. - // Note the chunk lines below are commented out because Payara will see - // that the header is not present and switch into chunk mode. - //httpHeaders.add("Transfer-encoding", "chunked"); - //useChunkedTransfer = true; } // (the httpHeaders map must be modified *before* writing any // data in the output stream!) int bufsize; byte[] bffr = new byte[4 * 8192]; - byte[] chunkClose = "\r\n".getBytes(); - // before writing out any bytes from the input stream, flush // any extra content, such as the variable header for the @@ -464,10 +455,6 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (storageIO.getVarHeader() != null) { logger.fine("storageIO.getVarHeader().getBytes().length: " + storageIO.getVarHeader().getBytes().length); if (storageIO.getVarHeader().getBytes().length > 0) { - //if (useChunkedTransfer) { - // String chunkSizeLine = String.format("%x\r\n", storageIO.getVarHeader().getBytes().length); - // outstream.write(chunkSizeLine.getBytes()); - //} // If a range is not being requested, let's call that the normal case. // Write the entire line of variable headers. Later, the rest of the file // will be written. @@ -512,9 +499,6 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] } } - if (useChunkedTransfer) { - outstream.write(chunkClose); - } } } @@ -522,14 +506,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (ranges.isEmpty()) { logger.fine("Normal, non-range request of file id " + dataFile.getId()); while ((bufsize = instream.read(bffr)) != -1) { - if (useChunkedTransfer) { - String chunkSizeLine = String.format("%x\r\n", bufsize); - outstream.write(chunkSizeLine.getBytes()); - } outstream.write(bffr, 0, bufsize); - if (useChunkedTransfer) { - outstream.write(chunkClose); - } } } else if (leftToRead > 0) { // This is a range request, and we still have bytes to read @@ -545,10 +522,6 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // For now we only support a single range. //long leftToRead = ranges.get(0).getLength(); while ((bufsize = instream.read(bffr)) != -1) { - if (useChunkedTransfer) { - String chunkSizeLine = String.format("%x\r\n", bufsize); - outstream.write(chunkSizeLine.getBytes()); - } if ((leftToRead -= bufsize) > 0) { // Just do a normal write. Potentially lots to go. Don't break. outstream.write(bffr, 0, bufsize); @@ -557,18 +530,10 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] outstream.write(bffr, 0, (int) leftToRead + bufsize); break; } - if (useChunkedTransfer) { - outstream.write(chunkClose); - } } } - if (useChunkedTransfer) { - String chunkClosing = "0\r\n\r\n"; - outstream.write(chunkClosing.getBytes()); - } - logger.fine("di conversion param: " + di.getConversionParam() + ", value: " + di.getConversionParamValue()); // Downloads of thumbnail images (scaled down, low-res versions of graphic image files) and From 65f1704686395158010c50dcd1a329d9da81979a Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 7 Oct 2021 23:36:14 -0400 Subject: [PATCH 19/26] final (hopefully) rearrangement of case logic (#6937) --- .../dataverse/api/DownloadInstanceWriter.java | 78 ++++++++++--------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 47a17e475d9..1b5aeaef30c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -404,44 +404,52 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] long contentSize; - // User may have requested a range of bytes. + // User may have requested a rangeHeader of bytes. + // Ranges are only supported when the size of the content + // stream is known (i.e., it's not a dynamically generated + // stream. List ranges = new ArrayList<>(); - long fileSize = storageIO.getSize(); - String range = null; + String rangeHeader = null; HttpHeaders headers = di.getRequestHttpHeaders(); if (headers != null) { - range = headers.getHeaderString("Range"); + rangeHeader = headers.getHeaderString("Range"); } long offset = 0; long leftToRead = -1L; // Moving the "left to read" var. here; - since we may need - // to start counting our range bytes outside the main .write() + // to start counting our rangeHeader bytes outside the main .write() // loop, if it's a tabular file with a header. - try { - ranges = getRanges(range, fileSize); - } catch (Exception ex) { - logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage()); - // The message starts with "Datafile" because otherwise the message is not passed - // to the user due to how WebApplicationExceptionHandler works. - throw new NotFoundException("Datafile download error due to Range header: " + ex.getLocalizedMessage()); - } - if (!ranges.isEmpty()) { - // For now we only support a single range. - offset = ranges.get(0).getStart(); - leftToRead = ranges.get(0).getLength(); - } - if ((contentSize = getContentSize(storageIO)) > 0 && ranges.isEmpty()) { - logger.fine("Content size (retrieved from the AccessObject): " + contentSize); - httpHeaders.add("Content-Length", contentSize); - } else if (!ranges.isEmpty()) { - // For now we only support a single range. - long rangeContentSize = ranges.get(0).getLength(); - logger.fine("Content size (Range header in use): " + rangeContentSize); - httpHeaders.add("Content-Length", rangeContentSize); + if ((contentSize = getContentSize(storageIO)) > 0) { + try { + ranges = getRanges(rangeHeader, contentSize); + } catch (Exception ex) { + logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage()); + // The message starts with "Datafile" because otherwise the message is not passed + // to the user due to how WebApplicationExceptionHandler works. + throw new NotFoundException("Datafile download error due to Range header: " + ex.getLocalizedMessage()); + } + + if (ranges.isEmpty()) { + logger.fine("Content size (retrieved from the AccessObject): " + contentSize); + httpHeaders.add("Content-Length", contentSize); + } else { + // For now we only support a single rangeHeader. + long rangeContentSize = ranges.get(0).getLength(); + logger.fine("Content size (Range header in use): " + rangeContentSize); + httpHeaders.add("Content-Length", rangeContentSize); + + offset = ranges.get(0).getStart(); + leftToRead = rangeContentSize; + } } else { - // Neither a range request, nor do we know the size. Must be dynamically - // generated, such as a subsetting request. + // Content size unknown, must be a dynamically + // generated stream, such as a subsetting request. + // We do NOT want to support rangeHeader requests on such streams: + if (rangeHeader != null) { + throw new NotFoundException("Range headers are not supported on dynamically-generated content, such as tabular subsetting."); + } + } // (the httpHeaders map must be modified *before* writing any @@ -455,7 +463,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (storageIO.getVarHeader() != null) { logger.fine("storageIO.getVarHeader().getBytes().length: " + storageIO.getVarHeader().getBytes().length); if (storageIO.getVarHeader().getBytes().length > 0) { - // If a range is not being requested, let's call that the normal case. + // If a rangeHeader is not being requested, let's call that the normal case. // Write the entire line of variable headers. Later, the rest of the file // will be written. if (ranges.isEmpty()) { @@ -465,7 +473,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // Range requested. Since the output stream of a // tabular file is made up of the varHeader and the body of // the physical file, we should assume that the requested - // range may span any portion of the combined stream. + // rangeHeader may span any portion of the combined stream. // Thus we may or may not have to write the header, or a // portion thereof. int headerLength = storageIO.getVarHeader().getBytes().length; @@ -509,7 +517,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] outstream.write(bffr, 0, bufsize); } } else if (leftToRead > 0) { - // This is a range request, and we still have bytes to read + // This is a rangeHeader request, and we still have bytes to read // (for a tabular file, we may have already written enough // bytes from the variable header!) storageIO.setOffset(offset); @@ -518,8 +526,8 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // in StorageIO, for any future cases where we may not // be able to do that on the stream directly (?) -- L.A. logger.info("Range request of file id " + dataFile.getId()); - // Read a range of bytes instead of the whole file. We'll count down as we write. - // For now we only support a single range. + // Read a rangeHeader of bytes instead of the whole file. We'll count down as we write. + // For now we only support a single rangeHeader. //long leftToRead = ranges.get(0).getLength(); while ((bufsize = instream.read(bffr)) != -1) { if ((leftToRead -= bufsize) > 0) { @@ -678,14 +686,14 @@ private long getFileSize(DownloadInstance di, String extraHeader) { */ public List getRanges(String range, long fileSize) { // Inspired by https://gist.github.com/davinkevin/b97e39d7ce89198774b4 - // via https://stackoverflow.com/questions/28427339/how-to-implement-http-byte-range-requests-in-spring-mvc/28479001#28479001 + // via https://stackoverflow.com/questions/28427339/how-to-implement-http-byte-rangeHeader-requests-in-spring-mvc/28479001#28479001 List ranges = new ArrayList<>(); if (range != null) { logger.fine("Range header supplied: " + range); // Technically this regex supports multiple ranges. - // Below we have a check to enforce a single range. + // Below we have a check to enforce a single rangeHeader. if (!range.matches("^bytes=\\d*-\\d*(,\\d*-\\d*)*$")) { throw new RuntimeException("The format is bytes=- where start and end are optional."); } From cea1a140f9fa6beceb9e8148f33529e3b71950d0 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 8 Oct 2021 10:47:11 -0400 Subject: [PATCH 20/26] enable "last bytes from tabular" test (code now works) #6937 --- .../java/edu/harvard/iq/dataverse/api/FilesIT.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index c951c5667c5..66d720da411 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -1755,13 +1755,11 @@ public void testRange() throws IOException { .body(equalTo("name\tpounds\tspecies\n" + "\"Marshall\"")); -// // FIXME: This should be something like "er 21 cat" (the last line) like downloadOrigLastTen -// // FIXME: but right now it's showing "17 cat" (the second to last line). -// // last 10 bytes of tabular format -// Response downloadTabLast10 = UtilIT.downloadFile(fileIdCsv, "-10", null, null, authorApiToken); -// downloadTabLast10.then().assertThat() -// .statusCode(OK.getStatusCode()) -// .body(equalTo("er\"\t21\t\"cat\"\n")); + // last 16 bytes of tabular format + Response downloadTabLast16 = UtilIT.downloadFile(fileIdCsv, "-16", null, null, authorApiToken); + downloadTabLast16.then().assertThat() + .statusCode(OK.getStatusCode()) + .body(equalTo("nther\"\t21\t\"cat\"\n")); Response downloadTabMiddleBytesHeader = UtilIT.downloadFile(fileIdCsv, "1-7", null, null, authorApiToken); downloadTabMiddleBytesHeader.then().assertThat() From bdec095c2dbf9d576c71f2f7829a4b0f05f09c44 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 8 Oct 2021 10:48:03 -0400 Subject: [PATCH 21/26] reduce logging for range code #6937 --- .../iq/dataverse/api/DownloadInstanceWriter.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 1b5aeaef30c..4bf4bb92125 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -467,7 +467,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // Write the entire line of variable headers. Later, the rest of the file // will be written. if (ranges.isEmpty()) { - logger.info("writing the entire variable header"); + logger.fine("writing the entire variable header"); outstream.write(storageIO.getVarHeader().getBytes()); } else { // Range requested. Since the output stream of a @@ -483,7 +483,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // in the physical file; the number of bytes // left to write stays unchanged, since we haven't // written anything. - logger.info("Skipping the variable header completely."); + logger.fine("Skipping the variable header completely."); offset -= headerLength; } else { // We need to write some portion of the header; @@ -492,13 +492,13 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (offset + leftToRead <= headerLength) { // This is a more straightforward case - we just need to // write a portion of the header, and then we are done! - logger.info("Writing this many bytes of the variable header line: " + leftToRead); + logger.fine("Writing this many bytes of the variable header line: " + leftToRead); outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), (int)offset, (int)offset + (int)leftToRead)); // set "left to read" to zero, indicating that we are done: leftToRead = 0; } else { // write the requested portion of the header: - logger.info("Writing this many bytes of the variable header line: " + (headerLength - offset)); + logger.fine("Writing this many bytes of the variable header line: " + (headerLength - offset)); outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), (int)offset, headerLength)); // and adjust the file offset and remaining number of bytes accordingly: leftToRead -= (headerLength - offset); @@ -525,7 +525,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] // here... But I would like to have this offset functionality // in StorageIO, for any future cases where we may not // be able to do that on the stream directly (?) -- L.A. - logger.info("Range request of file id " + dataFile.getId()); + logger.fine("Range request of file id " + dataFile.getId()); // Read a rangeHeader of bytes instead of the whole file. We'll count down as we write. // For now we only support a single rangeHeader. //long leftToRead = ranges.get(0).getLength(); From b688d94c8439dd70ce0aeefbd5124d753c798159 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 8 Oct 2021 12:09:00 -0400 Subject: [PATCH 22/26] more tests and cleanup #6937 --- .../dataverse/api/DownloadInstanceWriter.java | 3 +- .../api/DownloadInstanceWriterTest.java | 13 ++++ .../edu/harvard/iq/dataverse/api/FilesIT.java | 69 +++---------------- 3 files changed, 24 insertions(+), 61 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index 4bf4bb92125..d64a6065bba 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -457,7 +457,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] int bufsize; byte[] bffr = new byte[4 * 8192]; - // before writing out any bytes from the input stream, flush + // Before writing out any bytes from the input stream, write // any extra content, such as the variable header for the // subsettable files: if (storageIO.getVarHeader() != null) { @@ -528,7 +528,6 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] logger.fine("Range request of file id " + dataFile.getId()); // Read a rangeHeader of bytes instead of the whole file. We'll count down as we write. // For now we only support a single rangeHeader. - //long leftToRead = ranges.get(0).getLength(); while ((bufsize = instream.read(bffr)) != -1) { if ((leftToRead -= bufsize) > 0) { // Just do a normal write. Potentially lots to go. Don't break. diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java b/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java index a59bc497c8b..56a94883e16 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java @@ -75,6 +75,19 @@ public void testGetRangeInvalidStartLargerThanEnd() { assertNotNull(expectedException); } + // Attempt to get invalid range (multiple ranges). + @Test + public void testGetRangeInvalidMultipleRanges() { + Exception expectedException = null; + try { + List ranges = diw.getRanges("bytes=0-9,90-99", 100); + } catch (Exception ex) { + System.out.println("exeption: " + ex); + expectedException = ex; + } + assertNotNull(expectedException); + } + // Test "junk" instead of "bytes=0-10" @Test public void testGetRangeInvalidJunk() { diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index 66d720da411..d76294f7a28 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -1585,65 +1585,6 @@ private void msgt(String m){ dashes(); msg(m); dashes(); } - @Ignore - @Test - public void test2toNull() { - Integer localFile = 15; - Integer s3file = 16; - Integer fileId = 0; - fileId = localFile; - String nullApiToken = null; - String nullFormat = null; - String nullThumbnail = null; - String byteRange = null; - byteRange = "2-"; - // java.lang.IndexOutOfBoundsException: Range [0, 0 + -2) out of bounds for length 32768 - Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullFormat, nullThumbnail, nullApiToken); - downloadFile.prettyPrint(); - } - - @Ignore - @Test - public void test0to100() { - Integer localFile = 16; - Integer tabularLocalFile = 156; - Integer s3file = 11; - Integer fileId = 0; - fileId = localFile; -// fileId = s3file; - fileId = tabularLocalFile; - String nullApiToken = null; - String format = null; - format = "original"; - String nullThumbnail = null; - String byteRange = null; -// byteRange = "0-9"; // first ten - byteRange = "0-150"; -// byteRange = "-9"; // last ten -// byteRange = "9-"; // last ten - Response downloadFile = UtilIT.downloadFile(fileId, byteRange, format, nullThumbnail, nullApiToken); - downloadFile.prettyPrint(); - } - - // test Range header with thumbnails, etc. - - @Ignore - @Test - public void testMultipleRanges() { - Integer localFile = 16; - Integer s3file = 11; - Integer fileId = 0; - fileId = localFile; -// fileId = s3file; - String nullApiToken = null; - String nullFormat = null; - String nullThumbnail = null; - String byteRange = null; - byteRange = "0-9,90-99"; - Response downloadFile = UtilIT.downloadFile(fileId, byteRange, nullFormat, nullThumbnail, nullApiToken); - downloadFile.prettyPrint(); - } - @Test public void testRange() throws IOException { @@ -1820,6 +1761,16 @@ public void testRange() throws IOException { // downloadThumbnail.prettyPrint(); downloadThumbnail.then().assertThat().statusCode(OK.getStatusCode()); + Response multipleRangesNotSupported = UtilIT.downloadFile(fileIdTxt, "0-9,20-29", null, null, authorApiToken); + // Datafile download error due to Range header: Only one range is allowed. +// multipleRangesNotSupported.prettyPrint(); + multipleRangesNotSupported.then().assertThat().statusCode(NOT_FOUND.getStatusCode()); + + Response startLargerThanEndError = UtilIT.downloadFile(fileIdTxt, "20-10", null, null, authorApiToken); + // Datafile download error due to Range header: Start is larger than end. + startLargerThanEndError.prettyPrint(); + startLargerThanEndError.then().assertThat().statusCode(NOT_FOUND.getStatusCode()); + // Response publishDataverse = UtilIT.publishDataverseViaNativeApi(dataverseAlias, authorApiToken); // publishDataverse.then().assertThat().statusCode(OK.getStatusCode()); // Response publishDataset = UtilIT.publishDatasetViaNativeApi(datasetPid, "major", authorApiToken); From eba6563854dba8698836d22c2142997910939b5b Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 14 Oct 2021 16:40:56 -0400 Subject: [PATCH 23/26] add curl example for Range header #6937 --- doc/sphinx-guides/source/api/dataaccess.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/doc/sphinx-guides/source/api/dataaccess.rst b/doc/sphinx-guides/source/api/dataaccess.rst index 3f99740f267..c22b1d8c442 100755 --- a/doc/sphinx-guides/source/api/dataaccess.rst +++ b/doc/sphinx-guides/source/api/dataaccess.rst @@ -147,6 +147,25 @@ Range Download a specified byte range. Examples: Only a single range is supported. The "If-Range" header is not supported. For more on the "Range" header, see https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests ============== =========== +Examples +~~~~~~~~ + +A curl example of using the ``Range`` header to download the first 10 bytes of a file using its file id (database id): + +.. code-block:: bash + + export SERVER_URL=https://demo.dataverse.org + export FILE_ID=42 + export RANGE=0-9 + + curl -H "Range:bytes=$RANGE" $SERVER_URL/api/access/datafile/$FILE_ID + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "Range:bytes=0-9" https://demo.dataverse.org/api/access/datafile/42 + Multiple File ("bundle") download --------------------------------- From 3024d6766725c6eab8f39215fc3bbad5939ced34 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 15 Oct 2021 11:19:23 -0400 Subject: [PATCH 24/26] clean up range header error handling #6937 Most importantly, this commit introduces using status code 416 (Range Not Satisfiable) when the range is invalid. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range says, "If the ranges are invalid, the server returns the 416 Range Not Satisfiable error." I also rearranged the logic around testing for a single range. Now it fails fast and reports that only one range is supported rather than potentially showing an error about the range being beyond the file size, for example. I also changed the error message "Start is larger than end" to "Start is larger than end or size of file" to more accurately reflect what the check is doing. Finally, I did some cleanup and added additional tests. --- .../dataverse/api/DownloadInstanceWriter.java | 23 ++++++++-------- .../api/DownloadInstanceWriterTest.java | 27 ++++++++++++++++--- .../edu/harvard/iq/dataverse/api/FilesIT.java | 16 +++++++---- 3 files changed, 46 insertions(+), 20 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index d64a6065bba..80b3f988953 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -41,6 +41,7 @@ import java.util.logging.Level; import java.util.logging.Logger; import javax.inject.Inject; +import javax.ws.rs.ClientErrorException; import javax.ws.rs.NotFoundException; import javax.ws.rs.RedirectionException; import javax.ws.rs.ServiceUnavailableException; @@ -425,9 +426,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] ranges = getRanges(rangeHeader, contentSize); } catch (Exception ex) { logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage()); - // The message starts with "Datafile" because otherwise the message is not passed - // to the user due to how WebApplicationExceptionHandler works. - throw new NotFoundException("Datafile download error due to Range header: " + ex.getLocalizedMessage()); + throw new ClientErrorException("Error due to Range header: " + ex.getLocalizedMessage(), Response.Status.REQUESTED_RANGE_NOT_SATISFIABLE); } if (ranges.isEmpty()) { @@ -692,13 +691,19 @@ public List getRanges(String range, long fileSize) { logger.fine("Range header supplied: " + range); // Technically this regex supports multiple ranges. - // Below we have a check to enforce a single rangeHeader. + // Below we have a check to enforce a single range. if (!range.matches("^bytes=\\d*-\\d*(,\\d*-\\d*)*$")) { throw new RuntimeException("The format is bytes=- where start and end are optional."); } // The 6 is to remove "bytes=" - for (String part : range.substring(6).split(",")) { + String[] parts = range.substring(6).split(","); + if (parts.length > 1) { + // Only allow a single range. + throw new RuntimeException("Only one range is allowed."); + } + // This loop is here in case we ever want to support multiple ranges. + for (String part : parts) { long start = getRangeStart(part); long end = getRangeEnd(part); @@ -715,14 +720,10 @@ public List getRanges(String range, long fileSize) { } if (start > end) { - throw new RuntimeException("Start is larger than end."); + throw new RuntimeException("Start is larger than end or size of file."); } - if (ranges.size() < 1) { - ranges.add(new Range(start, end)); - } else { - throw new RuntimeException("Only one range is allowed."); - } + ranges.add(new Range(start, end)); } } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java b/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java index 56a94883e16..6de52951077 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriterTest.java @@ -69,7 +69,8 @@ public void testGetRangeInvalidStartLargerThanEnd() { try { List ranges = diw.getRanges("bytes=20-10", 100); } catch (Exception ex) { - System.out.println("exeption: " + ex); + // "Start is larger than end or size of file." + System.out.println("exception: " + ex); expectedException = ex; } assertNotNull(expectedException); @@ -82,7 +83,23 @@ public void testGetRangeInvalidMultipleRanges() { try { List ranges = diw.getRanges("bytes=0-9,90-99", 100); } catch (Exception ex) { - System.out.println("exeption: " + ex); + // "Only one range is allowed." + System.out.println("exception: " + ex); + expectedException = ex; + } + assertNotNull(expectedException); + } + + // Attempt to get invalid range (multiple ranges, beyond file size). + @Test + public void testGetRangeInvalidMultipleRangesBeyondFileSize() { + Exception expectedException = null; + try { + List ranges = diw.getRanges("bytes=0-9,90-99", 40); + } catch (Exception ex) { + // "Only one range is allowed." + // We report the multiple ranges error before reporting the "beyond filesize" error. + System.out.println("exception: " + ex); expectedException = ex; } assertNotNull(expectedException); @@ -95,7 +112,8 @@ public void testGetRangeInvalidJunk() { try { List ranges = diw.getRanges("junk", 100); } catch (Exception ex) { - System.out.println("exeption: " + ex); + // "The format is bytes=- where start and end are optional." + System.out.println("exception: " + ex); expectedException = ex; } assertNotNull(expectedException); @@ -120,7 +138,8 @@ public void testGetRanges0to0and90toNull() { assertEquals(99, ranges.get(1).getEnd()); assertEquals(10, ranges.get(1).getLength()); } catch (Exception ex) { - System.out.println("exeption: " + ex); + // Only one range is allowed. + System.out.println("exception: " + ex); expectedException = ex; } assertNotNull(expectedException); diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index d76294f7a28..b82514ce083 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -31,6 +31,7 @@ import static javax.ws.rs.core.Response.Status.NOT_FOUND; import static javax.ws.rs.core.Response.Status.NO_CONTENT; import static javax.ws.rs.core.Response.Status.OK; +import static javax.ws.rs.core.Response.Status.REQUESTED_RANGE_NOT_SATISFIABLE; import static javax.ws.rs.core.Response.Status.UNAUTHORIZED; import static junit.framework.Assert.assertEquals; import org.hamcrest.CoreMatchers; @@ -1762,14 +1763,19 @@ public void testRange() throws IOException { downloadThumbnail.then().assertThat().statusCode(OK.getStatusCode()); Response multipleRangesNotSupported = UtilIT.downloadFile(fileIdTxt, "0-9,20-29", null, null, authorApiToken); - // Datafile download error due to Range header: Only one range is allowed. -// multipleRangesNotSupported.prettyPrint(); - multipleRangesNotSupported.then().assertThat().statusCode(NOT_FOUND.getStatusCode()); + // "Error due to Range header: Only one range is allowed." + multipleRangesNotSupported.prettyPrint(); + multipleRangesNotSupported.then().assertThat().statusCode(REQUESTED_RANGE_NOT_SATISFIABLE.getStatusCode()); Response startLargerThanEndError = UtilIT.downloadFile(fileIdTxt, "20-10", null, null, authorApiToken); - // Datafile download error due to Range header: Start is larger than end. + // "Error due to Range header: Start is larger than end or size of file." startLargerThanEndError.prettyPrint(); - startLargerThanEndError.then().assertThat().statusCode(NOT_FOUND.getStatusCode()); + startLargerThanEndError.then().assertThat().statusCode(REQUESTED_RANGE_NOT_SATISFIABLE.getStatusCode()); + + Response rangeBeyondFileSize = UtilIT.downloadFile(fileIdTxt, "88888-99999", null, null, authorApiToken); + // "Error due to Range header: Start is larger than end or size of file." + rangeBeyondFileSize.prettyPrint(); + rangeBeyondFileSize.then().assertThat().statusCode(REQUESTED_RANGE_NOT_SATISFIABLE.getStatusCode()); // Response publishDataverse = UtilIT.publishDataverseViaNativeApi(dataverseAlias, authorApiToken); // publishDataverse.then().assertThat().statusCode(OK.getStatusCode()); From 22d5ffb89188deab4b3ee3a89c1e5420ec5ea9b6 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 22 Oct 2021 16:25:46 -0400 Subject: [PATCH 25/26] move logic to getLength per code review #6937 --- .../java/edu/harvard/iq/dataverse/dataaccess/Range.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java index e07d6aaae94..2021219b1a9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java @@ -6,13 +6,10 @@ public class Range { private final long start; // Used to calculate the length. private final long end; - // Used to determine when to stop reading. - private final long length; public Range(long start, long end) { this.start = start; this.end = end; - this.length = end - start + 1; } public long getStart() { @@ -23,8 +20,9 @@ public long getEnd() { return end; } + // Used to determine when to stop reading. public long getLength() { - return length; + return end - start + 1; } } From 07ebf9a431880ac304bb75799bfd1afb77502184 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 8 Nov 2021 08:56:09 -0500 Subject: [PATCH 26/26] specify 5.9 as version #6937 --- doc/release-notes/6937-range.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release-notes/6937-range.md b/doc/release-notes/6937-range.md index 2c592d0c251..94a12ae704c 100644 --- a/doc/release-notes/6937-range.md +++ b/doc/release-notes/6937-range.md @@ -7,4 +7,4 @@ Dataverse now supports the HTTP "Range" header, which allows users to download p - `bytes=-10` gets the last 10 bytes. - `bytes=9-` gets all bytes except the first 10. -Only a single range is supported. For more information, see the [Data Access API](https://guides.dataverse.org/en/latest/api/dataaccess.html) section of the API Guide. +Only a single range is supported. For more information, see the [Data Access API](https://guides.dataverse.org/en/5.9/api/dataaccess.html) section of the API Guide.