IQSS · kcondon · Nov 8, 2021 · Sep 3, 2021 · Sep 8, 2021 · Sep 8, 2021
diff --git a/doc/release-notes/6937-range.md b/doc/release-notes/6937-range.md
@@ -0,0 +1,10 @@
+### Support for HTTP "Range" Header for Partial File Downloads
+
+Dataverse now supports the HTTP "Range" header, which allows users to download parts of a file. Here are some examples:
+
+- `bytes=0-9` gets the first 10 bytes.
+- `bytes=10-19` gets 10 bytes from the middle.
+- `bytes=-10` gets the last 10 bytes.
+- `bytes=9-` gets all bytes except the first 9.
+
+Only a single range is supported. For more information, see the [Data Access API](https://guides.dataverse.org/en/5.9/api/dataaccess.html) section of the API Guide.
diff --git a/doc/sphinx-guides/source/api/dataaccess.rst b/doc/sphinx-guides/source/api/dataaccess.rst
@@ -131,6 +131,41 @@ true            Generates a thumbnail image by rescaling to the default thumbnai
 ``N``           Rescales the image to ``N`` pixels wide. ``imageThumb=true`` and ``imageThumb=64`` are equivalent.
 ==============  ===========
 
+Headers:
+~~~~~~~~
+
+==============  ===========
+Header          Description
+==============  ===========
+Range           Download a specified byte range. Examples:
+
+                - ``bytes=0-9`` gets the first 10 bytes.
+                - ``bytes=10-19`` gets 10 bytes from the middle.
+                - ``bytes=-10`` gets the last 10 bytes.
+                - ``bytes=9-`` gets all bytes except the first 9.
+
+                Only a single range is supported. The "If-Range" header is not supported. For more on the "Range" header, see https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests
+==============  ===========
+
+Examples
+~~~~~~~~
+
+A curl example of using the ``Range`` header to download the first 10 bytes of a file using its file id (database id):
+
+.. code-block:: bash
+
+  export SERVER_URL=https://demo.dataverse.org
+  export FILE_ID=42
+  export RANGE=0-9
+
+  curl -H "Range:bytes=$RANGE" $SERVER_URL/api/access/datafile/$FILE_ID
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "Range:bytes=0-9" https://demo.dataverse.org/api/access/datafile/42
+
 Multiple File ("bundle") download
 ---------------------------------
 

diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java
@@ -36,13 +36,16 @@
 import java.net.URISyntaxException;
 import java.net.URLEncoder;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import javax.inject.Inject;
+import javax.ws.rs.ClientErrorException;
 import javax.ws.rs.NotFoundException;
 import javax.ws.rs.RedirectionException;
 import javax.ws.rs.ServiceUnavailableException;
+import javax.ws.rs.core.HttpHeaders;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
@@ -401,52 +404,140 @@ public void writeTo(DownloadInstance di, Class<?> clazz, Type type, Annotation[]
                         httpHeaders.add("Content-Type", mimeType + "; name=\"" + finalFileName + "\"");
 
                         long contentSize;
-                        boolean useChunkedTransfer = false;
-                        //if ((contentSize = getFileSize(di, storageIO.getVarHeader())) > 0) {
+
+                        // User may have requested a rangeHeader of bytes.
+                        // Ranges are only supported when the size of the content 
+                        // stream is known (i.e., it's not a dynamically generated
+                        // stream).
+                        List<Range> ranges = new ArrayList<>();
+                        String rangeHeader = null;
+                        HttpHeaders headers = di.getRequestHttpHeaders();
+                        if (headers != null) {
+                            rangeHeader = headers.getHeaderString("Range");
+                        }
+                        long offset = 0;
+                        long leftToRead = -1L; 
+                        // Moving the "left to read" var. here; - since we may need 
+                        // to start counting our rangeHeader bytes outside the main .write()
+                        // loop, if it's a tabular file with a header. 
+
                         if ((contentSize = getContentSize(storageIO)) > 0) {
-                            logger.fine("Content size (retrieved from the AccessObject): " + contentSize);
-                            httpHeaders.add("Content-Length", contentSize);
+                            try {
+                                ranges = getRanges(rangeHeader, contentSize);
+                            } catch (Exception ex) {
+                                logger.fine("Exception caught processing Range header: " + ex.getLocalizedMessage());
+                                throw new ClientErrorException("Error due to Range header: " + ex.getLocalizedMessage(), Response.Status.REQUESTED_RANGE_NOT_SATISFIABLE);
+                            }
+
+                            if (ranges.isEmpty()) {
+                                logger.fine("Content size (retrieved from the AccessObject): " + contentSize);
+                                httpHeaders.add("Content-Length", contentSize);
+                            } else  {
+                                // For now we only support a single rangeHeader.
+                                long rangeContentSize = ranges.get(0).getLength();
+                                logger.fine("Content size (Range header in use): " + rangeContentSize);
+                                httpHeaders.add("Content-Length", rangeContentSize);
+
+                                offset = ranges.get(0).getStart();
+                                leftToRead = rangeContentSize;
+                            }
                         } else {
-                            //httpHeaders.add("Transfer-encoding", "chunked");
-                            //useChunkedTransfer = true;
+                            // Content size unknown, must be a dynamically
+                            // generated stream, such as a subsetting request.
+                            // We do NOT want to support rangeHeader requests on such streams:
+                            if (rangeHeader != null) {
+                                throw new NotFoundException("Range headers are not supported on dynamically-generated content, such as tabular subsetting.");
+                            }
+
                         }
 
                         // (the httpHeaders map must be modified *before* writing any
                         // data in the output stream!)
                         int bufsize;
                         byte[] bffr = new byte[4 * 8192];
-                        byte[] chunkClose = "\r\n".getBytes();
 
-                        // before writing out any bytes from the input stream, flush
+                        // Before writing out any bytes from the input stream, write
                         // any extra content, such as the variable header for the 
                         // subsettable files: 
                         if (storageIO.getVarHeader() != null) {
+                            logger.fine("storageIO.getVarHeader().getBytes().length: " + storageIO.getVarHeader().getBytes().length);
                             if (storageIO.getVarHeader().getBytes().length > 0) {
-                                if (useChunkedTransfer) {
-                                    String chunkSizeLine = String.format("%x\r\n", storageIO.getVarHeader().getBytes().length);
-                                    outstream.write(chunkSizeLine.getBytes());
-                                }
-                                outstream.write(storageIO.getVarHeader().getBytes());
-                                if (useChunkedTransfer) {
-                                    outstream.write(chunkClose);
+                                // If a rangeHeader is not being requested, let's call that the normal case.
+                                // Write the entire line of variable headers. Later, the rest of the file
+                                // will be written.
+                                if (ranges.isEmpty()) {
+                                    logger.fine("writing the entire variable header");
+                                    outstream.write(storageIO.getVarHeader().getBytes());
+                                } else {
+                                    // Range requested. Since the output stream of a 
+                                    // tabular file is made up of the varHeader and the body of 
+                                    // the physical file, we should assume that the requested 
+                                    // rangeHeader may span any portion of the combined stream.
+                                    // Thus we may or may not have to write the header, or a 
+                                    // portion thereof. 
+                                    int headerLength = storageIO.getVarHeader().getBytes().length;
+                                    if (offset >= headerLength) {
+                                        // We can skip the entire header. 
+                                        // All we need to do is adjust the byte offset 
+                                        // in the physical file; the number of bytes
+                                        // left to write stays unchanged, since we haven't
+                                        // written anything.
+                                        logger.fine("Skipping the variable header completely.");
+                                        offset -= headerLength;
+                                    } else {
+                                        // We need to write some portion of the header; 
+                                        // Once we are done, we may or may not still have 
+                                        // some bytes left to write from the main physical file.
+                                        if (offset + leftToRead <= headerLength) {
+                                            // This is a more straightforward case - we just need to 
+                                            // write a portion of the header, and then we are done!
+                                            logger.fine("Writing this many bytes of the variable header line: " + leftToRead);
+                                            outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), (int)offset, (int)offset + (int)leftToRead));
+                                            // set "left to read" to zero, indicating that we are done:
+                                            leftToRead = 0; 
+                                        } else {
+                                            // write the requested portion of the header:
+                                            logger.fine("Writing this many bytes of the variable header line: " + (headerLength - offset));
+                                            outstream.write(Arrays.copyOfRange(storageIO.getVarHeader().getBytes(), (int)offset, headerLength));
+                                            // and adjust the file offset and remaining number of bytes accordingly: 
+                                            leftToRead -= (headerLength - offset);
+                                            offset = 0;
+                                        }
+
+                                    }
                                 }
                             }
                         }
 
-                        while ((bufsize = instream.read(bffr)) != -1) {
-                            if (useChunkedTransfer) {
-                                String chunkSizeLine = String.format("%x\r\n", bufsize);
-                                outstream.write(chunkSizeLine.getBytes());
+                        // Dynamic streams, etc. Normal operation. No leftToRead.
+                        if (ranges.isEmpty()) {
+                            logger.fine("Normal, non-range request of file id " + dataFile.getId());
+                            while ((bufsize = instream.read(bffr)) != -1) {
+                                outstream.write(bffr, 0, bufsize);
                             }
-                            outstream.write(bffr, 0, bufsize);
-                            if (useChunkedTransfer) {
-                                outstream.write(chunkClose);
+                        } else if (leftToRead > 0) {
+                            // This is a rangeHeader request, and we still have bytes to read 
+                            // (for a tabular file, we may have already written enough
+                            // bytes from the variable header!)
+                            storageIO.setOffset(offset);
+                            // Thinking about it, we could just do instream.skip(offset) 
+                            // here... But I would like to have this offset functionality 
+                            // in StorageIO, for any future cases where we may not 
+                            // be able to do that on the stream directly (?) -- L.A.
+                            logger.fine("Range request of file id " + dataFile.getId());
+                            // Read a rangeHeader of bytes instead of the whole file. We'll count down as we write.
+                            // For now we only support a single rangeHeader.
+                            while ((bufsize = instream.read(bffr)) != -1) {
+                                if ((leftToRead -= bufsize) > 0) {
+                                    // Just do a normal write. Potentially lots to go. Don't break.
+                                    outstream.write(bffr, 0, bufsize);
+                                } else {
+                                    // Get those last bytes or bytes equal to bufsize. Last one. Then break.
+                                    outstream.write(bffr, 0, (int) leftToRead + bufsize);
+                                    break;
+                                }
                             }
-                        }
 
-                        if (useChunkedTransfer) {
-                            String chunkClosing = "0\r\n\r\n";
-                            outstream.write(chunkClosing.getBytes());
                         }
 
                         logger.fine("di conversion param: " + di.getConversionParam() + ", value: " + di.getConversionParamValue());
@@ -585,4 +676,77 @@ private long getFileSize(DownloadInstance di, String extraHeader) {
         }
         return -1;
     }
+
+    /**
+     * Parses the value of an HTTP "Range" header into a list of byte ranges.
+     *
+     * @param range "bytes=0-10" for example. Found in the "Range" HTTP header.
+     * May be null, in which case an empty list is returned.
+     * @param fileSize File size in bytes.
+     * @return The requested ranges (at most one, since only a single range is
+     * supported), or an empty list if no Range header was supplied.
+     * @throws RuntimeException on any problems processing the Range header.
+     */
+    public List<Range> getRanges(String range, long fileSize) {
+        // Inspired by https://gist.github.com/davinkevin/b97e39d7ce89198774b4
+        // via https://stackoverflow.com/questions/28427339/how-to-implement-http-byte-rangeHeader-requests-in-spring-mvc/28479001#28479001
+        List<Range> ranges = new ArrayList<>();
+
+        if (range != null) {
+            logger.fine("Range header supplied: " + range);
+
+            // Technically this regex supports multiple ranges.
+            // Below we have a check to enforce a single range.
+            if (!range.matches("^bytes=\\d*-\\d*(,\\d*-\\d*)*$")) {
+                throw new RuntimeException("The format is bytes=<range-start>-<range-end> where start and end are optional.");
+            }
+
+            // The 6 is to remove "bytes="
+            String[] parts = range.substring(6).split(",");
+            if (parts.length > 1) {
+                // Only allow a single range.
+                throw new RuntimeException("Only one range is allowed.");
+            }
+            // This loop is here in case we ever want to support multiple ranges.
+            for (String part : parts) {
+
+                long start = getRangeStart(part);
+                long end = getRangeEnd(part);
+
+                if (start == -1 && end == -1) {
+                    // "bytes=-" passes the regex but selects nothing.
+                    throw new RuntimeException("The range must specify a start, an end, or both.");
+                }
+
+                if (start == -1) {
+                    // Suffix range ("-N"): the last N bytes of the file. If N is
+                    // larger than the file, the whole file is selected, so the
+                    // start is clamped at zero instead of going negative.
+                    start = Math.max(0L, fileSize - end);
+                    end = fileSize - 1;
+                } else if (end == -1 || end > fileSize - 1) {
+                    // Set end when it doesn't exist.
+                    // Also, automatically set end to size of file if end is beyond
+                    // the file size (rather than throwing an error).
+                    end = fileSize - 1;
+                }
+
+                // Also catches a start at or beyond the end of the file, and
+                // a zero-length suffix ("-0").
+                if (start > end) {
+                    throw new RuntimeException("Start is larger than end or size of file.");
+                }
+
+                ranges.add(new Range(start, end));
+
+            }
+        }
+
+        return ranges;
+    }
+
+    /**
+     * Extracts the first-byte position from a single range spec such as
+     * "10-19".
+     *
+     * @return The number before the "-", or -1 if nothing precedes the "-".
+     */
+    public static long getRangeStart(String part) {
+        int dash = part.indexOf('-');
+        // A leading "-" means there is no start, as in the suffix form "-10".
+        if (dash == 0) {
+            return -1;
+        }
+        return Long.parseLong(part.substring(0, dash));
+    }
+
+    /**
+     * Extracts the last-byte position from a single range spec such as
+     * "10-19".
+     *
+     * @return The number after the "-", or -1 if nothing follows the "-".
+     */
+    public static long getRangeEnd(String part) {
+        String digits = part.substring(part.indexOf('-') + 1);
+        // Nothing after the "-" means an open-ended range, as in "9-".
+        if (digits.isEmpty()) {
+            return -1;
+        }
+        return Long.parseLong(digits);
+    }
+
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/Range.java
@@ -0,0 +1,28 @@
+package edu.harvard.iq.dataverse.dataaccess;
+
+public class Range {
+
+    /** First byte of the range; doubles as the offset to skip to in the file. */
+    private final long firstByte;
+    /** Last byte of the range (inclusive); needed to work out the length. */
+    private final long lastByte;
+
+    /**
+     * @param start position of the first byte of the range
+     * @param end position of the last byte of the range, inclusive
+     */
+    public Range(long start, long end) {
+        firstByte = start;
+        lastByte = end;
+    }
+
+    public long getStart() {
+        return firstByte;
+    }
+
+    public long getEnd() {
+        return lastByte;
+    }
+
+    /**
+     * @return the number of bytes the range spans, counting both ends; used
+     * to determine when to stop reading
+     */
+    public long getLength() {
+        long span = lastByte - firstByte;
+        return span + 1;
+    }
+
+}
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java
@@ -191,6 +191,12 @@ public boolean canWrite() {
 
     /*private int status;*/
     private long size;
+
+    /**
+     * Where in the file to seek to when reading (default is zero bytes, the
+     * start of the file).
+     */
+    private long offset;
 
     private String mimeType;
     private String fileName;
@@ -272,6 +278,10 @@ public long getSize() {
         return size;
     }
 
+    /**
+     * @return the value of the offset field, which {@link #setOffset(long)}
+     * records after skipping into the input stream
+     */
+    public long getOffset() {
+        return this.offset;
+    }
+
     public InputStream getInputStream() throws IOException {
         return in;
     }
@@ -381,6 +391,18 @@ public void setSize(long s) {
         size = s;
     }
 
+    /**
+     * Advances the input stream by the given number of bytes and records the
+     * offset. open() has already been called, so the stream is expected to
+     * exist.
+     *
+     * @param offset number of bytes to skip from the stream's current position
+     * @throws IOException if the input stream is null, or if the stream ends
+     * before the requested number of bytes could be skipped
+     */
+    public void setOffset(long offset) throws IOException {
+        InputStream inputStream = getInputStream();
+        if (inputStream == null) {
+            throw new IOException("Could not skip into InputStream because it is null");
+        }
+
+        // InputStream.skip() is allowed to skip fewer bytes than requested
+        // (even zero), so keep going until the whole offset is consumed.
+        long remaining = offset;
+        while (remaining > 0) {
+            long skipped = inputStream.skip(remaining);
+            if (skipped <= 0) {
+                // No progress: a one-byte read distinguishes end of stream
+                // from a stream that merely declined to skip.
+                if (inputStream.read() == -1) {
+                    throw new IOException("Could not skip " + offset + " bytes into InputStream: end of stream reached after " + (offset - remaining) + " bytes");
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+        // The skip has been done in full; record it.
+        this.offset = offset;
+    }
+
     public void setInputStream(InputStream is) {
         in = is;
     }