From 50bebcc95344eb7b6a5936439598b656fa3ad7f9 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sat, 26 Oct 2024 23:08:50 -0700 Subject: [PATCH 01/18] add Readonly document and change source op --- .../executor/SourceOperatorExecutor.scala | 17 --- .../common/storage/DatasetFileDocument.scala | 24 ++-- .../storage/ReadonlyLocalFileDocument.scala | 67 +++++++++++ .../storage/ReadonlyVirtualDocument.scala | 64 +++++++++++ .../common/storage/VirtualDocument.scala | 2 +- .../user/dataset/DatasetResource.scala | 92 ---------------- .../common/storage/FileResolver.scala | 104 ++++++++++++++++-- .../source/scan/FileScanSourceOpDesc.scala | 4 +- .../source/scan/FileScanSourceOpExec.scala | 7 +- .../source/scan/ScanSourceOpDesc.scala | 18 +-- .../source/scan/csv/CSVScanSourceOpDesc.scala | 14 +-- .../source/scan/csv/CSVScanSourceOpExec.scala | 7 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 20 +--- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 23 +--- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 14 +-- .../scan/json/JSONLScanSourceOpDesc.scala | 8 +- .../scan/json/JSONLScanSourceOpExec.scala | 7 +- .../scan/text/FileScanSourceOpDescSpec.scala | 18 +-- 18 files changed, 287 insertions(+), 223 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala index baff229db0b..d4c92b19a2a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala @@ -25,21 +25,4 @@ trait SourceOperatorExecutor extends OperatorExecutor { // We should move this to 
onFinishAllPorts later. produceTuple().map(t => (t, Option.empty)) } - - // this function create the input stream accordingly: - // - if filePath is set, create the stream from the file - // - if fileDesc is set, create the stream via JGit call - def createInputStream(filePath: String, datasetFileDocument: DatasetFileDocument): InputStream = { - if (filePath != null && datasetFileDocument != null) { - throw new RuntimeException( - "File Path and Dataset File Descriptor cannot present at the same time." - ) - } - if (filePath != null) { - new FileInputStream(filePath) - } else { - // create stream from dataset file desc - datasetFileDocument.asInputStream() - } - } } diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index bb5ef6b8e2b..3f013825ccb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -1,16 +1,15 @@ package edu.uci.ics.amber.engine.common.storage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils +import org.jooq.types.UInteger -import java.io.{File, InputStream, FileOutputStream} +import java.io.{File, FileOutputStream, InputStream} import java.net.URI import java.nio.file.{Files, Path} -class DatasetFileDocument(fileFullPath: Path) extends VirtualDocument[Nothing] { - - private val (_, dataset, datasetVersion, fileRelativePath) = - DatasetResource.resolvePath(fileFullPath, shouldContainFile = true) - +class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) extends VirtualDocument[Nothing] { private var tempFile: 
Option[File] = None override def getURI: URI = @@ -19,12 +18,13 @@ class DatasetFileDocument(fileFullPath: Path) extends VirtualDocument[Nothing] { ) override def asInputStream(): InputStream = { - fileRelativePath match { - case Some(path) => - DatasetResource.getDatasetFile(dataset.getDid, datasetVersion.getDvid, path) - case None => - throw new IllegalArgumentException("File relative path is missing.") - } + val datasetAbsolutePath = PathUtils.getDatasetPath(UInteger.valueOf(did)) + GitVersionControlLocalFileStorage + .retrieveFileContentOfVersionAsInputStream( + datasetAbsolutePath, + datasetVersionHash, + datasetAbsolutePath.resolve(fileRelativePath) + ) } override def asFile(): File = { diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala new file mode 100644 index 00000000000..86873e2c525 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala @@ -0,0 +1,67 @@ +package edu.uci.ics.amber.engine.common.storage + +import java.io.{File, FileInputStream, InputStream} +import java.net.URI +import java.nio.file.Path + +/** + * ReadonlyLocalFileDocument provides a read-only abstraction over a local file. + * Implements ReadonlyVirtualDocument without requiring a specific data type T. + * Unsupported methods throw NotImplementedError. + */ +class ReadonlyLocalFileDocument(uri: URI) extends ReadonlyVirtualDocument[Nothing] { + + /** + * Get the URI of the corresponding document. + * @return the URI of the document + */ + override def getURI: URI = uri + + /** + * Get the file as an input stream for read operations. + * @return InputStream to read from the file + */ + override def asInputStream(): InputStream = new FileInputStream(new File(uri)) + + /** + * Get the file as an input stream for read operations. 
+ * + * @return InputStream to read from the file + */ + override def asFile(): File = new File(uri) + + /** + * Find ith item and return. + * For this implementation, items are unsupported, so this method is unimplemented. + */ + override def getItem(i: Int): Nothing = + throw new NotImplementedError("getItem is not supported for ReadonlyLocalFileDocument") + + /** + * Get an iterator that iterates over all indexed items. + * Unsupported in ReadonlyLocalFileDocument. + */ + override def get(): Iterator[Nothing] = + throw new NotImplementedError("get is not supported for ReadonlyLocalFileDocument") + + /** + * Get an iterator of a sequence from index `from` to `until`. + * Unsupported in ReadonlyLocalFileDocument. + */ + override def getRange(from: Int, until: Int): Iterator[Nothing] = + throw new NotImplementedError("getRange is not supported for ReadonlyLocalFileDocument") + + /** + * Get an iterator of all items after the specified index `offset`. + * Unsupported in ReadonlyLocalFileDocument. + */ + override def getAfter(offset: Int): Iterator[Nothing] = + throw new NotImplementedError("getAfter is not supported for ReadonlyLocalFileDocument") + + /** + * Get the count of items in the document. + * Unsupported in ReadonlyLocalFileDocument. 
+ */ + override def getCount: Long = + throw new NotImplementedError("getCount is not supported for ReadonlyLocalFileDocument") +} \ No newline at end of file diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala new file mode 100644 index 00000000000..e9df984f036 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala @@ -0,0 +1,64 @@ +package edu.uci.ics.amber.engine.common.storage + +import java.io.{File, InputStream} +import java.net.URI + +/** + * ReadonlyVirtualDocument provides an abstraction for read operations over a single resource. + * This trait can be implemented by resources that only need to support read-related functionality. + * @tparam T the type of data that can use index to read. + */ +trait ReadonlyVirtualDocument[T] { + + /** + * Get the URI of the corresponding document. + * @return the URI of the document + */ + def getURI: URI + + /** + * Find ith item and return. + * @param i index starting from 0 + * @return data item of type T + */ + def getItem(i: Int): T + + /** + * Get an iterator that iterates over all indexed items. + * @return an iterator that returns data items of type T + */ + def get(): Iterator[T] + + /** + * Get an iterator of a sequence starting from index `from`, until index `until`. + * @param from the starting index (inclusive) + * @param until the ending index (exclusive) + * @return an iterator that returns data items of type T + */ + def getRange(from: Int, until: Int): Iterator[T] + + /** + * Get an iterator of all items after the specified index `offset`. + * @param offset the starting index (exclusive) + * @return an iterator that returns data items of type T + */ + def getAfter(offset: Int): Iterator[T] + + /** + * Get the count of items in the document. 
+ * @return the count of items + */ + def getCount: Long + + /** + * Convert document to an input stream. + * @return the input stream + */ + def asInputStream(): InputStream + + /** + * Convert document to an file + */ + + def asFile(): File +} \ No newline at end of file diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index 162de066543..f590f5dfcb3 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -9,7 +9,7 @@ import java.net.URI * e.g. for dataset file, supports for read/write using file stream are essential, whereas read & write using index are hard to support and are semantically meaningless * @tparam T the type of data that can use index to read and write. */ -abstract class VirtualDocument[T] { +abstract class VirtualDocument[T] extends ReadonlyVirtualDocument[T] { /** * get the URI of corresponding document diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index fb81e62d011..3521c6b3411 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -135,33 +135,6 @@ object DatasetResource { dataset } - private def getDatasetByName( - ctx: DSLContext, - ownerEmail: String, - datasetName: String - ): Dataset = { - ctx - .select(DATASET.fields: _*) - .from(DATASET) - .leftJoin(USER) - .on(USER.UID.eq(DATASET.OWNER_UID)) - .where(USER.EMAIL.eq(ownerEmail)) - .and(DATASET.NAME.eq(datasetName)) - .fetchOneInto(classOf[Dataset]) - } - - private def 
getDatasetVersionByName( - ctx: DSLContext, - did: UInteger, - versionName: String - ): DatasetVersion = { - ctx - .selectFrom(DATASET_VERSION) - .where(DATASET_VERSION.DID.eq(did)) - .and(DATASET_VERSION.NAME.eq(versionName)) - .fetchOneInto(classOf[DatasetVersion]) - } - // this function retrieve the version hash identified by dvid and did // read access will be checked private def getDatasetVersionByID( @@ -176,56 +149,6 @@ object DatasetResource { version } - // @param shouldContainFile a boolean flag indicating whether the path includes a fileRelativePath - // when shouldContainFile is true, user given path is /ownerEmail/datasetName/versionName/fileRelativePath - // e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv - // ownerName is bob@texera.com; datasetName is twitterDataset, versionName is v1, fileRelativePath is california/irvine/tw1.csv - // when shouldContainFile is false, user given path is /ownerEmail/datasetName/versionName - // e.g. /bob@texera.com/twitterDataset/v1 - // ownerName is bob@texera.com; datasetName is twitterDataset, versionName is v1 - def resolvePath( - path: java.nio.file.Path, - shouldContainFile: Boolean - ): (String, Dataset, DatasetVersion, Option[java.nio.file.Path]) = { - - val pathSegments = (0 until path.getNameCount).map(path.getName(_).toString).toArray - - // The expected length of the path segments: - // - If shouldContainFile is true, the path should include 4 segments: /ownerEmail/datasetName/versionName/fileRelativePath - // - If shouldContainFile is false, the path should include only 3 segments: /ownerEmail/datasetName/versionName - val expectedLength = if (shouldContainFile) 4 else 3 - - if (pathSegments.length < expectedLength) { - throw new BadRequestException( - s"Invalid path format. 
Expected format: /ownerEmail/datasetName/versionName" + - (if (shouldContainFile) "/fileRelativePath" else "") - ) - } - - val ownerEmail = pathSegments(0) - val datasetName = pathSegments(1) - val versionName = pathSegments(2) - - val fileRelativePath = - if (shouldContainFile) Some(Paths.get(pathSegments.drop(3).mkString("/"))) else None - - withTransaction(context) { ctx => - // Get the dataset by owner email and dataset name - val dataset = getDatasetByName(ctx, ownerEmail, datasetName) - if (dataset == null) { - throw new NotFoundException("Dataset not found") - } - - // Get the dataset version by dataset ID and version name - val datasetVersion = getDatasetVersionByName(ctx, dataset.getDid, versionName) - if (datasetVersion == null) { - throw new NotFoundException("Dataset version not found") - } - - (ownerEmail, dataset, datasetVersion, fileRelativePath) - } - } - // this function retrieve the DashboardDataset(Dataset from DB+more information) identified by did // read access will be checked def getDashboardDataset(ctx: DSLContext, did: UInteger, uid: UInteger): DashboardDataset = { @@ -299,21 +222,6 @@ object DatasetResource { } } - def getDatasetFile( - did: UInteger, - dvid: UInteger, - fileRelativePath: java.nio.file.Path - ): InputStream = { - val versionHash = getDatasetVersionByID(context, dvid).getVersionHash - val datasetPath = PathUtils.getDatasetPath(did) - GitVersionControlLocalFileStorage - .retrieveFileContentOfVersionAsInputStream( - PathUtils.getDatasetPath(did), - versionHash, - datasetPath.resolve(fileRelativePath) - ) - } - private def getFileNodesOfCertainVersion( ownerNode: DatasetFileNode, datasetName: String, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index e1a94c8fc48..1c1b4f43c1a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -1,14 +1,22 @@ package edu.uci.ics.texera.workflow.common.storage +import edu.uci.ics.amber.engine.common.Utils.withTransaction + import java.nio.file.{Files, Paths} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument, VirtualDocument} +import edu.uci.ics.texera.web.SqlServer +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} +import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET +import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER +import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION import org.apache.commons.vfs2.FileNotFoundException +import org.jooq.DSLContext +import java.net.URI import scala.util.{Success, Try} object FileResolver { - - type FileResolverOutput = Either[String, DatasetFileDocument] + private val DatasetFileUriScheme = "vfs" /** * Attempts to resolve the given fileName using a list of resolver functions. 
@@ -17,8 +25,8 @@ object FileResolver { * @throws FileNotFoundException if the file cannot be resolved by any resolver * @return Either[String, DatasetFileDocument] - the resolved path as a String or a DatasetFileDocument */ - def resolve(fileName: String): FileResolverOutput = { - val resolvers: List[String => FileResolverOutput] = List(localResolveFunc, datasetResolveFunc) + def resolve(fileName: String): URI = { + val resolvers: List[String => URI] = List(localResolveFunc, datasetResolveFunc) // Try each resolver function in sequence resolvers.iterator @@ -29,30 +37,104 @@ object FileResolver { .getOrElse(throw new FileNotFoundException(fileName)) } + /** + * Open a file handle for the given fileUri + * @param fileUri the uri pointing to the file + * @return + */ + def open(fileUri: URI): ReadonlyVirtualDocument[_] = { + fileUri.getScheme match { + case DatasetFileUriScheme => + // Parse the host to get dataset ID and version hash + val hostParts = fileUri.getHost.split("\\.") + if (hostParts.length != 2) { + throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") + } + val datasetId = hostParts(0).toInt + val versionHash = hostParts(1) + + // The path within the URI represents the relative path of the file in the dataset + val fileRelativePath = Paths.get(fileUri.getPath.stripPrefix("/")) + + // Create and return a DatasetFileDocument with the parsed values + new DatasetFileDocument(datasetId, versionHash, fileRelativePath) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } /** * Attempts to resolve a local file path. 
* @throws FileNotFoundException if the local file does not exist * @param fileName the name of the file to check */ - private def localResolveFunc(fileName: String): FileResolverOutput = { + private def localResolveFunc(fileName: String): URI = { val filePath = Paths.get(fileName) if (Files.exists(filePath)) { - Left(fileName) // File exists locally, return the path as a string in the Left + filePath.toUri // File exists locally, return the path as a string in the Left } else { throw new FileNotFoundException(s"Local file $fileName does not exist") } } /** - * Attempts to resolve a DatasetFileDocument. + * Attempts to resolve a given fileName to a URI. + * + * The fileName format should be: /ownerEmail/datasetName/versionName/fileRelativePath + * e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv + * The output dataset URI format is: {DatasetFileUriScheme}://{did}.{versionHash}/file-path + * e.g. vfs://15.adeq233td/some/dir/file.txt * * @param fileName the name of the file to attempt resolving as a DatasetFileDocument * @return Either[String, DatasetFileDocument] - Right(document) if creation succeeds * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ - private def datasetResolveFunc(fileName: String): FileResolverOutput = { + private def datasetResolveFunc(fileName: String): URI = { val filePath = Paths.get(fileName) - val document = new DatasetFileDocument(filePath) // This will throw if creation fails - Right(document) + val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray + + if (pathSegments.length < 4) { + throw new RuntimeException( + s"Invalid path format. 
Expected format: /ownerEmail/datasetName/versionName/fileRelativePath" + ) + } + + val ownerEmail = pathSegments(0) + val datasetName = pathSegments(1) + val versionName = pathSegments(2) + val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + + withTransaction(SqlServer.createDSLContext()) { ctx => + val (dataset, datasetVersion) = getDatasetAndDatasetVersionByName(ctx, ownerEmail, datasetName, versionName) + if (dataset == null || datasetVersion == null) { + throw new FileNotFoundException(s"Dataset file $fileName") + } + + // assemble dataset URI format + val host = s"${dataset.getDid.intValue()}.${datasetVersion.getVersionHash}" + new URI(DatasetFileUriScheme, host, fileRelativePath.toUri.getPath, null) + } + } + + private def getDatasetAndDatasetVersionByName(ctx: DSLContext, ownerEmail: String, datasetName: String, datasetVersionName: String): (Dataset, DatasetVersion) = { + val dataset = ctx + .select(DATASET.fields: _*) + .from(DATASET) + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + .where(USER.EMAIL.eq(ownerEmail)) + .and(DATASET.NAME.eq(datasetName)) + .fetchOneInto(classOf[Dataset]) + + val datasetVersion = ctx + .selectFrom(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(dataset.getDid)) + .and(DATASET_VERSION.NAME.eq(datasetVersionName)) + .fetchOneInto(classOf[DatasetVersion]) + (dataset, datasetVersion) } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala index daa4c3864b5..ee1621e762f 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala @@ -47,7 +47,6 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { workflowId: WorkflowIdentity, executionId: 
ExecutionIdentity ): PhysicalOp = { - val (filepath, fileDesc) = determineFilePathOrDatasetFile() PhysicalOp .sourcePhysicalOp( workflowId, @@ -55,8 +54,7 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { operatorIdentifier, OpExecInitInfo((_, _) => new FileScanSourceOpExec( - filepath, - fileDesc, + fileUri.get, attributeType, encoding, extract, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 0244195d4a9..5756252995d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -4,16 +4,17 @@ import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField +import edu.uci.ics.texera.workflow.common.storage.FileResolver import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray import java.io._ +import java.net.URI import scala.collection.mutable import scala.jdk.CollectionConverters.IteratorHasAsScala class FileScanSourceOpExec private[scan] ( - filePath: String, - datasetFileDesc: DatasetFileDocument, + fileUri: String, fileAttributeType: FileAttributeType, fileEncoding: FileDecodingMethod, extract: Boolean, @@ -26,7 +27,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = createInputStream(filePath, datasetFileDesc) + val is = FileResolver.open(new 
URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 01d03538513..7997b4846ae 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -30,9 +30,9 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { @JsonPropertyDescription("decoding charset to use on input") var fileEncoding: FileDecodingMethod = FileDecodingMethod.UTF_8 - // Unified file handle, can be either a local path (String) or DatasetFileDocument + // uri of the file @JsonIgnore - var fileHandle: FileResolver.FileResolverOutput = _ + var fileUri: Option[String] = None @JsonIgnore var fileTypeName: Option[String] = None @@ -50,7 +50,7 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { var offset: Option[Int] = None override def sourceSchema(): Schema = { - if (fileHandle == null) return null + if (fileUri == null) return null inferSchema() } @@ -61,8 +61,8 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { throw new RuntimeException("no input file name") } - // Resolve the file and assign the result to fileHandle - fileHandle = FileResolver.resolve(fileName.get) + // Resolve the file and assign the result to file uri + fileUri = Some(FileResolver.resolve(fileName.get).toASCIIString) } override def operatorInfo: OperatorInfo = { @@ -77,14 +77,6 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { def inferSchema(): Schema - // Get the source file descriptor from the fileHandle - def determineFilePathOrDatasetFile(): (String, DatasetFileDocument) = { - 
fileHandle match { - case Left(path) => (path, null) // File path is a local path as String - case Right(document) => (null, document) // File is a DatasetFileDocument - } - } - override def equals(that: Any): Boolean = EqualsBuilder.reflectionEquals(this, that, "context", "fileHandle") } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index e9336f2acec..2cc906bce4d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,9 +9,11 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, FileInputStream, IOException, InputStreamReader} +import java.net.URI class CSVScanSourceOpDesc extends ScanSourceOpDesc { @@ -37,7 +39,6 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty || customDelimiter.get.isEmpty) customDelimiter = Option(",") - val (filepath, fileDesc) = determineFilePathOrDatasetFile() PhysicalOp .sourcePhysicalOp( workflowId, @@ -45,8 +46,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { operatorIdentifier, OpExecInitInfo((_, _) => new CSVScanSourceOpExec( - filepath, - fileDesc, + fileUri.get, fileEncoding, limit, offset, @@ -74,13 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val 
(filepath, fileDesc) = determineFilePathOrDatasetFile() - val stream = - if (filepath != null) { - new FileInputStream(new File(filepath)) - } else { - fileDesc.asInputStream() - } + val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index fe5a9a61051..05942761d7b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -6,14 +6,15 @@ import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader +import java.net.URI import scala.collection.immutable.ArraySeq class CSVScanSourceOpExec private[csv] ( - filePath: String, - datasetFileDesc: DatasetFileDocument, + fileUri: String, fileEncoding: FileDecodingMethod, limit: Option[Int], offset: Option[Int], @@ -69,7 +70,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - createInputStream(filePath, datasetFileDesc), + FileResolver.open(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 567fa40ed34..66ecde61726 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,9 +9,11 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} +import java.net.URI class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { @@ -39,13 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val file = - if (filepath == null) { - fileDesc.asFile() - } else { - new File(filepath) - } + val file = FileResolver.open(new URI(fileUri.get)).asFile() val totalBytes: Long = file.length() PhysicalOp @@ -86,13 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val file = - if (filepath == null) { - fileDesc.asFile() - } else { - new File(filepath) - } + val file = FileResolver.open(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) @@ -102,7 +92,7 @@ class ParallelCSVScanSourceOpDesc 
extends ScanSourceOpDesc { reader.close() // reopen the file to read from the beginning - reader = CSVReader.open(filepath)(CustomFormat) + reader = CSVReader.open(file.toPath.toString)(CustomFormat) if (hasHeader) reader.readNext() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index ca17df52a26..5524e6ee51a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,9 +9,11 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} +import java.net.URI class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { @@ -36,17 +38,6 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { // fill in default values if (customDelimiter.get.isEmpty) customDelimiter = Option(",") - - val (filepath, datasetFileDocument) = determineFilePathOrDatasetFile() - // for CSVOldScanSourceOpDesc, it requires the full File presence when execute, so use temp file here - // TODO: figure out a better way - val path = - if (filepath == null) { - datasetFileDocument.asFile().toPath.toString - } else { - filepath - } - PhysicalOp .sourcePhysicalOp( workflowId, @@ -54,7 +45,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { operatorIdentifier, 
OpExecInitInfo((_, _) => new CSVOldScanSourceOpExec( - path, + fileUri.get, fileEncoding, limit, offset, @@ -81,13 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val file = - if (filepath != null) { - new File(filepath) - } else { - fileDesc.asFile() - } + val file = FileResolver.open(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 60dc8d81d02..812c8361093 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,18 +2,15 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{ - Attribute, - AttributeTypeUtils, - Schema, - TupleLike -} +import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod +import java.net.URI import scala.collection.compat.immutable.ArraySeq class CSVOldScanSourceOpExec private[csvOld] ( - filePath: String, + fileUri: String, fileEncoding: FileDecodingMethod, limit: Option[Int], offset: Option[Int], @@ -51,7 +48,8 @@ class CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat 
{ override val delimiter: Char = customDelimiter.get.charAt(0) } - reader = CSVReader.open(filePath, fileEncoding.getCharset.name())(CustomFormat) + val filePath = FileResolver.open(new URI(fileUri)).asFile().toPath + reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index e3c3bcd6027..be56a302419 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -9,10 +9,12 @@ import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, Workf import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap import java.io.{BufferedReader, FileInputStream, IOException, InputStream, InputStreamReader} +import java.net.URI import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters.IteratorHasAsScala @@ -37,8 +39,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val stream = createInputStream(filepath, fileDesc) + val stream = 
FileResolver.open(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -60,8 +61,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { offsetValue + (if (idx != workerCount - 1) count / workerCount * (idx + 1) else count) new JSONLScanSourceOpExec( - filepath, - fileDesc, + fileUri.get, fileEncoding, startOffset, endOffset, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index e57f93021ec..6d9a1d29e6d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -5,16 +5,17 @@ import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap import java.io.{BufferedReader, InputStreamReader} +import java.net.URI import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.util.{Failure, Success, Try} class JSONLScanSourceOpExec private[json] ( - filePath: String, - datasetFileDesc: DatasetFileDocument, + fileUri: String, fileEncoding: FileDecodingMethod, startOffset: Int, endOffset: Int, @@ -42,7 +43,7 @@ class JSONLScanSourceOpExec private[json] ( override def open(): Unit = { schema = schemaFunc() reader = new BufferedReader( - new 
InputStreamReader(createInputStream(filePath, datasetFileDesc), fileEncoding.getCharset) + new InputStreamReader(FileResolver.open(new URI(fileUri)).asInputStream(), fileEncoding.getCharset) ) rows = reader.lines().iterator().asScala.slice(startOffset, endOffset) } diff --git a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala index a766be3606d..7c16a38d05c 100644 --- a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala @@ -18,7 +18,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { before { fileScanSourceOpDesc = new FileScanSourceOpDesc() - fileScanSourceOpDesc.fileHandle = Left(TestTextFilePath) + fileScanSourceOpDesc.fileUri = Left(TestTextFilePath) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.UTF_8 } @@ -61,7 +61,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -87,12 +87,12 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with CRLF separators into corresponding output tuples" in { - fileScanSourceOpDesc.fileHandle = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - 
fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -121,7 +121,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -148,11 +148,11 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text into corresponding output INTEGER tuples" in { - fileScanSourceOpDesc.fileHandle = Left(TestNumbersFilePath) + fileScanSourceOpDesc.fileUri = Left(TestNumbersFilePath) fileScanSourceOpDesc.attributeType = FileAttributeType.INTEGER fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -178,13 +178,13 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with US_ASCII encoding" in { - fileScanSourceOpDesc.fileHandle = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.ASCII fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, From 6ec1fb4eb2fc97d08069eecce71f0d50a870d659 Mon Sep 17 
00:00:00 2001 From: Jiadong Bai Date: Sat, 26 Oct 2024 23:36:39 -0700 Subject: [PATCH 02/18] fix more --- .../user/dataset/DatasetResource.scala | 95 +++---------------- .../common/storage/FileResolver.scala | 33 +++---- 2 files changed, 29 insertions(+), 99 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 3521c6b3411..96459561d03 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -4,63 +4,18 @@ import edu.uci.ics.amber.engine.common.Utils.withTransaction import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ - DatasetDao, - DatasetUserAccessDao, - DatasetVersionDao -} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ - Dataset, - DatasetUserAccess, - DatasetVersion, - User -} +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{ - getDatasetUserAccessPrivilege, - getOwner, - userHasReadAccess, - userHasWriteAccess, - 
userOwnDataset -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{ - DATASET_IS_PRIVATE, - DATASET_IS_PUBLIC, - DashboardDataset, - DashboardDatasetVersion, - DatasetDescriptionModification, - DatasetIDs, - DatasetNameModification, - DatasetVersionRootFileNodes, - DatasetVersionRootFileNodesResponse, - DatasetVersions, - ERR_DATASET_CREATION_FAILED_MESSAGE, - ERR_DATASET_NAME_ALREADY_EXISTS, - ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, - ListDatasetsResponse, - calculateLatestDatasetVersionSize, - calculateDatasetVersionSize, - context, - createNewDatasetVersionFromFormData, - getDashboardDataset, - getDatasetByID, - getDatasetVersionByID, - getDatasetVersions, - getFileNodesOfCertainVersion, - getLatestDatasetVersionWithAccessCheck, - getUserDatasets, - resolvePath, - retrievePublicDatasets -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ - DatasetFileNode, - PhysicalFileNode -} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{getDatasetUserAccessPrivilege, getOwner, userHasReadAccess, userHasWriteAccess, userOwnDataset} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{DATASET_IS_PRIVATE, DATASET_IS_PUBLIC, DashboardDataset, DashboardDatasetVersion, DatasetDescriptionModification, DatasetIDs, DatasetNameModification, DatasetVersionRootFileNodes, DatasetVersionRootFileNodesResponse, DatasetVersions, ERR_DATASET_CREATION_FAILED_MESSAGE, ERR_DATASET_NAME_ALREADY_EXISTS, ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, ListDatasetsResponse, calculateDatasetVersionSize, calculateLatestDatasetVersionSize, context, createNewDatasetVersionFromFormData, getDashboardDataset, getDatasetByID, getDatasetVersionByID, getDatasetVersions, getFileNodesOfCertainVersion, getLatestDatasetVersionWithAccessCheck, getUserDatasets, retrievePublicDatasets} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} import 
edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils +import edu.uci.ics.texera.workflow.common.storage.FileResolver import io.dropwizard.auth.Auth import org.apache.commons.lang3.StringUtils import org.glassfish.jersey.media.multipart.{FormDataMultiPart, FormDataParam} @@ -76,21 +31,9 @@ import java.util.zip.{ZipEntry, ZipOutputStream} import java.util import java.util.concurrent.locks.ReentrantLock import javax.annotation.security.RolesAllowed -import javax.ws.rs.{ - BadRequestException, - Consumes, - ForbiddenException, - GET, - NotFoundException, - POST, - Path, - PathParam, - Produces, - QueryParam, - WebApplicationException -} +import javax.ws.rs.{BadRequestException, Consumes, ForbiddenException, GET, NotFoundException, POST, Path, PathParam, Produces, QueryParam, WebApplicationException} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} -import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` +import scala.collection.convert.ImplicitConversions.{`collection AsScalaIterable`, `iterable AsScalaIterable`} import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.jdk.CollectionConverters._ @@ -277,17 +220,12 @@ object DatasetResource { .parse(filePathsValue) .as[List[String]] .foreach(pathStr => { - val (_, _, _, fileRelativePath) = - resolvePath(Paths.get(pathStr), shouldContainFile = true) - + val fileRelativePath = Paths.get(FileResolver.resolve(pathStr).getPath) fileRelativePath .map { path => filesToRemove += datasetPath .resolve(path) // When path exists, resolve it and add to filesToRemove } - .getOrElse { - throw new IllegalArgumentException("File relative path is missing") - } }) } } @@ -763,7 +701,7 @@ class DatasetResource { // if the file path is given, then only fetch the dataset and version this file is belonging to val decodedPathStr = 
URLDecoder.decode(filePathStr, StandardCharsets.UTF_8.name()) val (ownerEmail, dataset, version, _) = - resolvePath(Paths.get(decodedPathStr), shouldContainFile = true) + FileResolver.parseFileNameForDataset(ctx, decodedPathStr) val accessPrivilege = getDatasetUserAccessPrivilege(ctx, dataset.getDid, uid) if ( accessPrivilege == DatasetUserAccessPrivilege.NONE && dataset.getIsPublic == DATASET_IS_PRIVATE @@ -988,10 +926,10 @@ class DatasetResource { val uid = user.getUid val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) - val (_, dataset, dsVersion, fileRelativePath) = - resolvePath(Paths.get(decodedPathStr), shouldContainFile = true) - withTransaction(context)(ctx => { + val (_, dataset, dsVersion, fileRelativePath) = + FileResolver.parseFileNameForDataset(ctx, decodedPathStr) + val did = dataset.getDid val dvid = dsVersion.getDvid @@ -1005,7 +943,7 @@ class DatasetResource { val streamingOutput = new StreamingOutput() { override def write(output: OutputStream): Unit = { fileRelativePath - .map { path => + .foreach { path => GitVersionControlLocalFileStorage.retrieveFileContentOfVersion( targetDatasetPath, datasetVersion.getVersionHash, @@ -1013,9 +951,6 @@ class DatasetResource { output ) } - .getOrElse { - throw new IllegalArgumentException("File relative path is missing.") - } } } @@ -1125,7 +1060,7 @@ class DatasetResource { ): (Dataset, DatasetVersion) = { val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) val (_, dataset, dsVersion, _) = - resolvePath(Paths.get(decodedPathStr), shouldContainFile = false) + FileResolver.parseFileNameForDataset(context, decodedPathStr) validateUserAccess(dataset.getDid, user.getUid) (dataset, dsVersion) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 1c1b4f43c1a..e4043941da1 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -2,7 +2,7 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction -import java.nio.file.{Files, Paths} +import java.nio.file.{Files, Path, Paths} import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument, VirtualDocument} import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} @@ -67,6 +67,7 @@ object FileResolver { throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") } } + /** * Attempts to resolve a local file path. * @throws FileNotFoundException if the local file does not exist @@ -94,22 +95,8 @@ object FileResolver { * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ private def datasetResolveFunc(fileName: String): URI = { - val filePath = Paths.get(fileName) - val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray - - if (pathSegments.length < 4) { - throw new RuntimeException( - s"Invalid path format. 
Expected format: /ownerEmail/datasetName/versionName/fileRelativePath" - ) - } - - val ownerEmail = pathSegments(0) - val datasetName = pathSegments(1) - val versionName = pathSegments(2) - val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) - withTransaction(SqlServer.createDSLContext()) { ctx => - val (dataset, datasetVersion) = getDatasetAndDatasetVersionByName(ctx, ownerEmail, datasetName, versionName) + val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) if (dataset == null || datasetVersion == null) { throw new FileNotFoundException(s"Dataset file $fileName") } @@ -120,7 +107,15 @@ object FileResolver { } } - private def getDatasetAndDatasetVersionByName(ctx: DSLContext, ownerEmail: String, datasetName: String, datasetVersionName: String): (Dataset, DatasetVersion) = { + def parseFileNameForDataset(ctx: DSLContext, fileName: String): (String, Dataset, DatasetVersion, Path) = { + val filePath = Paths.get(fileName) + val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray + + val ownerEmail = pathSegments(0) + val datasetName = pathSegments(1) + val versionName = pathSegments(2) + val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + val dataset = ctx .select(DATASET.fields: _*) .from(DATASET) @@ -133,8 +128,8 @@ object FileResolver { val datasetVersion = ctx .selectFrom(DATASET_VERSION) .where(DATASET_VERSION.DID.eq(dataset.getDid)) - .and(DATASET_VERSION.NAME.eq(datasetVersionName)) + .and(DATASET_VERSION.NAME.eq(versionName)) .fetchOneInto(classOf[DatasetVersion]) - (dataset, datasetVersion) + (ownerEmail, dataset, datasetVersion, fileRelativePath) } } From 1013205e10cd4991bfd0f8e03fd07bab9dc4b09f Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 08:07:09 -0700 Subject: [PATCH 03/18] fix more --- .../operators/source/scan/json/JSONLScanSourceOpDesc.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index be56a302419..99547e0182a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -85,8 +85,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val stream = createInputStream(filepath, fileDesc) + val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() From bb60d75f58bc6934536d1c4e12c126dc260384cb Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 09:28:12 -0700 Subject: [PATCH 04/18] make it work --- .../resource/SchemaPropagationResource.scala | 1 + .../common/storage/FileResolver.scala | 55 +++++++++++++------ .../common/workflow/LogicalPlan.scala | 24 ++++++++ .../common/workflow/WorkflowCompiler.scala | 3 +- .../source/scan/ScanSourceOpDesc.scala | 15 +++-- 5 files changed, 73 insertions(+), 25 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala index 1a48eca0ae9..dfcf38cedf3 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala @@ -40,6 +40,7 @@ class SchemaPropagationResource extends LazyLogging { ) val logicalPlan = LogicalPlan(logicalPlanPojo) + logicalPlan.resolveScanSourceOpFileName(None) // the 
PhysicalPlan with topology expanded. val physicalPlan = PhysicalPlan(context, logicalPlan) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index e4043941da1..39a430d0191 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -12,7 +12,8 @@ import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET import org.apache.commons.vfs2.FileNotFoundException import org.jooq.DSLContext -import java.net.URI +import java.net.{URI, URLDecoder, URLEncoder} +import java.nio.charset.StandardCharsets import scala.util.{Success, Try} object FileResolver { @@ -45,19 +46,26 @@ object FileResolver { def open(fileUri: URI): ReadonlyVirtualDocument[_] = { fileUri.getScheme match { case DatasetFileUriScheme => - // Parse the host to get dataset ID and version hash - val hostParts = fileUri.getHost.split("\\.") - if (hostParts.length != 2) { + // Extract path components and decode them + val pathParts = fileUri.getPath.stripPrefix("/").split("/").map(part => + URLDecoder.decode(part, StandardCharsets.UTF_8) + ) + + if (pathParts.length < 3) { throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") } - val datasetId = hostParts(0).toInt - val versionHash = hostParts(1) - // The path within the URI represents the relative path of the file in the dataset - val fileRelativePath = Paths.get(fileUri.getPath.stripPrefix("/")) + // Parse the dataset ID and version hash, and build the file path + val did = pathParts(0).toInt + val versionHash = pathParts(1) + val fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) - // Create and return a DatasetFileDocument with the parsed values - new DatasetFileDocument(datasetId, versionHash, fileRelativePath) + // Create and 
return a DatasetFileDocument + new DatasetFileDocument( + did = did, + datasetVersionHash = versionHash, + fileRelativePath = fileRelativePath + ) case "file" => // For local files, create a ReadonlyLocalFileDocument @@ -87,23 +95,38 @@ object FileResolver { * * The fileName format should be: /ownerEmail/datasetName/versionName/fileRelativePath * e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv - * The output dataset URI format is: {DatasetFileUriScheme}://{did}.{versionHash}/file-path - * e.g. vfs://15.adeq233td/some/dir/file.txt + * The output dataset URI format is: {DatasetFileUriScheme}:///{did}/{versionHash}/file-path + * e.g. vfs:///15/adeq233td/some/dir/file.txt * * @param fileName the name of the file to attempt resolving as a DatasetFileDocument * @return Either[String, DatasetFileDocument] - Right(document) if creation succeeds * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ + + import java.net.{URI, URISyntaxException, URLEncoder} + import java.nio.charset.StandardCharsets + import java.nio.file.Path + import org.apache.commons.vfs2.FileNotFoundException + private def datasetResolveFunc(fileName: String): URI = { withTransaction(SqlServer.createDSLContext()) { ctx => val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) + if (dataset == null || datasetVersion == null) { - throw new FileNotFoundException(s"Dataset file $fileName") + throw new FileNotFoundException(s"Dataset file $fileName not found.") } - // assemble dataset URI format - val host = s"${dataset.getDid.intValue()}.${datasetVersion.getVersionHash}" - new URI(DatasetFileUriScheme, host, fileRelativePath.toUri.getPath, null) + // Construct path as /{did}/{versionHash}/file-path + val did = dataset.getDid.intValue() + val versionHash = datasetVersion.getVersionHash + val encodedPath = s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, 
StandardCharsets.UTF_8)).mkString("/")}" + + try { + new URI(DatasetFileUriScheme, null, encodedPath, null) + } catch { + case e: URISyntaxException => + throw e + } } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala index 3ea3e24fd06..6e9da8e9fcb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala @@ -8,6 +8,8 @@ import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.texera.web.model.websocket.request.LogicalPlanPojo import edu.uci.ics.texera.workflow.common.operators.LogicalOp import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor +import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import org.jgrapht.graph.DirectedAcyclicGraph import org.jgrapht.util.SupplierUtil @@ -144,6 +146,28 @@ case class LogicalPlan( .toMap } + def resolveScanSourceOpFileName( + errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] + ): Unit = { + operators.foreach { + case operator@(scanOp: ScanSourceOpDesc) => + Try { + // Resolve file path for ScanSourceOpDesc + val fileName = scanOp.fileName.getOrElse(throw new RuntimeException("no input file name")) + val fileUri = FileResolver.resolve(fileName) // Convert to URI + + // Set the URI in the ScanSourceOpDesc + scanOp.setFileUri(fileUri) + } match { + case Success(_) => // Successfully resolved and set the file URI + case Failure(err) => + logger.error("Error resolving file path for ScanSourceOpDesc", err) + errorList.foreach(_.append((operator.operatorIdentifier, err))) + } + case _ => // Skip non-ScanSourceOpDesc operators + } + } + def propagateWorkflowSchema( context: WorkflowContext, errorList: 
Option[ArrayBuffer[(OperatorIdentity, Throwable)]] diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index efa4e305275..e40252f5409 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -51,7 +51,7 @@ class WorkflowCompiler( logicalPlanPojo.opsToViewResult, logicalPlan ) - + logicalPlan.resolveScanSourceOpFileName(Some(errorList)) logicalPlan.propagateWorkflowSchema(context, Some(errorList)) // map compilation errors with op id if (errorList.nonEmpty) { @@ -121,6 +121,7 @@ class WorkflowCompiler( logicalPlan ) + logicalPlan.resolveScanSourceOpFileName(Some(errorList)) logicalPlan.propagateWorkflowSchema(context, Some(errorList)) // report compilation errors diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 7997b4846ae..82817358304 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -12,6 +12,8 @@ import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescrip import edu.uci.ics.texera.workflow.common.storage.FileResolver import org.apache.commons.lang3.builder.EqualsBuilder +import java.net.URI + abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { /** in the case we do not want to read the entire large file, but only @@ -50,19 +52,12 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { var offset: Option[Int] = None override def sourceSchema(): Schema = { - if (fileUri == null) return 
null + if (fileUri.isEmpty) return null inferSchema() } override def setContext(workflowContext: WorkflowContext): Unit = { super.setContext(workflowContext) - - if (fileName.isEmpty) { - throw new RuntimeException("no input file name") - } - - // Resolve the file and assign the result to file uri - fileUri = Some(FileResolver.resolve(fileName.get).toASCIIString) } override def operatorInfo: OperatorInfo = { @@ -77,6 +72,10 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { def inferSchema(): Schema + def setFileUri(uri: URI): Unit = { + fileUri = Some(uri.toASCIIString) + } + override def equals(that: Any): Boolean = EqualsBuilder.reflectionEquals(this, that, "context", "fileHandle") } From 8b35d90023e97c461e1d4768ea4895d1c336bad2 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 09:41:33 -0700 Subject: [PATCH 05/18] fmt --- .../common/storage/DatasetFileDocument.scala | 13 ++-- .../storage/ReadonlyLocalFileDocument.scala | 2 +- .../storage/ReadonlyVirtualDocument.scala | 2 +- .../user/dataset/DatasetResource.scala | 77 +++++++++++++++++-- .../common/storage/FileResolver.scala | 34 ++++---- .../common/workflow/LogicalPlan.scala | 6 +- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 7 +- .../scan/json/JSONLScanSourceOpExec.scala | 5 +- 8 files changed, 110 insertions(+), 36 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 3f013825ccb..cbc90d70fe6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -9,7 +9,8 @@ import java.io.{File, FileOutputStream, InputStream} import java.net.URI import java.nio.file.{Files, Path} -class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) extends 
VirtualDocument[Nothing] { +class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) + extends VirtualDocument[Nothing] { private var tempFile: Option[File] = None override def getURI: URI = @@ -20,11 +21,11 @@ class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath override def asInputStream(): InputStream = { val datasetAbsolutePath = PathUtils.getDatasetPath(UInteger.valueOf(did)) GitVersionControlLocalFileStorage - .retrieveFileContentOfVersionAsInputStream( - datasetAbsolutePath, - datasetVersionHash, - datasetAbsolutePath.resolve(fileRelativePath) - ) + .retrieveFileContentOfVersionAsInputStream( + datasetAbsolutePath, + datasetVersionHash, + datasetAbsolutePath.resolve(fileRelativePath) + ) } override def asFile(): File = { diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala index 86873e2c525..2ea19ee887d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala @@ -64,4 +64,4 @@ class ReadonlyLocalFileDocument(uri: URI) extends ReadonlyVirtualDocument[Nothin */ override def getCount: Long = throw new NotImplementedError("getCount is not supported for ReadonlyLocalFileDocument") -} \ No newline at end of file +} diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala index e9df984f036..81acadd5aff 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala @@ -61,4 +61,4 @@ trait 
ReadonlyVirtualDocument[T] { */ def asFile(): File -} \ No newline at end of file +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 96459561d03..9e47377ee57 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -4,15 +4,60 @@ import edu.uci.ics.amber.engine.common.Utils.withTransaction import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ + DatasetDao, + DatasetUserAccessDao, + DatasetVersionDao +} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ + Dataset, + DatasetUserAccess, + DatasetVersion, + User +} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{getDatasetUserAccessPrivilege, getOwner, userHasReadAccess, userHasWriteAccess, userOwnDataset} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{DATASET_IS_PRIVATE, DATASET_IS_PUBLIC, DashboardDataset, DashboardDatasetVersion, 
DatasetDescriptionModification, DatasetIDs, DatasetNameModification, DatasetVersionRootFileNodes, DatasetVersionRootFileNodesResponse, DatasetVersions, ERR_DATASET_CREATION_FAILED_MESSAGE, ERR_DATASET_NAME_ALREADY_EXISTS, ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, ListDatasetsResponse, calculateDatasetVersionSize, calculateLatestDatasetVersionSize, context, createNewDatasetVersionFromFormData, getDashboardDataset, getDatasetByID, getDatasetVersionByID, getDatasetVersions, getFileNodesOfCertainVersion, getLatestDatasetVersionWithAccessCheck, getUserDatasets, retrievePublicDatasets} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{ + getDatasetUserAccessPrivilege, + getOwner, + userHasReadAccess, + userHasWriteAccess, + userOwnDataset +} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{ + DATASET_IS_PRIVATE, + DATASET_IS_PUBLIC, + DashboardDataset, + DashboardDatasetVersion, + DatasetDescriptionModification, + DatasetIDs, + DatasetNameModification, + DatasetVersionRootFileNodes, + DatasetVersionRootFileNodesResponse, + DatasetVersions, + ERR_DATASET_CREATION_FAILED_MESSAGE, + ERR_DATASET_NAME_ALREADY_EXISTS, + ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, + ListDatasetsResponse, + calculateDatasetVersionSize, + calculateLatestDatasetVersionSize, + context, + createNewDatasetVersionFromFormData, + getDashboardDataset, + getDatasetByID, + getDatasetVersionByID, + getDatasetVersions, + getFileNodesOfCertainVersion, + getLatestDatasetVersionWithAccessCheck, + getUserDatasets, + retrievePublicDatasets +} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ + DatasetFileNode, + PhysicalFileNode +} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import 
edu.uci.ics.texera.workflow.common.storage.FileResolver @@ -31,9 +76,24 @@ import java.util.zip.{ZipEntry, ZipOutputStream} import java.util import java.util.concurrent.locks.ReentrantLock import javax.annotation.security.RolesAllowed -import javax.ws.rs.{BadRequestException, Consumes, ForbiddenException, GET, NotFoundException, POST, Path, PathParam, Produces, QueryParam, WebApplicationException} +import javax.ws.rs.{ + BadRequestException, + Consumes, + ForbiddenException, + GET, + NotFoundException, + POST, + Path, + PathParam, + Produces, + QueryParam, + WebApplicationException +} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} -import scala.collection.convert.ImplicitConversions.{`collection AsScalaIterable`, `iterable AsScalaIterable`} +import scala.collection.convert.ImplicitConversions.{ + `collection AsScalaIterable`, + `iterable AsScalaIterable` +} import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.jdk.CollectionConverters._ @@ -220,7 +280,8 @@ object DatasetResource { .parse(filePathsValue) .as[List[String]] .foreach(pathStr => { - val fileRelativePath = Paths.get(FileResolver.resolve(pathStr).getPath) + // TODO: refactor this part + val (_, _, _, fileRelativePath) = FileResolver.parseFileNameForDataset(context, pathStr) fileRelativePath .map { path => filesToRemove += datasetPath diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 39a430d0191..a3d74e4ec43 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -3,7 +3,12 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction import java.nio.file.{Files, Path, Paths} -import 
edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument, VirtualDocument} +import edu.uci.ics.amber.engine.common.storage.{ + DatasetFileDocument, + ReadonlyLocalFileDocument, + ReadonlyVirtualDocument, + VirtualDocument +} import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET @@ -47,24 +52,19 @@ object FileResolver { fileUri.getScheme match { case DatasetFileUriScheme => // Extract path components and decode them - val pathParts = fileUri.getPath.stripPrefix("/").split("/").map(part => - URLDecoder.decode(part, StandardCharsets.UTF_8) - ) + val pathParts = fileUri.getPath + .stripPrefix("/") + .split("/") + .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) if (pathParts.length < 3) { throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") } - // Parse the dataset ID and version hash, and build the file path - val did = pathParts(0).toInt - val versionHash = pathParts(1) - val fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) - - // Create and return a DatasetFileDocument new DatasetFileDocument( - did = did, - datasetVersionHash = versionHash, - fileRelativePath = fileRelativePath + did = pathParts(0).toInt, + datasetVersionHash = pathParts(1), + fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) ) case "file" => @@ -119,7 +119,8 @@ object FileResolver { // Construct path as /{did}/{versionHash}/file-path val did = dataset.getDid.intValue() val versionHash = datasetVersion.getVersionHash - val encodedPath = s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" + val encodedPath = + s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" try { new 
URI(DatasetFileUriScheme, null, encodedPath, null) @@ -130,7 +131,10 @@ object FileResolver { } } - def parseFileNameForDataset(ctx: DSLContext, fileName: String): (String, Dataset, DatasetVersion, Path) = { + def parseFileNameForDataset( + ctx: DSLContext, + fileName: String + ): (String, Dataset, DatasetVersion, Path) = { val filePath = Paths.get(fileName) val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala index 6e9da8e9fcb..8af7d60fd05 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala @@ -147,10 +147,10 @@ case class LogicalPlan( } def resolveScanSourceOpFileName( - errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] - ): Unit = { + errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] + ): Unit = { operators.foreach { - case operator@(scanOp: ScanSourceOpDesc) => + case operator @ (scanOp: ScanSourceOpDesc) => Try { // Resolve file path for ScanSourceOpDesc val fileName = scanOp.fileName.getOrElse(throw new RuntimeException("no input file name")) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 812c8361093..7c11005d643 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,7 +2,12 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, 
DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.amber.engine.common.model.tuple.{ + Attribute, + AttributeTypeUtils, + Schema, + TupleLike +} import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index 6d9a1d29e6d..f00a07802fc 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -43,7 +43,10 @@ class JSONLScanSourceOpExec private[json] ( override def open(): Unit = { schema = schemaFunc() reader = new BufferedReader( - new InputStreamReader(FileResolver.open(new URI(fileUri)).asInputStream(), fileEncoding.getCharset) + new InputStreamReader( + FileResolver.open(new URI(fileUri)).asInputStream(), + fileEncoding.getCharset + ) ) rows = reader.lines().iterator().asScala.slice(startOffset, endOffset) } From 4aba44135b26c7b43a538ab9e766bc00d10e8670 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 09:59:14 -0700 Subject: [PATCH 06/18] add type alias --- .../uci/ics/texera/workflow/common/storage/FileResolver.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index a3d74e4ec43..8908970c522 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -22,6 +22,8 @@ import java.nio.charset.StandardCharsets import scala.util.{Success, Try} object FileResolver { + type FileHandle = ReadonlyVirtualDocument[_] + private val DatasetFileUriScheme = "vfs" /** @@ -48,7 +50,7 @@ object FileResolver { * @param fileUri the uri pointing to the file * @return */ - def open(fileUri: URI): ReadonlyVirtualDocument[_] = { + def open(fileUri: URI): FileHandle = { fileUri.getScheme match { case DatasetFileUriScheme => // Extract path components and decode them From 2c4a9da9fa88003f12d2b709cbdf7df1b37f10f3 Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Sun, 27 Oct 2024 13:06:16 -0700 Subject: [PATCH 07/18] some handy changes --- .../common/storage/DatasetFileDocument.scala | 1 - .../storage/ReadonlyLocalFileDocument.scala | 1 - .../user/dataset/DatasetResource.scala | 102 ++++-------------- .../common/storage/FileResolver.scala | 31 ++---- 4 files changed, 29 insertions(+), 106 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index cbc90d70fe6..22d7672d1d4 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -1,6 +1,5 @@ package edu.uci.ics.amber.engine.common.storage -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import org.jooq.types.UInteger diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala 
b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala index 2ea19ee887d..ce1f0329625 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala @@ -2,7 +2,6 @@ package edu.uci.ics.amber.engine.common.storage import java.io.{File, FileInputStream, InputStream} import java.net.URI -import java.nio.file.Path /** * ReadonlyLocalFileDocument provides a read-only abstraction over a local file. diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 9e47377ee57..752278f0526 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -4,103 +4,41 @@ import edu.uci.ics.amber.engine.common.Utils.withTransaction import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ - DatasetDao, - DatasetUserAccessDao, - DatasetVersionDao -} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ - Dataset, - DatasetUserAccess, - DatasetVersion, - User -} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET -import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{ - 
getDatasetUserAccessPrivilege, - getOwner, - userHasReadAccess, - userHasWriteAccess, - userOwnDataset -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{ - DATASET_IS_PRIVATE, - DATASET_IS_PUBLIC, - DashboardDataset, - DashboardDatasetVersion, - DatasetDescriptionModification, - DatasetIDs, - DatasetNameModification, - DatasetVersionRootFileNodes, - DatasetVersionRootFileNodesResponse, - DatasetVersions, - ERR_DATASET_CREATION_FAILED_MESSAGE, - ERR_DATASET_NAME_ALREADY_EXISTS, - ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, - ListDatasetsResponse, - calculateDatasetVersionSize, - calculateLatestDatasetVersionSize, - context, - createNewDatasetVersionFromFormData, - getDashboardDataset, - getDatasetByID, - getDatasetVersionByID, - getDatasetVersions, - getFileNodesOfCertainVersion, - getLatestDatasetVersionWithAccessCheck, - getUserDatasets, - retrievePublicDatasets -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ - DatasetFileNode, - PhysicalFileNode -} +import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource._ +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{context, _} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import edu.uci.ics.texera.workflow.common.storage.FileResolver import io.dropwizard.auth.Auth import org.apache.commons.lang3.StringUtils import org.glassfish.jersey.media.multipart.{FormDataMultiPart, 
FormDataParam} -import org.jooq.{DSLContext, EnumType} import org.jooq.types.UInteger +import org.jooq.{DSLContext, EnumType} import play.api.libs.json.Json import java.io.{IOException, InputStream, OutputStream} import java.net.URLDecoder import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Paths} -import java.util.zip.{ZipEntry, ZipOutputStream} +import java.nio.file.Files import java.util import java.util.concurrent.locks.ReentrantLock +import java.util.zip.{ZipEntry, ZipOutputStream} import javax.annotation.security.RolesAllowed -import javax.ws.rs.{ - BadRequestException, - Consumes, - ForbiddenException, - GET, - NotFoundException, - POST, - Path, - PathParam, - Produces, - QueryParam, - WebApplicationException -} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} -import scala.collection.convert.ImplicitConversions.{ - `collection AsScalaIterable`, - `iterable AsScalaIterable` -} +import javax.ws.rs._ +import scala. jdk. CollectionConverters._ import scala.collection.mutable import scala.collection.mutable.ListBuffer -import scala.jdk.CollectionConverters._ import scala.jdk.OptionConverters._ -import scala.util.Using import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} +import scala.util.{Failure, Success, Try, Using} object DatasetResource { val DATASET_IS_PUBLIC: Byte = 1; @@ -282,7 +220,7 @@ object DatasetResource { .foreach(pathStr => { // TODO: refactor this part val (_, _, _, fileRelativePath) = FileResolver.parseFileNameForDataset(context, pathStr) - fileRelativePath + fileRelativePath.asScala .map { path => filesToRemove += datasetPath .resolve(path) // When path exists, resolve it and add to filesToRemove @@ -352,7 +290,7 @@ object DatasetResource { .orderBy(DATASET_VERSION.CREATION_TIME.desc()) // or .asc() for ascending .fetchInto(classOf[DatasetVersion]) - result.toList + result.asScala.toList } // apply the dataset operation to create a new dataset version @@ -420,7 +358,7 @@ object 
DatasetResource { .into(classOf[DatasetVersion]), DatasetFileNode.fromPhysicalFileNodes( Map( - (ownerEmail, datasetName, versionName) -> physicalFileNodes.toList + (ownerEmail, datasetName, versionName) -> physicalFileNodes.asScala.toList ) ) ) @@ -808,7 +746,7 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }) + }).asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist @@ -851,7 +789,7 @@ class DatasetResource { PathUtils.getDatasetPath(did), version.getVersionHash ) - .toList) + .asScala) } DashboardDatasetVersion( version, @@ -918,7 +856,7 @@ class DatasetResource { datasetPath, latestVersion.getVersionHash ) - .toList + .asScala.toList ) ) .head @@ -951,7 +889,7 @@ class DatasetResource { val size = calculateDatasetVersionSize(did, dvid) val ownerFileNode = DatasetFileNode .fromPhysicalFileNodes( - Map((dataset.ownerEmail, datasetName, datasetVersion.getName) -> fileNodes.toList) + Map((dataset.ownerEmail, datasetName, datasetVersion.getName) -> fileNodes.asScala.toList) ) .head @@ -1003,7 +941,7 @@ class DatasetResource { val streamingOutput = new StreamingOutput() { override def write(output: OutputStream): Unit = { - fileRelativePath + fileRelativePath.asScala .foreach { path => GitVersionControlLocalFileStorage.retrieveFileContentOfVersion( targetDatasetPath, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 8908970c522..5d06175ad25 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -1,24 +1,18 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction - -import java.nio.file.{Files, Path, Paths} -import 
edu.uci.ics.amber.engine.common.storage.{ - DatasetFileDocument, - ReadonlyLocalFileDocument, - ReadonlyVirtualDocument, - VirtualDocument -} +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} import edu.uci.ics.texera.web.SqlServer -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET -import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION +import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import org.apache.commons.vfs2.FileNotFoundException import org.jooq.DSLContext -import java.net.{URI, URLDecoder, URLEncoder} +import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder} import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path, Paths} import scala.util.{Success, Try} object FileResolver { @@ -34,10 +28,10 @@ object FileResolver { * @return Either[String, DatasetFileDocument] - the resolved path as a String or a DatasetFileDocument */ def resolve(fileName: String): URI = { - val resolvers: List[String => URI] = List(localResolveFunc, datasetResolveFunc) + val resolvers: Seq[String => URI] = Seq(localResolveFunc, datasetResolveFunc) // Try each resolver function in sequence - resolvers.iterator + resolvers .map(resolver => Try(resolver(fileName))) .collectFirst { case Success(output) => output @@ -85,11 +79,10 @@ object FileResolver { */ private def localResolveFunc(fileName: String): URI = { val filePath = Paths.get(fileName) - if (Files.exists(filePath)) { - filePath.toUri // File exists locally, return the path as a string in the Left - } else { + if (!Files.exists(filePath)) { throw new FileNotFoundException(s"Local file $fileName does not exist") } 
+ filePath.toUri } /** @@ -104,12 +97,6 @@ object FileResolver { * @return Either[String, DatasetFileDocument] - Right(document) if creation succeeds * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ - - import java.net.{URI, URISyntaxException, URLEncoder} - import java.nio.charset.StandardCharsets - import java.nio.file.Path - import org.apache.commons.vfs2.FileNotFoundException - private def datasetResolveFunc(fileName: String): URI = { withTransaction(SqlServer.createDSLContext()) { ctx => val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) From bb576ffb1daae6dc0e1edb14fab2314af56cfb6c Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 19:11:44 -0700 Subject: [PATCH 08/18] save working version --- .../common/storage/DatasetFileDocument.scala | 23 +++- .../user/dataset/DatasetResource.scala | 104 ++++++--------- .../workflow/common/storage/FileOpener.scala | 23 ++++ .../common/storage/FileResolver.scala | 121 ++++++------------ .../source/scan/FileScanSourceOpExec.scala | 4 +- .../source/scan/ScanSourceOpDesc.scala | 5 +- .../source/scan/csv/CSVScanSourceOpDesc.scala | 4 +- .../source/scan/csv/CSVScanSourceOpExec.scala | 4 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 6 +- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 4 +- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 11 +- .../scan/json/JSONLScanSourceOpDesc.scala | 6 +- .../scan/json/JSONLScanSourceOpExec.scala | 4 +- 13 files changed, 140 insertions(+), 179 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 22d7672d1d4..6c2c8cd38cf 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -5,11 +5,22 @@ import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import org.jooq.types.UInteger import java.io.{File, FileOutputStream, InputStream} -import java.net.URI -import java.nio.file.{Files, Path} +import java.net.{URI, URLDecoder} +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} -class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) +class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { + // Extract path components and decode them + private val pathParts = uri.getPath + .stripPrefix("/") + .split("/") + .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + + private val did = pathParts(0).toInt + private val datasetVersionHash = pathParts(1) + private val fileRelativePath = Paths.get(pathParts.drop(2).head, pathParts.drop(2).tail: _*) + private var tempFile: Option[File] = None override def getURI: URI = @@ -53,9 +64,15 @@ class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath } override def remove(): Unit = { + // first remove the temporary file tempFile match { case Some(file) => Files.delete(file.toPath) case None => // Do nothing } + // then remove the dataset file + GitVersionControlLocalFileStorage.removeFileFromRepo( + PathUtils.getDatasetPath(UInteger.valueOf(did)), + PathUtils.getDatasetPath(UInteger.valueOf(did)).resolve(fileRelativePath) + ) } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 752278f0526..bf84370cd8b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -1,6 +1,7 @@ package edu.uci.ics.texera.web.resource.dashboard.user.dataset import edu.uci.ics.amber.engine.common.Utils.withTransaction +import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege @@ -24,7 +25,7 @@ import org.jooq.{DSLContext, EnumType} import play.api.libs.json.Json import java.io.{IOException, InputStream, OutputStream} -import java.net.URLDecoder +import java.net.{URI, URLDecoder} import java.nio.charset.StandardCharsets import java.nio.file.Files import java.util @@ -33,7 +34,7 @@ import java.util.zip.{ZipEntry, ZipOutputStream} import javax.annotation.security.RolesAllowed import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import javax.ws.rs._ -import scala. jdk. CollectionConverters._ +import scala.jdk.CollectionConverters._ import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.jdk.OptionConverters._ @@ -182,7 +183,7 @@ object DatasetResource { // DatasetOperation defines the operations that will be applied when creating a new dataset version private case class DatasetOperation( filesToAdd: Map[java.nio.file.Path, InputStream], - filesToRemove: List[java.nio.file.Path] + filesToRemove: List[URI] ) private def parseUserUploadedFormToDatasetOperations( @@ -193,7 +194,7 @@ object DatasetResource { // Mutable collections for constructing DatasetOperation val filesToAdd = mutable.Map[java.nio.file.Path, InputStream]() - val filesToRemove = mutable.ListBuffer[java.nio.file.Path]() + val filesToRemove = mutable.ListBuffer[URI]() val fields = multiPart.getFields.keySet.iterator() // Get all field names @@ -218,13 +219,7 @@ object DatasetResource { .parse(filePathsValue) .as[List[String]] .foreach(pathStr => { - // TODO: 
refactor this part - val (_, _, _, fileRelativePath) = FileResolver.parseFileNameForDataset(context, pathStr) - fileRelativePath.asScala - .map { path => - filesToRemove += datasetPath - .resolve(path) // When path exists, resolve it and add to filesToRemove - } + filesToRemove += FileResolver.resolve(pathStr) }) } } @@ -328,11 +323,8 @@ object DatasetResource { GitVersionControlLocalFileStorage.writeFileToRepo(datasetPath, filePath, fileStream) } - datasetOperation.filesToRemove.foreach { filePath => - GitVersionControlLocalFileStorage.removeFileFromRepo( - datasetPath, - filePath - ) + datasetOperation.filesToRemove.foreach { fileUri => + new DatasetFileDocument(fileUri).remove() } } ) @@ -690,18 +682,16 @@ class DatasetResource { @Auth user: SessionUser, @QueryParam("includeVersions") includeVersions: Boolean = false, @QueryParam("includeFileNodes") includeFileNodes: Boolean = false, - @QueryParam("path") filePathStr: String + @QueryParam("did") datasetId: UInteger, ): ListDatasetsResponse = { val uid = user.getUid withTransaction(context)(ctx => { var accessibleDatasets: ListBuffer[DashboardDataset] = ListBuffer() - if (filePathStr != null && filePathStr.nonEmpty) { - // if the file path is given, then only fetch the dataset and version this file is belonging to - val decodedPathStr = URLDecoder.decode(filePathStr, StandardCharsets.UTF_8.name()) - val (ownerEmail, dataset, version, _) = - FileResolver.parseFileNameForDataset(ctx, decodedPathStr) - val accessPrivilege = getDatasetUserAccessPrivilege(ctx, dataset.getDid, uid) + if (datasetId != null) { + // if dataset id is given, retrieve only one dataset + val dataset = getDatasetByID(ctx, datasetId) + val accessPrivilege = getDatasetUserAccessPrivilege(ctx, datasetId, uid) if ( accessPrivilege == DatasetUserAccessPrivilege.NONE && dataset.getIsPublic == DATASET_IS_PRIVATE ) { @@ -709,15 +699,10 @@ class DatasetResource { } accessibleDatasets = accessibleDatasets :+ DashboardDataset( dataset = dataset, - 
ownerEmail = ownerEmail, + ownerEmail = getOwner(ctx, datasetId).getEmail, accessPrivilege = accessPrivilege, isOwner = dataset.getOwnerUid == uid, - versions = List( - DashboardDatasetVersion( - datasetVersion = version, - fileNodes = List() - ) - ), + versions = List(), size = calculateLatestDatasetVersionSize(dataset.getDid) ) } else { @@ -789,7 +774,7 @@ class DatasetResource { PathUtils.getDatasetPath(did), version.getVersionHash ) - .asScala) + .asScala.toList) } DashboardDatasetVersion( version, @@ -926,30 +911,20 @@ class DatasetResource { val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) withTransaction(context)(ctx => { - val (_, dataset, dsVersion, fileRelativePath) = - FileResolver.parseFileNameForDataset(ctx, decodedPathStr) - - val did = dataset.getDid - val dvid = dsVersion.getDvid - - if (!userHasReadAccess(ctx, dataset.getDid, uid)) { - throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) - } - - val targetDatasetPath = PathUtils.getDatasetPath(did) - val datasetVersion = getDatasetVersionByID(ctx, dvid) - + val fileUri = FileResolver.resolve(decodedPathStr) val streamingOutput = new StreamingOutput() { override def write(output: OutputStream): Unit = { - fileRelativePath.asScala - .foreach { path => - GitVersionControlLocalFileStorage.retrieveFileContentOfVersion( - targetDatasetPath, - datasetVersion.getVersionHash, - targetDatasetPath.resolve(path), - output - ) + val inputStream = new DatasetFileDocument(fileUri).asInputStream() + try { + val buffer = new Array[Byte](8192) // buffer size + var bytesRead = inputStream.read(buffer) + while (bytesRead != -1) { + output.write(buffer, 0, bytesRead) + bytesRead = inputStream.read(buffer) } + } finally { + inputStream.close() + } } } @@ -987,16 +962,21 @@ class DatasetResource { @GET @Path("/version-zip") def retrieveDatasetVersionZip( - @QueryParam("path") pathStr: String, - @QueryParam("getLatest") getLatest: Boolean, @QueryParam("did") did: UInteger, 
+ @QueryParam("dvid") dvid: UInteger, @Auth user: SessionUser ): Response = { - val (dataset, version) = if (getLatest) { + val (dataset, version) = if (dvid == null) { getLatestVersionInfo(did, user) } else { - resolveAndValidatePath(pathStr, user) + withTransaction(context) {ctx => + if (!userHasReadAccess(ctx, did, dvid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + (getDatasetByID(ctx, did), getDatasetVersionByID(ctx, dvid)) + } } + val targetDatasetPath = PathUtils.getDatasetPath(dataset.getDid) val fileNodes = GitVersionControlLocalFileStorage.retrieveRootFileNodesOfVersion( targetDatasetPath, @@ -1053,18 +1033,6 @@ class DatasetResource { .build() } - private def resolveAndValidatePath( - pathStr: String, - user: SessionUser - ): (Dataset, DatasetVersion) = { - val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) - val (_, dataset, dsVersion, _) = - FileResolver.parseFileNameForDataset(context, decodedPathStr) - - validateUserAccess(dataset.getDid, user.getUid) - (dataset, dsVersion) - } - private def getLatestVersionInfo(did: UInteger, user: SessionUser): (Dataset, DatasetVersion) = { validateUserAccess(did, user.getUid) val dataset = getDatasetByID(context, did) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala new file mode 100644 index 00000000000..08de3b603c5 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala @@ -0,0 +1,23 @@ +package edu.uci.ics.texera.workflow.common.storage + +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME + +import java.net.URI + +object FileOpener { + type FileHandle = ReadonlyVirtualDocument[_] + def openFile(fileUri: 
URI): FileHandle = { + fileUri.getScheme match { + case DATASET_FILE_URI_SCHEME => + new DatasetFileDocument(fileUri) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 5d06175ad25..c8bee7d7669 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -1,24 +1,20 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction -import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import org.apache.commons.vfs2.FileNotFoundException -import org.jooq.DSLContext -import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder} +import java.net.{URI, URLEncoder} import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Path, Paths} +import java.nio.file.{Files, Paths} import scala.util.{Success, Try} object FileResolver { - type FileHandle = ReadonlyVirtualDocument[_] - - private val DatasetFileUriScheme = "vfs" + val DATASET_FILE_URI_SCHEME = "vfs" /** * Attempts to resolve the given fileName using a list of resolver functions. 
@@ -39,39 +35,6 @@ object FileResolver { .getOrElse(throw new FileNotFoundException(fileName)) } - /** - * Open a file handle for the given fileUri - * @param fileUri the uri pointing to the file - * @return - */ - def open(fileUri: URI): FileHandle = { - fileUri.getScheme match { - case DatasetFileUriScheme => - // Extract path components and decode them - val pathParts = fileUri.getPath - .stripPrefix("/") - .split("/") - .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) - - if (pathParts.length < 3) { - throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") - } - - new DatasetFileDocument( - did = pathParts(0).toInt, - datasetVersionHash = pathParts(1), - fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) - ) - - case "file" => - // For local files, create a ReadonlyLocalFileDocument - new ReadonlyLocalFileDocument(fileUri) - - case _ => - throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") - } - } - /** * Attempts to resolve a local file path. * @throws FileNotFoundException if the local file does not exist @@ -90,7 +53,7 @@ object FileResolver { * * The fileName format should be: /ownerEmail/datasetName/versionName/fileRelativePath * e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv - * The output dataset URI format is: {DatasetFileUriScheme}:///{did}/{versionHash}/file-path + * The output dataset URI format is: {DATASET_FILE_URI_SCHEME}:///{did}/{versionHash}/file-path * e.g. 
vfs:///15/adeq233td/some/dir/file.txt * * @param fileName the name of the file to attempt resolving as a DatasetFileDocument @@ -98,54 +61,50 @@ object FileResolver { * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ private def datasetResolveFunc(fileName: String): URI = { - withTransaction(SqlServer.createDSLContext()) { ctx => - val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) + val filePath = Paths.get(fileName) + val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray - if (dataset == null || datasetVersion == null) { - throw new FileNotFoundException(s"Dataset file $fileName not found.") + // extract info from the user-given fileName + val ownerEmail = pathSegments(0) + val datasetName = pathSegments(1) + val versionName = pathSegments(2) + val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + + // fetch the dataset and version from DB to get dataset ID and version hash + val (dataset, datasetVersion) = + withTransaction(SqlServer.createDSLContext()) { ctx => + // fetch the dataset from DB + val dataset = ctx + .select(DATASET.fields: _*) + .from(DATASET) + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + .where(USER.EMAIL.eq(ownerEmail)) + .and(DATASET.NAME.eq(datasetName)) + .fetchOneInto(classOf[Dataset]) + + // fetch the dataset version from DB + val datasetVersion = ctx + .selectFrom(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(dataset.getDid)) + .and(DATASET_VERSION.NAME.eq(versionName)) + .fetchOneInto(classOf[DatasetVersion]) + + if (dataset == null || datasetVersion == null) { + throw new FileNotFoundException(s"Dataset file $fileName not found.") + } + (dataset, datasetVersion) } // Construct path as /{did}/{versionHash}/file-path - val did = dataset.getDid.intValue() - val versionHash = datasetVersion.getVersionHash val encodedPath = - 
s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" + s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" try { - new URI(DatasetFileUriScheme, null, encodedPath, null) + new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) } catch { - case e: URISyntaxException => - throw e + case e: Exception => + throw new FileNotFoundException(s"Dataset file $fileName not found.") } } - } - - def parseFileNameForDataset( - ctx: DSLContext, - fileName: String - ): (String, Dataset, DatasetVersion, Path) = { - val filePath = Paths.get(fileName) - val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray - - val ownerEmail = pathSegments(0) - val datasetName = pathSegments(1) - val versionName = pathSegments(2) - val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) - - val dataset = ctx - .select(DATASET.fields: _*) - .from(DATASET) - .leftJoin(USER) - .on(USER.UID.eq(DATASET.OWNER_UID)) - .where(USER.EMAIL.eq(ownerEmail)) - .and(DATASET.NAME.eq(datasetName)) - .fetchOneInto(classOf[Dataset]) - - val datasetVersion = ctx - .selectFrom(DATASET_VERSION) - .where(DATASET_VERSION.DID.eq(dataset.getDid)) - .and(DATASET_VERSION.NAME.eq(versionName)) - .fetchOneInto(classOf[DatasetVersion]) - (ownerEmail, dataset, datasetVersion, fileRelativePath) - } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 5756252995d..8878d9f2092 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -4,7 +4,7 @@ import 
edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray @@ -27,7 +27,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = FileResolver.open(new URI(fileUri)).asInputStream() + val is = FileOpener.openFile(new URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 82817358304..c638abfd701 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -5,17 +5,16 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.Schema -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} import edu.uci.ics.amber.engine.common.workflow.OutputPort 
import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME import org.apache.commons.lang3.builder.EqualsBuilder import java.net.URI abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { - /** in the case we do not want to read the entire large file, but only * the first a few lines of it to do the type inference. */ diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 2cc906bce4d..9227fd887d2 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, FileInputStream, IOException, InputStreamReader} @@ -74,7 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() + val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() val inputReader = new 
InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index 05942761d7b..7213bb0456a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -6,7 +6,7 @@ import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader @@ -70,7 +70,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - FileResolver.open(new URI(fileUri)).asInputStream(), + FileOpener.openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 66ecde61726..534fe28a154 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import 
edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} @@ -41,7 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val file = FileResolver.open(new URI(fileUri.get)).asFile() + val file = FileOpener.openFile(new URI(fileUri.get)).asFile() val totalBytes: Long = file.length() PhysicalOp @@ -82,7 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileResolver.open(new URI(fileUri.get)).asFile() + val file = FileOpener.openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 5524e6ee51a..d5a32cfa8a9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import 
edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} @@ -72,7 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileResolver.open(new URI(fileUri.get)).asFile() + val file = FileOpener.openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 7c11005d643..fb764793b8f 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,13 +2,8 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{ - Attribute, - AttributeTypeUtils, - Schema, - TupleLike -} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.net.URI @@ -53,7 +48,7 @@ class 
CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } - val filePath = FileResolver.open(new URI(fileUri)).asFile().toPath + val filePath = FileOpener.openFile(new URI(fileUri)).asFile().toPath reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 99547e0182a..79542ffee07 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, Workf import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -39,7 +39,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() + val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each 
worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -85,7 +85,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() + val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index f00a07802fc..43c4cdfff87 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -44,7 +44,7 @@ class JSONLScanSourceOpExec private[json] ( schema = schemaFunc() reader = new BufferedReader( new InputStreamReader( - FileResolver.open(new URI(fileUri)).asInputStream(), + FileOpener.openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) ) From 4fa96d131eeca2696135e107db11326f358b7769 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 19:12:19 -0700 Subject: [PATCH 09/18] fmt 
--- .../common/storage/DatasetFileDocument.scala | 3 +- .../user/dataset/DatasetResource.scala | 31 ++++++++++++++----- .../workflow/common/storage/FileOpener.scala | 6 +++- .../common/storage/FileResolver.scala | 21 +++++++------ .../source/scan/ScanSourceOpDesc.scala | 7 ++++- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 7 ++++- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 6c2c8cd38cf..8b02580c00d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -9,8 +9,7 @@ import java.net.{URI, URLDecoder} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} -class DatasetFileDocument(uri: URI) - extends VirtualDocument[Nothing] { +class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { // Extract path components and decode them private val pathParts = uri.getPath .stripPrefix("/") diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index bf84370cd8b..d39e4395ca5 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -9,11 +9,23 @@ import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION import 
edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ + DatasetDao, + DatasetUserAccessDao, + DatasetVersionDao +} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ + Dataset, + DatasetUserAccess, + DatasetVersion, + User +} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource._ import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{context, _} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ + DatasetFileNode, + PhysicalFileNode +} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import edu.uci.ics.texera.workflow.common.storage.FileResolver @@ -682,7 +694,7 @@ class DatasetResource { @Auth user: SessionUser, @QueryParam("includeVersions") includeVersions: Boolean = false, @QueryParam("includeFileNodes") includeFileNodes: Boolean = false, - @QueryParam("did") datasetId: UInteger, + @QueryParam("did") datasetId: UInteger ): ListDatasetsResponse = { val uid = user.getUid withTransaction(context)(ctx => { @@ -731,7 +743,8 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }).asScala + }) + .asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist @@ -774,7 +787,8 @@ class DatasetResource { PathUtils.getDatasetPath(did), version.getVersionHash ) - .asScala.toList) + .asScala + .toList) } DashboardDatasetVersion( version, @@ -841,7 +855,8 
@@ class DatasetResource { datasetPath, latestVersion.getVersionHash ) - .asScala.toList + .asScala + .toList ) ) .head @@ -969,7 +984,7 @@ class DatasetResource { val (dataset, version) = if (dvid == null) { getLatestVersionInfo(did, user) } else { - withTransaction(context) {ctx => + withTransaction(context) { ctx => if (!userHasReadAccess(ctx, did, dvid)) { throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala index 08de3b603c5..308d25b33ec 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala @@ -1,6 +1,10 @@ package edu.uci.ics.texera.workflow.common.storage -import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} +import edu.uci.ics.amber.engine.common.storage.{ + DatasetFileDocument, + ReadonlyLocalFileDocument, + ReadonlyVirtualDocument +} import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME import java.net.URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index c8bee7d7669..fb4f5b70967 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -96,15 +96,18 @@ object FileResolver { (dataset, datasetVersion) } - // Construct path as /{did}/{versionHash}/file-path - val encodedPath = - s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, 
StandardCharsets.UTF_8)).mkString("/")}" + // Construct path as /{did}/{versionHash}/file-path + val encodedPath = + s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString + .split("/") + .map(URLEncoder.encode(_, StandardCharsets.UTF_8)) + .mkString("/")}" - try { - new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) - } catch { - case e: Exception => - throw new FileNotFoundException(s"Dataset file $fileName not found.") - } + try { + new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) + } catch { + case e: Exception => + throw new FileNotFoundException(s"Dataset file $fileName not found.") } + } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index c638abfd701..3b353b1adfe 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -5,7 +5,11 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.Schema -import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} +import edu.uci.ics.amber.engine.common.storage.{ + DatasetFileDocument, + ReadonlyLocalFileDocument, + ReadonlyVirtualDocument +} import edu.uci.ics.amber.engine.common.workflow.OutputPort import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor @@ -15,6 +19,7 @@ import org.apache.commons.lang3.builder.EqualsBuilder import java.net.URI abstract class ScanSourceOpDesc 
extends SourceOperatorDescriptor { + /** in the case we do not want to read the entire large file, but only * the first a few lines of it to do the type inference. */ diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index fb764793b8f..80eed48ea76 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,7 +2,12 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.amber.engine.common.model.tuple.{ + Attribute, + AttributeTypeUtils, + Schema, + TupleLike +} import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod From e84070b92cc06e732a4018070ce21fc3b2430b5b Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 13:14:16 -0700 Subject: [PATCH 10/18] merge dataset resource --- .../web/resource/dashboard/user/dataset/DatasetResource.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 49e5c5ccecf..b0d83ab5b21 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -724,7 +724,7 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }) + }).asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist From 402f413c424029fbc797082f9426a223a8a38d30 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 13:35:29 -0700 Subject: [PATCH 11/18] move open to virtual document --- .../common/storage/VirtualDocument.scala | 20 ++++++++++++++ .../user/dataset/DatasetResource.scala | 3 ++- .../workflow/common/storage/FileOpener.scala | 27 ------------------- .../source/scan/FileScanSourceOpExec.scala | 5 ++-- .../source/scan/csv/CSVScanSourceOpDesc.scala | 4 +-- .../source/scan/csv/CSVScanSourceOpExec.scala | 5 ++-- .../csv/ParallelCSVScanSourceOpDesc.scala | 6 ++--- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 6 ++--- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 4 +-- .../scan/json/JSONLScanSourceOpDesc.scala | 6 ++--- .../scan/json/JSONLScanSourceOpExec.scala | 5 ++-- 11 files changed, 41 insertions(+), 50 deletions(-) delete mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index 0fb0c1e7897..b586958d674 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -1,8 +1,28 @@ package edu.uci.ics.amber.engine.common.storage +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME + import java.io.{File, InputStream} import java.net.URI +object VirtualDocument { + type FileHandle = ReadonlyVirtualDocument[_] + + def 
openFile(fileUri: URI): FileHandle = { + fileUri.getScheme match { + case DATASET_FILE_URI_SCHEME => + new DatasetFileDocument(fileUri) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } +} + /** * TODO: break this base definition into more self-contained pieces, including Writeonly, IteratorBased * VirtualDocument provides the abstraction of doing read/write/copy/delete operations over a single resource in Texera system. diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index b0d83ab5b21..742545af59e 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -724,7 +724,8 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }).asScala + }) + .asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala deleted file mode 100644 index 308d25b33ec..00000000000 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala +++ /dev/null @@ -1,27 +0,0 @@ -package edu.uci.ics.texera.workflow.common.storage - -import edu.uci.ics.amber.engine.common.storage.{ - DatasetFileDocument, - ReadonlyLocalFileDocument, - ReadonlyVirtualDocument -} -import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME - -import java.net.URI - -object 
FileOpener { - type FileHandle = ReadonlyVirtualDocument[_] - def openFile(fileUri: URI): FileHandle = { - fileUri.getScheme match { - case DATASET_FILE_URI_SCHEME => - new DatasetFileDocument(fileUri) - - case "file" => - // For local files, create a ReadonlyLocalFileDocument - new ReadonlyLocalFileDocument(fileUri) - - case _ => - throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") - } - } -} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 8878d9f2092..702c3f8a0db 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -2,9 +2,8 @@ package edu.uci.ics.texera.workflow.operators.source.scan import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray @@ -27,7 +26,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = FileOpener.openFile(new URI(fileUri)).asInputStream() + val is = openFile(new URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new 
BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 9227fd887d2..47489ea64d6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, FileInputStream, IOException, InputStreamReader} @@ -74,7 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() + val stream = openFile(new URI(fileUri.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index 7213bb0456a..0117cf02ba0 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -5,8 +5,7 @@ import 
edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader @@ -70,7 +69,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - FileOpener.openFile(new URI(fileUri)).asInputStream(), + openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 534fe28a154..6c589169104 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import 
java.io.{File, IOException} @@ -41,7 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val file = FileOpener.openFile(new URI(fileUri.get)).asFile() + val file = openFile(new URI(fileUri.get)).asFile() val totalBytes: Long = file.length() PhysicalOp @@ -82,7 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileOpener.openFile(new URI(fileUri.get)).asFile() + val file = openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index d5a32cfa8a9..7498877171b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,10 +9,10 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc -import java.io.{File, IOException} +import java.io.IOException import java.net.URI class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { @@ 
-72,7 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileOpener.openFile(new URI(fileUri.get)).asFile() + val file = openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 80eed48ea76..cf455e66d70 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -8,7 +8,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{ Schema, TupleLike } -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.net.URI @@ -53,7 +53,7 @@ class CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } - val filePath = FileOpener.openFile(new URI(fileUri)).asFile().toPath + val filePath = openFile(new URI(fileUri)).asFile().toPath reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 
79542ffee07..6a1eeae9bc8 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, Workf import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -39,7 +39,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() + val stream = openFile(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -85,7 +85,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() + val stream = openFile(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index 43c4cdfff87..5553f676123 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -1,11 +1,10 @@ package edu.uci.ics.texera.workflow.operators.source.scan.json import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -44,7 +43,7 @@ class JSONLScanSourceOpExec private[json] ( schema = schemaFunc() reader = new BufferedReader( new InputStreamReader( - FileOpener.openFile(new URI(fileUri)).asInputStream(), + openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) ) From 4d8e72bc8c6f6e7b3ef6b38178b343329f4de8ab Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 17:32:47 -0700 Subject: [PATCH 12/18] fix tests --- .../source/scan/csv/CSVScanSourceOpDesc.scala | 2 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 2 +- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 2 +- .../scan/json/JSONLScanSourceOpDesc.scala | 3 +++ .../scan/csv/CSVScanSourceOpDescSpec.scala | 11 +++++++++ .../scan/text/FileScanSourceOpDescSpec.scala | 24 ++++++++----------- 6 files changed, 27 insertions(+), 17 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 47489ea64d6..d61ab6f7fcb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -70,7 +70,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - if (customDelimiter.isEmpty) { + if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 6c589169104..57af21d4a47 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -79,7 +79,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - if (customDelimiter.isEmpty) { + if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } val file = openFile(new URI(fileUri.get)).asFile() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 7498877171b..8bc67f5629d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -69,7 +69,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): 
Schema = { - if (customDelimiter.isEmpty) { + if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } val file = openFile(new URI(fileUri.get)).asFile() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 6a1eeae9bc8..066203e115e 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -85,6 +85,9 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { + if (fileUri.isEmpty) { + return null + } val stream = openFile(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala index 21ce3de60a0..858b2188309 100644 --- a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala @@ -4,6 +4,7 @@ import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.{AttributeType, Schema} import edu.uci.ics.amber.engine.common.workflow.PortIdentity import WorkflowContext.{DEFAULT_EXECUTION_ID, DEFAULT_WORKFLOW_ID} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -27,6 +28,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with 
BeforeAndAfter { parallelCsvScanSourceOpDesc.customDelimiter = Some(",") parallelCsvScanSourceOpDesc.hasHeader = true parallelCsvScanSourceOpDesc.setContext(workflowContext) + parallelCsvScanSourceOpDesc.setFileUri( + FileResolver.resolve(parallelCsvScanSourceOpDesc.fileName.get) + ) val inferredSchema: Schema = parallelCsvScanSourceOpDesc.inferSchema() assert(inferredSchema.getAttributes.length == 14) @@ -42,6 +46,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { parallelCsvScanSourceOpDesc.customDelimiter = Some(",") parallelCsvScanSourceOpDesc.hasHeader = false parallelCsvScanSourceOpDesc.setContext(workflowContext) + parallelCsvScanSourceOpDesc.setFileUri( + FileResolver.resolve(parallelCsvScanSourceOpDesc.fileName.get) + ) val inferredSchema: Schema = parallelCsvScanSourceOpDesc.inferSchema() @@ -56,6 +63,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(",") csvScanSourceOpDesc.hasHeader = true csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() @@ -70,6 +78,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(",") csvScanSourceOpDesc.hasHeader = false csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() @@ -85,6 +94,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(";") csvScanSourceOpDesc.hasHeader = false csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() @@ -100,6 +110,7 @@ class 
CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(";") csvScanSourceOpDesc.hasHeader = false csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) assert( !csvScanSourceOpDesc diff --git a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala index 7c16a38d05c..de1d60c51b8 100644 --- a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.texera.workflow.operators.source.scan.text import edu.uci.ics.amber.engine.common.model.tuple.{AttributeType, Schema, SchemaEnforceable, Tuple} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.{ FileAttributeType, FileDecodingMethod, @@ -18,7 +19,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { before { fileScanSourceOpDesc = new FileScanSourceOpDesc() - fileScanSourceOpDesc.fileUri = Left(TestTextFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestTextFilePath).toASCIIString) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.UTF_8 } @@ -61,8 +62,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -87,13 +87,12 @@ class FileScanSourceOpDescSpec extends 
AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with CRLF separators into corresponding output tuples" in { - fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestCRLFTextFilePath).toASCIIString) fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -121,8 +120,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -148,12 +146,11 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text into corresponding output INTEGER tuples" in { - fileScanSourceOpDesc.fileUri = Left(TestNumbersFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestNumbersFilePath).toASCIIString) fileScanSourceOpDesc.attributeType = FileAttributeType.INTEGER fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -178,14 +175,13 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with US_ASCII encoding" in { - 
fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestCRLFTextFilePath).toASCIIString) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.ASCII fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, From 5fa9ca7377b17cf96ae86c1b8b58565fa9de2a15 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 17:40:33 -0700 Subject: [PATCH 13/18] fmt --- .../engine/common/executor/SourceOperatorExecutor.scala | 3 --- .../workflow/operators/source/scan/ScanSourceOpDesc.scala | 6 ------ .../operators/source/scan/csv/CSVScanSourceOpDesc.scala | 2 +- .../source/scan/csv/ParallelCSVScanSourceOpDesc.scala | 2 +- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala index d4c92b19a2a..453a41c5ac6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala @@ -1,11 +1,8 @@ package edu.uci.ics.amber.engine.common.executor import edu.uci.ics.amber.engine.common.model.tuple.{Tuple, TupleLike} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.workflow.PortIdentity -import java.io.{FileInputStream, InputStream} - trait SourceOperatorExecutor extends OperatorExecutor { override def open(): Unit = {} diff --git 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 3b353b1adfe..46ff9196ef0 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -5,15 +5,9 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.Schema -import edu.uci.ics.amber.engine.common.storage.{ - DatasetFileDocument, - ReadonlyLocalFileDocument, - ReadonlyVirtualDocument -} import edu.uci.ics.amber.engine.common.workflow.OutputPort import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor -import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME import org.apache.commons.lang3.builder.EqualsBuilder import java.net.URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index d61ab6f7fcb..97eeab439e3 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -12,7 +12,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Sc import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc -import 
java.io.{File, FileInputStream, IOException, InputStreamReader} +import java.io.{IOException, InputStreamReader} import java.net.URI class CSVScanSourceOpDesc extends ScanSourceOpDesc { diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 57af21d4a47..76537550f4c 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -12,7 +12,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Sc import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc -import java.io.{File, IOException} +import java.io.IOException import java.net.URI class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { From 238b31cbd73d6618b87564a0d616469c5508e2da Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 15:03:12 -0700 Subject: [PATCH 14/18] fix naming --- .../common/storage/DocumentFactory.scala | 21 +++++++++++++++++++ .../common/storage/VirtualDocument.scala | 18 ---------------- .../source/scan/FileScanSourceOpExec.scala | 4 ++-- .../source/scan/csv/CSVScanSourceOpDesc.scala | 4 ++-- .../source/scan/csv/CSVScanSourceOpExec.scala | 4 ++-- .../csv/ParallelCSVScanSourceOpDesc.scala | 6 +++--- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 4 ++-- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 4 ++-- .../scan/json/JSONLScanSourceOpDesc.scala | 7 +++---- .../scan/json/JSONLScanSourceOpExec.scala | 4 ++-- 10 files changed, 39 insertions(+), 37 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala diff --git 
a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala new file mode 100644 index 00000000000..f216504d096 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala @@ -0,0 +1,21 @@ +package edu.uci.ics.amber.engine.common.storage + +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME + +import java.net.URI + +object DocumentFactory { + def newReadonlyDocument(fileUri: URI): ReadonlyVirtualDocument[_] = { + fileUri.getScheme match { + case DATASET_FILE_URI_SCHEME => + new DatasetFileDocument(fileUri) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } +} diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index b586958d674..7fbd5050556 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -5,24 +5,6 @@ import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_ import java.io.{File, InputStream} import java.net.URI -object VirtualDocument { - type FileHandle = ReadonlyVirtualDocument[_] - - def openFile(fileUri: URI): FileHandle = { - fileUri.getScheme match { - case DATASET_FILE_URI_SCHEME => - new DatasetFileDocument(fileUri) - - case "file" => - // For local files, create a ReadonlyLocalFileDocument - new ReadonlyLocalFileDocument(fileUri) - - case _ => - throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") - } - } -} - /** * TODO: break this base 
definition into more self-contained pieces, including Writeonly, IteratorBased * VirtualDocument provides the abstraction of doing read/write/copy/delete operations over a single resource in Texera system. diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 702c3f8a0db..e7c7be0768a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -3,7 +3,7 @@ package edu.uci.ics.texera.workflow.operators.source.scan import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray @@ -26,7 +26,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = openFile(new URI(fileUri)).asInputStream() + val is = DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 97eeab439e3..5e5ccdfd26b 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{IOException, InputStreamReader} @@ -74,7 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val stream = openFile(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index 0117cf02ba0..245e67e1dfe 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} -import 
edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader @@ -69,7 +69,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - openFile(new URI(fileUri)).asInputStream(), + DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 76537550f4c..4496fd3fd18 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.{DocumentFactory, VirtualDocument} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.IOException @@ -41,7 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val file = openFile(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() val 
totalBytes: Long = file.length() PhysicalOp @@ -82,7 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } - val file = openFile(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 8bc67f5629d..38625b62b0f 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.IOException @@ -72,7 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } - val file = openFile(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index cf455e66d70..f6b2bd96c16 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -8,7 +8,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{ Schema, TupleLike } -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.net.URI @@ -53,7 +53,7 @@ class CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } - val filePath = openFile(new URI(fileUri)).asFile().toPath + val filePath = DocumentFactory.newReadonlyDocument(new URI(fileUri)).asFile().toPath reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 066203e115e..c622a6e8853 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -4,12 +4,11 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} 
import com.fasterxml.jackson.databind.JsonNode import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.OpExecInitInfo import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, DocumentFactory} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -39,7 +38,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = openFile(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -88,7 +87,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { if (fileUri.isEmpty) { return null } - val stream = openFile(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index 5553f676123..f58f9adcb8d 100644 
--- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -4,7 +4,7 @@ import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -43,7 +43,7 @@ class JSONLScanSourceOpExec private[json] ( schema = schemaFunc() reader = new BufferedReader( new InputStreamReader( - openFile(new URI(fileUri)).asInputStream(), + DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) ) From ecad8e33b88839fee1d436efdcecad08f1f36d8a Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 16:08:38 -0700 Subject: [PATCH 15/18] fmt --- .../common/storage/DatasetFileDocument.scala | 32 +++++++++++-------- .../common/storage/FileResolver.scala | 29 ++++++++++++----- .../common/workflow/LogicalPlan.scala | 4 +++ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 8b02580c00d..fa4d740b5c6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -7,25 +7,31 @@ import org.jooq.types.UInteger import java.io.{File, 
FileOutputStream, InputStream} import java.net.{URI, URLDecoder} import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Paths} +import java.nio.file.{Files, Path, Paths} +import scala.jdk.CollectionConverters.IteratorHasAsScala class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { - // Extract path components and decode them - private val pathParts = uri.getPath - .stripPrefix("/") - .split("/") - .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + // Utility function to parse and decode URI segments into individual components + private def parseUri(uri: URI): (Int, String, Path) = { + val segments = Paths.get(uri.getPath).iterator().asScala.map(_.toString).toArray + if (segments.length < 3) + throw new IllegalArgumentException("URI format is incorrect") - private val did = pathParts(0).toInt - private val datasetVersionHash = pathParts(1) - private val fileRelativePath = Paths.get(pathParts.drop(2).head, pathParts.drop(2).tail: _*) + val did = segments(0).toInt + val datasetVersionHash = URLDecoder.decode(segments(1), StandardCharsets.UTF_8) + val decodedRelativeSegments = + segments.drop(2).map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + val fileRelativePath = Paths.get(decodedRelativeSegments.head, decodedRelativeSegments.tail: _*) + + (did, datasetVersionHash, fileRelativePath) + } + + // Extract components from URI using the utility function + private val (did, datasetVersionHash, fileRelativePath) = parseUri(uri) private var tempFile: Option[File] = None - override def getURI: URI = - throw new UnsupportedOperationException( - "The URI cannot be acquired because the file is not physically located" - ) + override def getURI: URI = uri override def asInputStream(): InputStream = { val datasetAbsolutePath = PathUtils.getDatasetPath(UInteger.valueOf(did)) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index fb4f5b70967..af5eb26a47c 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -8,9 +8,11 @@ import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import org.apache.commons.vfs2.FileNotFoundException +import java.io.File import java.net.{URI, URLEncoder} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} +import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.util.{Success, Try} object FileResolver { @@ -68,7 +70,7 @@ object FileResolver { val ownerEmail = pathSegments(0) val datasetName = pathSegments(1) val versionName = pathSegments(2) - val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + val fileRelativePath = Paths.get(pathSegments.drop(3).head, pathSegments.drop(3).tail: _*) // fetch the dataset and version from DB to get dataset ID and version hash val (dataset, datasetVersion) = @@ -96,15 +98,26 @@ object FileResolver { (dataset, datasetVersion) } - // Construct path as /{did}/{versionHash}/file-path - val encodedPath = - s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString - .split("/") - .map(URLEncoder.encode(_, StandardCharsets.UTF_8)) - .mkString("/")}" + // Convert each segment of fileRelativePath to an encoded String + val encodedFileRelativePath = fileRelativePath + .iterator() + .asScala + .map { segment => + URLEncoder.encode(segment.toString, StandardCharsets.UTF_8) + } + .toArray + + // Prepend did and versionHash to the encoded path segments + val allPathSegments = Array( + dataset.getDid.intValue().toString, + datasetVersion.getVersionHash + ) ++ encodedFileRelativePath + + // Build the format 
/{did}/{versionHash}/{fileRelativePath} + val encodedPath = Paths.get(File.separator, allPathSegments: _*) try { - new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) + new URI(DATASET_FILE_URI_SCHEME, "", encodedPath.toString, null) } catch { case e: Exception => throw new FileNotFoundException(s"Dataset file $fileName not found.") diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala index 8af7d60fd05..1574f1cd373 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala @@ -146,6 +146,10 @@ case class LogicalPlan( .toMap } + /** + * Resolve all user-given filenames for the scan source operators to URIs, and call op.setFileUri to set the URI + * @param errorList if given, put errors encountered during resolving into it + */ def resolveScanSourceOpFileName( errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] ): Unit = { From a6d2cef7a2e0435d0ec871895d9f05dc2288fd0c Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 16:12:55 -0700 Subject: [PATCH 16/18] fix fmt --- .../uci/ics/amber/engine/common/storage/VirtualDocument.scala | 2 -- .../operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index 7fbd5050556..0fb0c1e7897 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -1,7 +1,5 @@ package edu.uci.ics.amber.engine.common.storage -import 
edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME - import java.io.{File, InputStream} import java.net.URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 4496fd3fd18..2978cb56992 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.{DocumentFactory, VirtualDocument} +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.IOException From 3cd305127ffe754d3698ad8cf094030854358c4a Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 16:59:55 -0700 Subject: [PATCH 17/18] fix test --- .../edu/uci/ics/amber/engine/e2e/TestOperators.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala index 2dda4568e49..cfb382e7544 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala @@ -1,10 +1,7 @@ package edu.uci.ics.amber.engine.e2e -import 
edu.uci.ics.texera.workflow.operators.aggregate.{ - AggregateOpDesc, - AggregationFunction, - AggregationOperation -} +import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.operators.aggregate.{AggregateOpDesc, AggregationFunction, AggregationOperation} import edu.uci.ics.texera.workflow.operators.hashJoin.HashJoinOpDesc import edu.uci.ics.texera.workflow.operators.keywordSearch.KeywordSearchOpDesc import edu.uci.ics.texera.workflow.operators.sink.managed.ProgressiveSinkOpDesc @@ -48,6 +45,7 @@ object TestOperators { csvHeaderlessOp.fileName = Some(fileName) csvHeaderlessOp.customDelimiter = Some(",") csvHeaderlessOp.hasHeader = header + csvHeaderlessOp.setFileUri(FileResolver.resolve(fileName)) csvHeaderlessOp } @@ -56,6 +54,7 @@ object TestOperators { val jsonlOp = new JSONLScanSourceOpDesc jsonlOp.fileName = Some(fileName) jsonlOp.flatten = flatten + jsonlOp.setFileUri(FileResolver.resolve(fileName)) jsonlOp } From 5a75f26e10186704e6de6e648b45051d38d60408 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 17:55:43 -0700 Subject: [PATCH 18/18] fmt --- .../scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala index cfb382e7544..0b891472dcb 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala @@ -1,7 +1,11 @@ package edu.uci.ics.amber.engine.e2e import edu.uci.ics.texera.workflow.common.storage.FileResolver -import edu.uci.ics.texera.workflow.operators.aggregate.{AggregateOpDesc, AggregationFunction, AggregationOperation} +import edu.uci.ics.texera.workflow.operators.aggregate.{ + AggregateOpDesc, + AggregationFunction, + AggregationOperation +} import 
edu.uci.ics.texera.workflow.operators.hashJoin.HashJoinOpDesc import edu.uci.ics.texera.workflow.operators.keywordSearch.KeywordSearchOpDesc import edu.uci.ics.texera.workflow.operators.sink.managed.ProgressiveSinkOpDesc