From 50bebcc95344eb7b6a5936439598b656fa3ad7f9 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sat, 26 Oct 2024 23:08:50 -0700 Subject: [PATCH 01/18] add Readonly document and change source op --- .../executor/SourceOperatorExecutor.scala | 17 --- .../common/storage/DatasetFileDocument.scala | 24 ++-- .../storage/ReadonlyLocalFileDocument.scala | 67 +++++++++++ .../storage/ReadonlyVirtualDocument.scala | 64 +++++++++++ .../common/storage/VirtualDocument.scala | 2 +- .../user/dataset/DatasetResource.scala | 92 ---------------- .../common/storage/FileResolver.scala | 104 ++++++++++++++++-- .../source/scan/FileScanSourceOpDesc.scala | 4 +- .../source/scan/FileScanSourceOpExec.scala | 7 +- .../source/scan/ScanSourceOpDesc.scala | 18 +-- .../source/scan/csv/CSVScanSourceOpDesc.scala | 14 +-- .../source/scan/csv/CSVScanSourceOpExec.scala | 7 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 20 +--- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 23 +--- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 14 +-- .../scan/json/JSONLScanSourceOpDesc.scala | 8 +- .../scan/json/JSONLScanSourceOpExec.scala | 7 +- .../scan/text/FileScanSourceOpDescSpec.scala | 18 +-- 18 files changed, 287 insertions(+), 223 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala index baff229db0b..d4c92b19a2a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala @@ -25,21 +25,4 @@ trait SourceOperatorExecutor extends OperatorExecutor { // We should move this to 
onFinishAllPorts later. produceTuple().map(t => (t, Option.empty)) } - - // this function create the input stream accordingly: - // - if filePath is set, create the stream from the file - // - if fileDesc is set, create the stream via JGit call - def createInputStream(filePath: String, datasetFileDocument: DatasetFileDocument): InputStream = { - if (filePath != null && datasetFileDocument != null) { - throw new RuntimeException( - "File Path and Dataset File Descriptor cannot present at the same time." - ) - } - if (filePath != null) { - new FileInputStream(filePath) - } else { - // create stream from dataset file desc - datasetFileDocument.asInputStream() - } - } } diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index bb5ef6b8e2b..3f013825ccb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -1,16 +1,15 @@ package edu.uci.ics.amber.engine.common.storage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils +import org.jooq.types.UInteger -import java.io.{File, InputStream, FileOutputStream} +import java.io.{File, FileOutputStream, InputStream} import java.net.URI import java.nio.file.{Files, Path} -class DatasetFileDocument(fileFullPath: Path) extends VirtualDocument[Nothing] { - - private val (_, dataset, datasetVersion, fileRelativePath) = - DatasetResource.resolvePath(fileFullPath, shouldContainFile = true) - +class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) extends VirtualDocument[Nothing] { private var tempFile: 
Option[File] = None override def getURI: URI = @@ -19,12 +18,13 @@ class DatasetFileDocument(fileFullPath: Path) extends VirtualDocument[Nothing] { ) override def asInputStream(): InputStream = { - fileRelativePath match { - case Some(path) => - DatasetResource.getDatasetFile(dataset.getDid, datasetVersion.getDvid, path) - case None => - throw new IllegalArgumentException("File relative path is missing.") - } + val datasetAbsolutePath = PathUtils.getDatasetPath(UInteger.valueOf(did)) + GitVersionControlLocalFileStorage + .retrieveFileContentOfVersionAsInputStream( + datasetAbsolutePath, + datasetVersionHash, + datasetAbsolutePath.resolve(fileRelativePath) + ) } override def asFile(): File = { diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala new file mode 100644 index 00000000000..86873e2c525 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala @@ -0,0 +1,67 @@ +package edu.uci.ics.amber.engine.common.storage + +import java.io.{File, FileInputStream, InputStream} +import java.net.URI +import java.nio.file.Path + +/** + * ReadonlyLocalFileDocument provides a read-only abstraction over a local file. + * Implements ReadonlyVirtualDocument without requiring a specific data type T. + * Unsupported methods throw NotImplementedError. + */ +class ReadonlyLocalFileDocument(uri: URI) extends ReadonlyVirtualDocument[Nothing] { + + /** + * Get the URI of the corresponding document. + * @return the URI of the document + */ + override def getURI: URI = uri + + /** + * Get the file as an input stream for read operations. + * @return InputStream to read from the file + */ + override def asInputStream(): InputStream = new FileInputStream(new File(uri)) + + /** + * Get the file as an input stream for read operations. 
+ * + * @return InputStream to read from the file + */ + override def asFile(): File = new File(uri) + + /** + * Find ith item and return. + * For this implementation, items are unsupported, so this method is unimplemented. + */ + override def getItem(i: Int): Nothing = + throw new NotImplementedError("getItem is not supported for ReadonlyLocalFileDocument") + + /** + * Get an iterator that iterates over all indexed items. + * Unsupported in ReadonlyLocalFileDocument. + */ + override def get(): Iterator[Nothing] = + throw new NotImplementedError("get is not supported for ReadonlyLocalFileDocument") + + /** + * Get an iterator of a sequence from index `from` to `until`. + * Unsupported in ReadonlyLocalFileDocument. + */ + override def getRange(from: Int, until: Int): Iterator[Nothing] = + throw new NotImplementedError("getRange is not supported for ReadonlyLocalFileDocument") + + /** + * Get an iterator of all items after the specified index `offset`. + * Unsupported in ReadonlyLocalFileDocument. + */ + override def getAfter(offset: Int): Iterator[Nothing] = + throw new NotImplementedError("getAfter is not supported for ReadonlyLocalFileDocument") + + /** + * Get the count of items in the document. + * Unsupported in ReadonlyLocalFileDocument. 
+ */ + override def getCount: Long = + throw new NotImplementedError("getCount is not supported for ReadonlyLocalFileDocument") +} \ No newline at end of file diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala new file mode 100644 index 00000000000..e9df984f036 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala @@ -0,0 +1,64 @@ +package edu.uci.ics.amber.engine.common.storage + +import java.io.{File, InputStream} +import java.net.URI + +/** + * ReadonlyVirtualDocument provides an abstraction for read operations over a single resource. + * This trait can be implemented by resources that only need to support read-related functionality. + * @tparam T the type of data that can use index to read. + */ +trait ReadonlyVirtualDocument[T] { + + /** + * Get the URI of the corresponding document. + * @return the URI of the document + */ + def getURI: URI + + /** + * Find ith item and return. + * @param i index starting from 0 + * @return data item of type T + */ + def getItem(i: Int): T + + /** + * Get an iterator that iterates over all indexed items. + * @return an iterator that returns data items of type T + */ + def get(): Iterator[T] + + /** + * Get an iterator of a sequence starting from index `from`, until index `until`. + * @param from the starting index (inclusive) + * @param until the ending index (exclusive) + * @return an iterator that returns data items of type T + */ + def getRange(from: Int, until: Int): Iterator[T] + + /** + * Get an iterator of all items after the specified index `offset`. + * @param offset the starting index (exclusive) + * @return an iterator that returns data items of type T + */ + def getAfter(offset: Int): Iterator[T] + + /** + * Get the count of items in the document. 
+ * @return the count of items + */ + def getCount: Long + + /** + * Convert document to an input stream. + * @return the input stream + */ + def asInputStream(): InputStream + + /** + * Convert document to an file + */ + + def asFile(): File +} \ No newline at end of file diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index 162de066543..f590f5dfcb3 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -9,7 +9,7 @@ import java.net.URI * e.g. for dataset file, supports for read/write using file stream are essential, whereas read & write using index are hard to support and are semantically meaningless * @tparam T the type of data that can use index to read and write. */ -abstract class VirtualDocument[T] { +abstract class VirtualDocument[T] extends ReadonlyVirtualDocument[T] { /** * get the URI of corresponding document diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index fb81e62d011..3521c6b3411 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -135,33 +135,6 @@ object DatasetResource { dataset } - private def getDatasetByName( - ctx: DSLContext, - ownerEmail: String, - datasetName: String - ): Dataset = { - ctx - .select(DATASET.fields: _*) - .from(DATASET) - .leftJoin(USER) - .on(USER.UID.eq(DATASET.OWNER_UID)) - .where(USER.EMAIL.eq(ownerEmail)) - .and(DATASET.NAME.eq(datasetName)) - .fetchOneInto(classOf[Dataset]) - } - - private def 
getDatasetVersionByName( - ctx: DSLContext, - did: UInteger, - versionName: String - ): DatasetVersion = { - ctx - .selectFrom(DATASET_VERSION) - .where(DATASET_VERSION.DID.eq(did)) - .and(DATASET_VERSION.NAME.eq(versionName)) - .fetchOneInto(classOf[DatasetVersion]) - } - // this function retrieve the version hash identified by dvid and did // read access will be checked private def getDatasetVersionByID( @@ -176,56 +149,6 @@ object DatasetResource { version } - // @param shouldContainFile a boolean flag indicating whether the path includes a fileRelativePath - // when shouldContainFile is true, user given path is /ownerEmail/datasetName/versionName/fileRelativePath - // e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv - // ownerName is bob@texera.com; datasetName is twitterDataset, versionName is v1, fileRelativePath is california/irvine/tw1.csv - // when shouldContainFile is false, user given path is /ownerEmail/datasetName/versionName - // e.g. /bob@texera.com/twitterDataset/v1 - // ownerName is bob@texera.com; datasetName is twitterDataset, versionName is v1 - def resolvePath( - path: java.nio.file.Path, - shouldContainFile: Boolean - ): (String, Dataset, DatasetVersion, Option[java.nio.file.Path]) = { - - val pathSegments = (0 until path.getNameCount).map(path.getName(_).toString).toArray - - // The expected length of the path segments: - // - If shouldContainFile is true, the path should include 4 segments: /ownerEmail/datasetName/versionName/fileRelativePath - // - If shouldContainFile is false, the path should include only 3 segments: /ownerEmail/datasetName/versionName - val expectedLength = if (shouldContainFile) 4 else 3 - - if (pathSegments.length < expectedLength) { - throw new BadRequestException( - s"Invalid path format. 
Expected format: /ownerEmail/datasetName/versionName" + - (if (shouldContainFile) "/fileRelativePath" else "") - ) - } - - val ownerEmail = pathSegments(0) - val datasetName = pathSegments(1) - val versionName = pathSegments(2) - - val fileRelativePath = - if (shouldContainFile) Some(Paths.get(pathSegments.drop(3).mkString("/"))) else None - - withTransaction(context) { ctx => - // Get the dataset by owner email and dataset name - val dataset = getDatasetByName(ctx, ownerEmail, datasetName) - if (dataset == null) { - throw new NotFoundException("Dataset not found") - } - - // Get the dataset version by dataset ID and version name - val datasetVersion = getDatasetVersionByName(ctx, dataset.getDid, versionName) - if (datasetVersion == null) { - throw new NotFoundException("Dataset version not found") - } - - (ownerEmail, dataset, datasetVersion, fileRelativePath) - } - } - // this function retrieve the DashboardDataset(Dataset from DB+more information) identified by did // read access will be checked def getDashboardDataset(ctx: DSLContext, did: UInteger, uid: UInteger): DashboardDataset = { @@ -299,21 +222,6 @@ object DatasetResource { } } - def getDatasetFile( - did: UInteger, - dvid: UInteger, - fileRelativePath: java.nio.file.Path - ): InputStream = { - val versionHash = getDatasetVersionByID(context, dvid).getVersionHash - val datasetPath = PathUtils.getDatasetPath(did) - GitVersionControlLocalFileStorage - .retrieveFileContentOfVersionAsInputStream( - PathUtils.getDatasetPath(did), - versionHash, - datasetPath.resolve(fileRelativePath) - ) - } - private def getFileNodesOfCertainVersion( ownerNode: DatasetFileNode, datasetName: String, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index e1a94c8fc48..1c1b4f43c1a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -1,14 +1,22 @@ package edu.uci.ics.texera.workflow.common.storage +import edu.uci.ics.amber.engine.common.Utils.withTransaction + import java.nio.file.{Files, Paths} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument, VirtualDocument} +import edu.uci.ics.texera.web.SqlServer +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} +import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET +import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER +import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION import org.apache.commons.vfs2.FileNotFoundException +import org.jooq.DSLContext +import java.net.URI import scala.util.{Success, Try} object FileResolver { - - type FileResolverOutput = Either[String, DatasetFileDocument] + private val DatasetFileUriScheme = "vfs" /** * Attempts to resolve the given fileName using a list of resolver functions. 
@@ -17,8 +25,8 @@ object FileResolver { * @throws FileNotFoundException if the file cannot be resolved by any resolver * @return Either[String, DatasetFileDocument] - the resolved path as a String or a DatasetFileDocument */ - def resolve(fileName: String): FileResolverOutput = { - val resolvers: List[String => FileResolverOutput] = List(localResolveFunc, datasetResolveFunc) + def resolve(fileName: String): URI = { + val resolvers: List[String => URI] = List(localResolveFunc, datasetResolveFunc) // Try each resolver function in sequence resolvers.iterator @@ -29,30 +37,104 @@ object FileResolver { .getOrElse(throw new FileNotFoundException(fileName)) } + /** + * Open a file handle for the given fileUri + * @param fileUri the uri pointing to the file + * @return + */ + def open(fileUri: URI): ReadonlyVirtualDocument[_] = { + fileUri.getScheme match { + case DatasetFileUriScheme => + // Parse the host to get dataset ID and version hash + val hostParts = fileUri.getHost.split("\\.") + if (hostParts.length != 2) { + throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") + } + val datasetId = hostParts(0).toInt + val versionHash = hostParts(1) + + // The path within the URI represents the relative path of the file in the dataset + val fileRelativePath = Paths.get(fileUri.getPath.stripPrefix("/")) + + // Create and return a DatasetFileDocument with the parsed values + new DatasetFileDocument(datasetId, versionHash, fileRelativePath) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } /** * Attempts to resolve a local file path. 
* @throws FileNotFoundException if the local file does not exist * @param fileName the name of the file to check */ - private def localResolveFunc(fileName: String): FileResolverOutput = { + private def localResolveFunc(fileName: String): URI = { val filePath = Paths.get(fileName) if (Files.exists(filePath)) { - Left(fileName) // File exists locally, return the path as a string in the Left + filePath.toUri // File exists locally, return the path as a string in the Left } else { throw new FileNotFoundException(s"Local file $fileName does not exist") } } /** - * Attempts to resolve a DatasetFileDocument. + * Attempts to resolve a given fileName to a URI. + * + * The fileName format should be: /ownerEmail/datasetName/versionName/fileRelativePath + * e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv + * The output dataset URI format is: {DatasetFileUriScheme}://{did}.{versionHash}/file-path + * e.g. vfs://15.adeq233td/some/dir/file.txt * * @param fileName the name of the file to attempt resolving as a DatasetFileDocument * @return Either[String, DatasetFileDocument] - Right(document) if creation succeeds * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ - private def datasetResolveFunc(fileName: String): FileResolverOutput = { + private def datasetResolveFunc(fileName: String): URI = { val filePath = Paths.get(fileName) - val document = new DatasetFileDocument(filePath) // This will throw if creation fails - Right(document) + val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray + + if (pathSegments.length < 4) { + throw new RuntimeException( + s"Invalid path format. 
Expected format: /ownerEmail/datasetName/versionName/fileRelativePath" + ) + } + + val ownerEmail = pathSegments(0) + val datasetName = pathSegments(1) + val versionName = pathSegments(2) + val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + + withTransaction(SqlServer.createDSLContext()) { ctx => + val (dataset, datasetVersion) = getDatasetAndDatasetVersionByName(ctx, ownerEmail, datasetName, versionName) + if (dataset == null || datasetVersion == null) { + throw new FileNotFoundException(s"Dataset file $fileName") + } + + // assemble dataset URI format + val host = s"${dataset.getDid.intValue()}.${datasetVersion.getVersionHash}" + new URI(DatasetFileUriScheme, host, fileRelativePath.toUri.getPath, null) + } + } + + private def getDatasetAndDatasetVersionByName(ctx: DSLContext, ownerEmail: String, datasetName: String, datasetVersionName: String): (Dataset, DatasetVersion) = { + val dataset = ctx + .select(DATASET.fields: _*) + .from(DATASET) + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + .where(USER.EMAIL.eq(ownerEmail)) + .and(DATASET.NAME.eq(datasetName)) + .fetchOneInto(classOf[Dataset]) + + val datasetVersion = ctx + .selectFrom(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(dataset.getDid)) + .and(DATASET_VERSION.NAME.eq(datasetVersionName)) + .fetchOneInto(classOf[DatasetVersion]) + (dataset, datasetVersion) } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala index daa4c3864b5..ee1621e762f 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpDesc.scala @@ -47,7 +47,6 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { workflowId: WorkflowIdentity, executionId: 
ExecutionIdentity ): PhysicalOp = { - val (filepath, fileDesc) = determineFilePathOrDatasetFile() PhysicalOp .sourcePhysicalOp( workflowId, @@ -55,8 +54,7 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { operatorIdentifier, OpExecInitInfo((_, _) => new FileScanSourceOpExec( - filepath, - fileDesc, + fileUri.get, attributeType, encoding, extract, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 0244195d4a9..5756252995d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -4,16 +4,17 @@ import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField +import edu.uci.ics.texera.workflow.common.storage.FileResolver import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray import java.io._ +import java.net.URI import scala.collection.mutable import scala.jdk.CollectionConverters.IteratorHasAsScala class FileScanSourceOpExec private[scan] ( - filePath: String, - datasetFileDesc: DatasetFileDocument, + fileUri: String, fileAttributeType: FileAttributeType, fileEncoding: FileDecodingMethod, extract: Boolean, @@ -26,7 +27,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = createInputStream(filePath, datasetFileDesc) + val is = FileResolver.open(new 
URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 01d03538513..7997b4846ae 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -30,9 +30,9 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { @JsonPropertyDescription("decoding charset to use on input") var fileEncoding: FileDecodingMethod = FileDecodingMethod.UTF_8 - // Unified file handle, can be either a local path (String) or DatasetFileDocument + // uri of the file @JsonIgnore - var fileHandle: FileResolver.FileResolverOutput = _ + var fileUri: Option[String] = None @JsonIgnore var fileTypeName: Option[String] = None @@ -50,7 +50,7 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { var offset: Option[Int] = None override def sourceSchema(): Schema = { - if (fileHandle == null) return null + if (fileUri == null) return null inferSchema() } @@ -61,8 +61,8 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { throw new RuntimeException("no input file name") } - // Resolve the file and assign the result to fileHandle - fileHandle = FileResolver.resolve(fileName.get) + // Resolve the file and assign the result to file uri + fileUri = Some(FileResolver.resolve(fileName.get).toASCIIString) } override def operatorInfo: OperatorInfo = { @@ -77,14 +77,6 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { def inferSchema(): Schema - // Get the source file descriptor from the fileHandle - def determineFilePathOrDatasetFile(): (String, DatasetFileDocument) = { - 
fileHandle match { - case Left(path) => (path, null) // File path is a local path as String - case Right(document) => (null, document) // File is a DatasetFileDocument - } - } - override def equals(that: Any): Boolean = EqualsBuilder.reflectionEquals(this, that, "context", "fileHandle") } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index e9336f2acec..2cc906bce4d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,9 +9,11 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, FileInputStream, IOException, InputStreamReader} +import java.net.URI class CSVScanSourceOpDesc extends ScanSourceOpDesc { @@ -37,7 +39,6 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty || customDelimiter.get.isEmpty) customDelimiter = Option(",") - val (filepath, fileDesc) = determineFilePathOrDatasetFile() PhysicalOp .sourcePhysicalOp( workflowId, @@ -45,8 +46,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { operatorIdentifier, OpExecInitInfo((_, _) => new CSVScanSourceOpExec( - filepath, - fileDesc, + fileUri.get, fileEncoding, limit, offset, @@ -74,13 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val 
(filepath, fileDesc) = determineFilePathOrDatasetFile() - val stream = - if (filepath != null) { - new FileInputStream(new File(filepath)) - } else { - fileDesc.asInputStream() - } + val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index fe5a9a61051..05942761d7b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -6,14 +6,15 @@ import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader +import java.net.URI import scala.collection.immutable.ArraySeq class CSVScanSourceOpExec private[csv] ( - filePath: String, - datasetFileDesc: DatasetFileDocument, + fileUri: String, fileEncoding: FileDecodingMethod, limit: Option[Int], offset: Option[Int], @@ -69,7 +70,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - createInputStream(filePath, datasetFileDesc), + FileResolver.open(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 567fa40ed34..66ecde61726 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,9 +9,11 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} +import java.net.URI class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { @@ -39,13 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val file = - if (filepath == null) { - fileDesc.asFile() - } else { - new File(filepath) - } + val file = FileResolver.open(new URI(fileUri.get)).asFile() val totalBytes: Long = file.length() PhysicalOp @@ -86,13 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val file = - if (filepath == null) { - fileDesc.asFile() - } else { - new File(filepath) - } + val file = FileResolver.open(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) @@ -102,7 +92,7 @@ class ParallelCSVScanSourceOpDesc 
extends ScanSourceOpDesc { reader.close() // reopen the file to read from the beginning - reader = CSVReader.open(filepath)(CustomFormat) + reader = CSVReader.open(file.toPath.toString)(CustomFormat) if (hasHeader) reader.readNext() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index ca17df52a26..5524e6ee51a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,9 +9,11 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} +import java.net.URI class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { @@ -36,17 +38,6 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { // fill in default values if (customDelimiter.get.isEmpty) customDelimiter = Option(",") - - val (filepath, datasetFileDocument) = determineFilePathOrDatasetFile() - // for CSVOldScanSourceOpDesc, it requires the full File presence when execute, so use temp file here - // TODO: figure out a better way - val path = - if (filepath == null) { - datasetFileDocument.asFile().toPath.toString - } else { - filepath - } - PhysicalOp .sourcePhysicalOp( workflowId, @@ -54,7 +45,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { operatorIdentifier, 
OpExecInitInfo((_, _) => new CSVOldScanSourceOpExec( - path, + fileUri.get, fileEncoding, limit, offset, @@ -81,13 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val file = - if (filepath != null) { - new File(filepath) - } else { - fileDesc.asFile() - } + val file = FileResolver.open(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 60dc8d81d02..812c8361093 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,18 +2,15 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{ - Attribute, - AttributeTypeUtils, - Schema, - TupleLike -} +import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod +import java.net.URI import scala.collection.compat.immutable.ArraySeq class CSVOldScanSourceOpExec private[csvOld] ( - filePath: String, + fileUri: String, fileEncoding: FileDecodingMethod, limit: Option[Int], offset: Option[Int], @@ -51,7 +48,8 @@ class CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat 
{ override val delimiter: Char = customDelimiter.get.charAt(0) } - reader = CSVReader.open(filePath, fileEncoding.getCharset.name())(CustomFormat) + val filePath = FileResolver.open(new URI(fileUri)).asFile().toPath + reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index e3c3bcd6027..be56a302419 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -9,10 +9,12 @@ import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, Workf import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap import java.io.{BufferedReader, FileInputStream, IOException, InputStream, InputStreamReader} +import java.net.URI import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters.IteratorHasAsScala @@ -37,8 +39,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val stream = createInputStream(filepath, fileDesc) + val stream = 
FileResolver.open(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -60,8 +61,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { offsetValue + (if (idx != workerCount - 1) count / workerCount * (idx + 1) else count) new JSONLScanSourceOpExec( - filepath, - fileDesc, + fileUri.get, fileEncoding, startOffset, endOffset, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index e57f93021ec..6d9a1d29e6d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -5,16 +5,17 @@ import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap import java.io.{BufferedReader, InputStreamReader} +import java.net.URI import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.util.{Failure, Success, Try} class JSONLScanSourceOpExec private[json] ( - filePath: String, - datasetFileDesc: DatasetFileDocument, + fileUri: String, fileEncoding: FileDecodingMethod, startOffset: Int, endOffset: Int, @@ -42,7 +43,7 @@ class JSONLScanSourceOpExec private[json] ( override def open(): Unit = { schema = schemaFunc() reader = new BufferedReader( - new 
InputStreamReader(createInputStream(filePath, datasetFileDesc), fileEncoding.getCharset) + new InputStreamReader(FileResolver.open(new URI(fileUri)).asInputStream(), fileEncoding.getCharset) ) rows = reader.lines().iterator().asScala.slice(startOffset, endOffset) } diff --git a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala index a766be3606d..7c16a38d05c 100644 --- a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala @@ -18,7 +18,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { before { fileScanSourceOpDesc = new FileScanSourceOpDesc() - fileScanSourceOpDesc.fileHandle = Left(TestTextFilePath) + fileScanSourceOpDesc.fileUri = Left(TestTextFilePath) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.UTF_8 } @@ -61,7 +61,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -87,12 +87,12 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with CRLF separators into corresponding output tuples" in { - fileScanSourceOpDesc.fileHandle = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - 
fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -121,7 +121,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -148,11 +148,11 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text into corresponding output INTEGER tuples" in { - fileScanSourceOpDesc.fileHandle = Left(TestNumbersFilePath) + fileScanSourceOpDesc.fileUri = Left(TestNumbersFilePath) fileScanSourceOpDesc.attributeType = FileAttributeType.INTEGER fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, @@ -178,13 +178,13 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with US_ASCII encoding" in { - fileScanSourceOpDesc.fileHandle = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.ASCII fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileHandle.left.getOrElse(""), + fileScanSourceOpDesc.fileUri.left.getOrElse(""), null, fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, From 6ec1fb4eb2fc97d08069eecce71f0d50a870d659 Mon Sep 17 
00:00:00 2001 From: Jiadong Bai Date: Sat, 26 Oct 2024 23:36:39 -0700 Subject: [PATCH 02/18] fix more --- .../user/dataset/DatasetResource.scala | 95 +++---------------- .../common/storage/FileResolver.scala | 33 +++---- 2 files changed, 29 insertions(+), 99 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 3521c6b3411..96459561d03 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -4,63 +4,18 @@ import edu.uci.ics.amber.engine.common.Utils.withTransaction import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ - DatasetDao, - DatasetUserAccessDao, - DatasetVersionDao -} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ - Dataset, - DatasetUserAccess, - DatasetVersion, - User -} +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{ - getDatasetUserAccessPrivilege, - getOwner, - userHasReadAccess, - userHasWriteAccess, - 
userOwnDataset -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{ - DATASET_IS_PRIVATE, - DATASET_IS_PUBLIC, - DashboardDataset, - DashboardDatasetVersion, - DatasetDescriptionModification, - DatasetIDs, - DatasetNameModification, - DatasetVersionRootFileNodes, - DatasetVersionRootFileNodesResponse, - DatasetVersions, - ERR_DATASET_CREATION_FAILED_MESSAGE, - ERR_DATASET_NAME_ALREADY_EXISTS, - ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, - ListDatasetsResponse, - calculateLatestDatasetVersionSize, - calculateDatasetVersionSize, - context, - createNewDatasetVersionFromFormData, - getDashboardDataset, - getDatasetByID, - getDatasetVersionByID, - getDatasetVersions, - getFileNodesOfCertainVersion, - getLatestDatasetVersionWithAccessCheck, - getUserDatasets, - resolvePath, - retrievePublicDatasets -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ - DatasetFileNode, - PhysicalFileNode -} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{getDatasetUserAccessPrivilege, getOwner, userHasReadAccess, userHasWriteAccess, userOwnDataset} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{DATASET_IS_PRIVATE, DATASET_IS_PUBLIC, DashboardDataset, DashboardDatasetVersion, DatasetDescriptionModification, DatasetIDs, DatasetNameModification, DatasetVersionRootFileNodes, DatasetVersionRootFileNodesResponse, DatasetVersions, ERR_DATASET_CREATION_FAILED_MESSAGE, ERR_DATASET_NAME_ALREADY_EXISTS, ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, ListDatasetsResponse, calculateDatasetVersionSize, calculateLatestDatasetVersionSize, context, createNewDatasetVersionFromFormData, getDashboardDataset, getDatasetByID, getDatasetVersionByID, getDatasetVersions, getFileNodesOfCertainVersion, getLatestDatasetVersionWithAccessCheck, getUserDatasets, retrievePublicDatasets} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} import 
edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils +import edu.uci.ics.texera.workflow.common.storage.FileResolver import io.dropwizard.auth.Auth import org.apache.commons.lang3.StringUtils import org.glassfish.jersey.media.multipart.{FormDataMultiPart, FormDataParam} @@ -76,21 +31,9 @@ import java.util.zip.{ZipEntry, ZipOutputStream} import java.util import java.util.concurrent.locks.ReentrantLock import javax.annotation.security.RolesAllowed -import javax.ws.rs.{ - BadRequestException, - Consumes, - ForbiddenException, - GET, - NotFoundException, - POST, - Path, - PathParam, - Produces, - QueryParam, - WebApplicationException -} +import javax.ws.rs.{BadRequestException, Consumes, ForbiddenException, GET, NotFoundException, POST, Path, PathParam, Produces, QueryParam, WebApplicationException} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} -import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` +import scala.collection.convert.ImplicitConversions.{`collection AsScalaIterable`, `iterable AsScalaIterable`} import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.jdk.CollectionConverters._ @@ -277,17 +220,12 @@ object DatasetResource { .parse(filePathsValue) .as[List[String]] .foreach(pathStr => { - val (_, _, _, fileRelativePath) = - resolvePath(Paths.get(pathStr), shouldContainFile = true) - + val fileRelativePath = Paths.get(FileResolver.resolve(pathStr).getPath) fileRelativePath .map { path => filesToRemove += datasetPath .resolve(path) // When path exists, resolve it and add to filesToRemove } - .getOrElse { - throw new IllegalArgumentException("File relative path is missing") - } }) } } @@ -763,7 +701,7 @@ class DatasetResource { // if the file path is given, then only fetch the dataset and version this file is belonging to val decodedPathStr = 
URLDecoder.decode(filePathStr, StandardCharsets.UTF_8.name()) val (ownerEmail, dataset, version, _) = - resolvePath(Paths.get(decodedPathStr), shouldContainFile = true) + FileResolver.parseFileNameForDataset(ctx, decodedPathStr) val accessPrivilege = getDatasetUserAccessPrivilege(ctx, dataset.getDid, uid) if ( accessPrivilege == DatasetUserAccessPrivilege.NONE && dataset.getIsPublic == DATASET_IS_PRIVATE @@ -988,10 +926,10 @@ class DatasetResource { val uid = user.getUid val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) - val (_, dataset, dsVersion, fileRelativePath) = - resolvePath(Paths.get(decodedPathStr), shouldContainFile = true) - withTransaction(context)(ctx => { + val (_, dataset, dsVersion, fileRelativePath) = + FileResolver.parseFileNameForDataset(ctx, decodedPathStr) + val did = dataset.getDid val dvid = dsVersion.getDvid @@ -1005,7 +943,7 @@ class DatasetResource { val streamingOutput = new StreamingOutput() { override def write(output: OutputStream): Unit = { fileRelativePath - .map { path => + .foreach { path => GitVersionControlLocalFileStorage.retrieveFileContentOfVersion( targetDatasetPath, datasetVersion.getVersionHash, @@ -1013,9 +951,6 @@ class DatasetResource { output ) } - .getOrElse { - throw new IllegalArgumentException("File relative path is missing.") - } } } @@ -1125,7 +1060,7 @@ class DatasetResource { ): (Dataset, DatasetVersion) = { val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) val (_, dataset, dsVersion, _) = - resolvePath(Paths.get(decodedPathStr), shouldContainFile = false) + FileResolver.parseFileNameForDataset(context, decodedPathStr) validateUserAccess(dataset.getDid, user.getUid) (dataset, dsVersion) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 1c1b4f43c1a..e4043941da1 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -2,7 +2,7 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction -import java.nio.file.{Files, Paths} +import java.nio.file.{Files, Path, Paths} import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument, VirtualDocument} import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} @@ -67,6 +67,7 @@ object FileResolver { throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") } } + /** * Attempts to resolve a local file path. * @throws FileNotFoundException if the local file does not exist @@ -94,22 +95,8 @@ object FileResolver { * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ private def datasetResolveFunc(fileName: String): URI = { - val filePath = Paths.get(fileName) - val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray - - if (pathSegments.length < 4) { - throw new RuntimeException( - s"Invalid path format. 
Expected format: /ownerEmail/datasetName/versionName/fileRelativePath" - ) - } - - val ownerEmail = pathSegments(0) - val datasetName = pathSegments(1) - val versionName = pathSegments(2) - val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) - withTransaction(SqlServer.createDSLContext()) { ctx => - val (dataset, datasetVersion) = getDatasetAndDatasetVersionByName(ctx, ownerEmail, datasetName, versionName) + val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) if (dataset == null || datasetVersion == null) { throw new FileNotFoundException(s"Dataset file $fileName") } @@ -120,7 +107,15 @@ object FileResolver { } } - private def getDatasetAndDatasetVersionByName(ctx: DSLContext, ownerEmail: String, datasetName: String, datasetVersionName: String): (Dataset, DatasetVersion) = { + def parseFileNameForDataset(ctx: DSLContext, fileName: String): (String, Dataset, DatasetVersion, Path) = { + val filePath = Paths.get(fileName) + val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray + + val ownerEmail = pathSegments(0) + val datasetName = pathSegments(1) + val versionName = pathSegments(2) + val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + val dataset = ctx .select(DATASET.fields: _*) .from(DATASET) @@ -133,8 +128,8 @@ object FileResolver { val datasetVersion = ctx .selectFrom(DATASET_VERSION) .where(DATASET_VERSION.DID.eq(dataset.getDid)) - .and(DATASET_VERSION.NAME.eq(datasetVersionName)) + .and(DATASET_VERSION.NAME.eq(versionName)) .fetchOneInto(classOf[DatasetVersion]) - (dataset, datasetVersion) + (ownerEmail, dataset, datasetVersion, fileRelativePath) } } From 1013205e10cd4991bfd0f8e03fd07bab9dc4b09f Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 08:07:09 -0700 Subject: [PATCH 03/18] fix more --- .../operators/source/scan/json/JSONLScanSourceOpDesc.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index be56a302419..99547e0182a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -85,8 +85,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val (filepath, fileDesc) = determineFilePathOrDatasetFile() - val stream = createInputStream(filepath, fileDesc) + val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() From bb60d75f58bc6934536d1c4e12c126dc260384cb Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 09:28:12 -0700 Subject: [PATCH 04/18] make it work --- .../resource/SchemaPropagationResource.scala | 1 + .../common/storage/FileResolver.scala | 55 +++++++++++++------ .../common/workflow/LogicalPlan.scala | 24 ++++++++ .../common/workflow/WorkflowCompiler.scala | 3 +- .../source/scan/ScanSourceOpDesc.scala | 15 +++-- 5 files changed, 73 insertions(+), 25 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala index 1a48eca0ae9..dfcf38cedf3 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala @@ -40,6 +40,7 @@ class SchemaPropagationResource extends LazyLogging { ) val logicalPlan = LogicalPlan(logicalPlanPojo) + logicalPlan.resolveScanSourceOpFileName(None) // the 
PhysicalPlan with topology expanded. val physicalPlan = PhysicalPlan(context, logicalPlan) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index e4043941da1..39a430d0191 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -12,7 +12,8 @@ import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET import org.apache.commons.vfs2.FileNotFoundException import org.jooq.DSLContext -import java.net.URI +import java.net.{URI, URLDecoder, URLEncoder} +import java.nio.charset.StandardCharsets import scala.util.{Success, Try} object FileResolver { @@ -45,19 +46,26 @@ object FileResolver { def open(fileUri: URI): ReadonlyVirtualDocument[_] = { fileUri.getScheme match { case DatasetFileUriScheme => - // Parse the host to get dataset ID and version hash - val hostParts = fileUri.getHost.split("\\.") - if (hostParts.length != 2) { + // Extract path components and decode them + val pathParts = fileUri.getPath.stripPrefix("/").split("/").map(part => + URLDecoder.decode(part, StandardCharsets.UTF_8) + ) + + if (pathParts.length < 3) { throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") } - val datasetId = hostParts(0).toInt - val versionHash = hostParts(1) - // The path within the URI represents the relative path of the file in the dataset - val fileRelativePath = Paths.get(fileUri.getPath.stripPrefix("/")) + // Parse the dataset ID and version hash, and build the file path + val did = pathParts(0).toInt + val versionHash = pathParts(1) + val fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) - // Create and return a DatasetFileDocument with the parsed values - new DatasetFileDocument(datasetId, versionHash, fileRelativePath) + // Create and 
return a DatasetFileDocument + new DatasetFileDocument( + did = did, + datasetVersionHash = versionHash, + fileRelativePath = fileRelativePath + ) case "file" => // For local files, create a ReadonlyLocalFileDocument @@ -87,23 +95,38 @@ object FileResolver { * * The fileName format should be: /ownerEmail/datasetName/versionName/fileRelativePath * e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv - * The output dataset URI format is: {DatasetFileUriScheme}://{did}.{versionHash}/file-path - * e.g. vfs://15.adeq233td/some/dir/file.txt + * The output dataset URI format is: {DatasetFileUriScheme}:///{did}/{versionHash}/file-path + * e.g. vfs:///15/adeq233td/some/dir/file.txt * * @param fileName the name of the file to attempt resolving as a DatasetFileDocument * @return Either[String, DatasetFileDocument] - Right(document) if creation succeeds * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ + + import java.net.{URI, URISyntaxException, URLEncoder} + import java.nio.charset.StandardCharsets + import java.nio.file.Path + import org.apache.commons.vfs2.FileNotFoundException + private def datasetResolveFunc(fileName: String): URI = { withTransaction(SqlServer.createDSLContext()) { ctx => val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) + if (dataset == null || datasetVersion == null) { - throw new FileNotFoundException(s"Dataset file $fileName") + throw new FileNotFoundException(s"Dataset file $fileName not found.") } - // assemble dataset URI format - val host = s"${dataset.getDid.intValue()}.${datasetVersion.getVersionHash}" - new URI(DatasetFileUriScheme, host, fileRelativePath.toUri.getPath, null) + // Construct path as /{did}/{versionHash}/file-path + val did = dataset.getDid.intValue() + val versionHash = datasetVersion.getVersionHash + val encodedPath = s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, 
StandardCharsets.UTF_8)).mkString("/")}" + + try { + new URI(DatasetFileUriScheme, null, encodedPath, null) + } catch { + case e: URISyntaxException => + throw e + } } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala index 3ea3e24fd06..6e9da8e9fcb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala @@ -8,6 +8,8 @@ import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.texera.web.model.websocket.request.LogicalPlanPojo import edu.uci.ics.texera.workflow.common.operators.LogicalOp import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor +import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import org.jgrapht.graph.DirectedAcyclicGraph import org.jgrapht.util.SupplierUtil @@ -144,6 +146,28 @@ case class LogicalPlan( .toMap } + def resolveScanSourceOpFileName( + errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] + ): Unit = { + operators.foreach { + case operator@(scanOp: ScanSourceOpDesc) => + Try { + // Resolve file path for ScanSourceOpDesc + val fileName = scanOp.fileName.getOrElse(throw new RuntimeException("no input file name")) + val fileUri = FileResolver.resolve(fileName) // Convert to URI + + // Set the URI in the ScanSourceOpDesc + scanOp.setFileUri(fileUri) + } match { + case Success(_) => // Successfully resolved and set the file URI + case Failure(err) => + logger.error("Error resolving file path for ScanSourceOpDesc", err) + errorList.foreach(_.append((operator.operatorIdentifier, err))) + } + case _ => // Skip non-ScanSourceOpDesc operators + } + } + def propagateWorkflowSchema( context: WorkflowContext, errorList: 
Option[ArrayBuffer[(OperatorIdentity, Throwable)]] diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index efa4e305275..e40252f5409 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -51,7 +51,7 @@ class WorkflowCompiler( logicalPlanPojo.opsToViewResult, logicalPlan ) - + logicalPlan.resolveScanSourceOpFileName(Some(errorList)) logicalPlan.propagateWorkflowSchema(context, Some(errorList)) // map compilation errors with op id if (errorList.nonEmpty) { @@ -121,6 +121,7 @@ class WorkflowCompiler( logicalPlan ) + logicalPlan.resolveScanSourceOpFileName(Some(errorList)) logicalPlan.propagateWorkflowSchema(context, Some(errorList)) // report compilation errors diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 7997b4846ae..82817358304 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -12,6 +12,8 @@ import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescrip import edu.uci.ics.texera.workflow.common.storage.FileResolver import org.apache.commons.lang3.builder.EqualsBuilder +import java.net.URI + abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { /** in the case we do not want to read the entire large file, but only @@ -50,19 +52,12 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { var offset: Option[Int] = None override def sourceSchema(): Schema = { - if (fileUri == null) return 
null + if (fileUri.isEmpty) return null inferSchema() } override def setContext(workflowContext: WorkflowContext): Unit = { super.setContext(workflowContext) - - if (fileName.isEmpty) { - throw new RuntimeException("no input file name") - } - - // Resolve the file and assign the result to file uri - fileUri = Some(FileResolver.resolve(fileName.get).toASCIIString) } override def operatorInfo: OperatorInfo = { @@ -77,6 +72,10 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { def inferSchema(): Schema + def setFileUri(uri: URI): Unit = { + fileUri = Some(uri.toASCIIString) + } + override def equals(that: Any): Boolean = EqualsBuilder.reflectionEquals(this, that, "context", "fileHandle") } From 8b35d90023e97c461e1d4768ea4895d1c336bad2 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 09:41:33 -0700 Subject: [PATCH 05/18] fmt --- .../common/storage/DatasetFileDocument.scala | 13 ++-- .../storage/ReadonlyLocalFileDocument.scala | 2 +- .../storage/ReadonlyVirtualDocument.scala | 2 +- .../user/dataset/DatasetResource.scala | 77 +++++++++++++++++-- .../common/storage/FileResolver.scala | 34 ++++---- .../common/workflow/LogicalPlan.scala | 6 +- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 7 +- .../scan/json/JSONLScanSourceOpExec.scala | 5 +- 8 files changed, 110 insertions(+), 36 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 3f013825ccb..cbc90d70fe6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -9,7 +9,8 @@ import java.io.{File, FileOutputStream, InputStream} import java.net.URI import java.nio.file.{Files, Path} -class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) extends 
VirtualDocument[Nothing] { +class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) + extends VirtualDocument[Nothing] { private var tempFile: Option[File] = None override def getURI: URI = @@ -20,11 +21,11 @@ class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath override def asInputStream(): InputStream = { val datasetAbsolutePath = PathUtils.getDatasetPath(UInteger.valueOf(did)) GitVersionControlLocalFileStorage - .retrieveFileContentOfVersionAsInputStream( - datasetAbsolutePath, - datasetVersionHash, - datasetAbsolutePath.resolve(fileRelativePath) - ) + .retrieveFileContentOfVersionAsInputStream( + datasetAbsolutePath, + datasetVersionHash, + datasetAbsolutePath.resolve(fileRelativePath) + ) } override def asFile(): File = { diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala index 86873e2c525..2ea19ee887d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala @@ -64,4 +64,4 @@ class ReadonlyLocalFileDocument(uri: URI) extends ReadonlyVirtualDocument[Nothin */ override def getCount: Long = throw new NotImplementedError("getCount is not supported for ReadonlyLocalFileDocument") -} \ No newline at end of file +} diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala index e9df984f036..81acadd5aff 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyVirtualDocument.scala @@ -61,4 +61,4 @@ trait 
ReadonlyVirtualDocument[T] { */ def asFile(): File -} \ No newline at end of file +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 96459561d03..9e47377ee57 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -4,15 +4,60 @@ import edu.uci.ics.amber.engine.common.Utils.withTransaction import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ + DatasetDao, + DatasetUserAccessDao, + DatasetVersionDao +} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ + Dataset, + DatasetUserAccess, + DatasetVersion, + User +} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{getDatasetUserAccessPrivilege, getOwner, userHasReadAccess, userHasWriteAccess, userOwnDataset} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{DATASET_IS_PRIVATE, DATASET_IS_PUBLIC, DashboardDataset, DashboardDatasetVersion, 
DatasetDescriptionModification, DatasetIDs, DatasetNameModification, DatasetVersionRootFileNodes, DatasetVersionRootFileNodesResponse, DatasetVersions, ERR_DATASET_CREATION_FAILED_MESSAGE, ERR_DATASET_NAME_ALREADY_EXISTS, ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, ListDatasetsResponse, calculateDatasetVersionSize, calculateLatestDatasetVersionSize, context, createNewDatasetVersionFromFormData, getDashboardDataset, getDatasetByID, getDatasetVersionByID, getDatasetVersions, getFileNodesOfCertainVersion, getLatestDatasetVersionWithAccessCheck, getUserDatasets, retrievePublicDatasets} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{ + getDatasetUserAccessPrivilege, + getOwner, + userHasReadAccess, + userHasWriteAccess, + userOwnDataset +} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{ + DATASET_IS_PRIVATE, + DATASET_IS_PUBLIC, + DashboardDataset, + DashboardDatasetVersion, + DatasetDescriptionModification, + DatasetIDs, + DatasetNameModification, + DatasetVersionRootFileNodes, + DatasetVersionRootFileNodesResponse, + DatasetVersions, + ERR_DATASET_CREATION_FAILED_MESSAGE, + ERR_DATASET_NAME_ALREADY_EXISTS, + ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, + ListDatasetsResponse, + calculateDatasetVersionSize, + calculateLatestDatasetVersionSize, + context, + createNewDatasetVersionFromFormData, + getDashboardDataset, + getDatasetByID, + getDatasetVersionByID, + getDatasetVersions, + getFileNodesOfCertainVersion, + getLatestDatasetVersionWithAccessCheck, + getUserDatasets, + retrievePublicDatasets +} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ + DatasetFileNode, + PhysicalFileNode +} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import 
edu.uci.ics.texera.workflow.common.storage.FileResolver @@ -31,9 +76,24 @@ import java.util.zip.{ZipEntry, ZipOutputStream} import java.util import java.util.concurrent.locks.ReentrantLock import javax.annotation.security.RolesAllowed -import javax.ws.rs.{BadRequestException, Consumes, ForbiddenException, GET, NotFoundException, POST, Path, PathParam, Produces, QueryParam, WebApplicationException} +import javax.ws.rs.{ + BadRequestException, + Consumes, + ForbiddenException, + GET, + NotFoundException, + POST, + Path, + PathParam, + Produces, + QueryParam, + WebApplicationException +} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} -import scala.collection.convert.ImplicitConversions.{`collection AsScalaIterable`, `iterable AsScalaIterable`} +import scala.collection.convert.ImplicitConversions.{ + `collection AsScalaIterable`, + `iterable AsScalaIterable` +} import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.jdk.CollectionConverters._ @@ -220,7 +280,8 @@ object DatasetResource { .parse(filePathsValue) .as[List[String]] .foreach(pathStr => { - val fileRelativePath = Paths.get(FileResolver.resolve(pathStr).getPath) + // TODO: refactor this part + val (_, _, _, fileRelativePath) = FileResolver.parseFileNameForDataset(context, pathStr) fileRelativePath .map { path => filesToRemove += datasetPath diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 39a430d0191..a3d74e4ec43 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -3,7 +3,12 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction import java.nio.file.{Files, Path, Paths} -import 
edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument, VirtualDocument} +import edu.uci.ics.amber.engine.common.storage.{ + DatasetFileDocument, + ReadonlyLocalFileDocument, + ReadonlyVirtualDocument, + VirtualDocument +} import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET @@ -47,24 +52,19 @@ object FileResolver { fileUri.getScheme match { case DatasetFileUriScheme => // Extract path components and decode them - val pathParts = fileUri.getPath.stripPrefix("/").split("/").map(part => - URLDecoder.decode(part, StandardCharsets.UTF_8) - ) + val pathParts = fileUri.getPath + .stripPrefix("/") + .split("/") + .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) if (pathParts.length < 3) { throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") } - // Parse the dataset ID and version hash, and build the file path - val did = pathParts(0).toInt - val versionHash = pathParts(1) - val fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) - - // Create and return a DatasetFileDocument new DatasetFileDocument( - did = did, - datasetVersionHash = versionHash, - fileRelativePath = fileRelativePath + did = pathParts(0).toInt, + datasetVersionHash = pathParts(1), + fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) ) case "file" => @@ -119,7 +119,8 @@ object FileResolver { // Construct path as /{did}/{versionHash}/file-path val did = dataset.getDid.intValue() val versionHash = datasetVersion.getVersionHash - val encodedPath = s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" + val encodedPath = + s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" try { new 
URI(DatasetFileUriScheme, null, encodedPath, null) @@ -130,7 +131,10 @@ object FileResolver { } } - def parseFileNameForDataset(ctx: DSLContext, fileName: String): (String, Dataset, DatasetVersion, Path) = { + def parseFileNameForDataset( + ctx: DSLContext, + fileName: String + ): (String, Dataset, DatasetVersion, Path) = { val filePath = Paths.get(fileName) val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala index 6e9da8e9fcb..8af7d60fd05 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala @@ -147,10 +147,10 @@ case class LogicalPlan( } def resolveScanSourceOpFileName( - errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] - ): Unit = { + errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] + ): Unit = { operators.foreach { - case operator@(scanOp: ScanSourceOpDesc) => + case operator @ (scanOp: ScanSourceOpDesc) => Try { // Resolve file path for ScanSourceOpDesc val fileName = scanOp.fileName.getOrElse(throw new RuntimeException("no input file name")) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 812c8361093..7c11005d643 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,7 +2,12 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, 
DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.amber.engine.common.model.tuple.{ + Attribute, + AttributeTypeUtils, + Schema, + TupleLike +} import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index 6d9a1d29e6d..f00a07802fc 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -43,7 +43,10 @@ class JSONLScanSourceOpExec private[json] ( override def open(): Unit = { schema = schemaFunc() reader = new BufferedReader( - new InputStreamReader(FileResolver.open(new URI(fileUri)).asInputStream(), fileEncoding.getCharset) + new InputStreamReader( + FileResolver.open(new URI(fileUri)).asInputStream(), + fileEncoding.getCharset + ) ) rows = reader.lines().iterator().asScala.slice(startOffset, endOffset) } From 4aba44135b26c7b43a538ab9e766bc00d10e8670 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 09:59:14 -0700 Subject: [PATCH 06/18] add type alias --- .../uci/ics/texera/workflow/common/storage/FileResolver.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index a3d74e4ec43..8908970c522 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -22,6 +22,8 @@ import java.nio.charset.StandardCharsets import scala.util.{Success, Try} object FileResolver { + type FileHandle = ReadonlyVirtualDocument[_] + private val DatasetFileUriScheme = "vfs" /** @@ -48,7 +50,7 @@ object FileResolver { * @param fileUri the uri pointing to the file * @return */ - def open(fileUri: URI): ReadonlyVirtualDocument[_] = { + def open(fileUri: URI): FileHandle = { fileUri.getScheme match { case DatasetFileUriScheme => // Extract path components and decode them From 2c4a9da9fa88003f12d2b709cbdf7df1b37f10f3 Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Sun, 27 Oct 2024 13:06:16 -0700 Subject: [PATCH 07/18] some handy changes --- .../common/storage/DatasetFileDocument.scala | 1 - .../storage/ReadonlyLocalFileDocument.scala | 1 - .../user/dataset/DatasetResource.scala | 102 ++++-------------- .../common/storage/FileResolver.scala | 31 ++---- 4 files changed, 29 insertions(+), 106 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index cbc90d70fe6..22d7672d1d4 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -1,6 +1,5 @@ package edu.uci.ics.amber.engine.common.storage -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import org.jooq.types.UInteger diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala 
b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala index 2ea19ee887d..ce1f0329625 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/ReadonlyLocalFileDocument.scala @@ -2,7 +2,6 @@ package edu.uci.ics.amber.engine.common.storage import java.io.{File, FileInputStream, InputStream} import java.net.URI -import java.nio.file.Path /** * ReadonlyLocalFileDocument provides a read-only abstraction over a local file. diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 9e47377ee57..752278f0526 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -4,103 +4,41 @@ import edu.uci.ics.amber.engine.common.Utils.withTransaction import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ - DatasetDao, - DatasetUserAccessDao, - DatasetVersionDao -} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ - Dataset, - DatasetUserAccess, - DatasetVersion, - User -} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET -import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource.{ - 
getDatasetUserAccessPrivilege, - getOwner, - userHasReadAccess, - userHasWriteAccess, - userOwnDataset -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{ - DATASET_IS_PRIVATE, - DATASET_IS_PUBLIC, - DashboardDataset, - DashboardDatasetVersion, - DatasetDescriptionModification, - DatasetIDs, - DatasetNameModification, - DatasetVersionRootFileNodes, - DatasetVersionRootFileNodesResponse, - DatasetVersions, - ERR_DATASET_CREATION_FAILED_MESSAGE, - ERR_DATASET_NAME_ALREADY_EXISTS, - ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE, - ListDatasetsResponse, - calculateDatasetVersionSize, - calculateLatestDatasetVersionSize, - context, - createNewDatasetVersionFromFormData, - getDashboardDataset, - getDatasetByID, - getDatasetVersionByID, - getDatasetVersions, - getFileNodesOfCertainVersion, - getLatestDatasetVersionWithAccessCheck, - getUserDatasets, - retrievePublicDatasets -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ - DatasetFileNode, - PhysicalFileNode -} +import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource._ +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{context, _} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import edu.uci.ics.texera.workflow.common.storage.FileResolver import io.dropwizard.auth.Auth import org.apache.commons.lang3.StringUtils import org.glassfish.jersey.media.multipart.{FormDataMultiPart, 
FormDataParam} -import org.jooq.{DSLContext, EnumType} import org.jooq.types.UInteger +import org.jooq.{DSLContext, EnumType} import play.api.libs.json.Json import java.io.{IOException, InputStream, OutputStream} import java.net.URLDecoder import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Paths} -import java.util.zip.{ZipEntry, ZipOutputStream} +import java.nio.file.Files import java.util import java.util.concurrent.locks.ReentrantLock +import java.util.zip.{ZipEntry, ZipOutputStream} import javax.annotation.security.RolesAllowed -import javax.ws.rs.{ - BadRequestException, - Consumes, - ForbiddenException, - GET, - NotFoundException, - POST, - Path, - PathParam, - Produces, - QueryParam, - WebApplicationException -} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} -import scala.collection.convert.ImplicitConversions.{ - `collection AsScalaIterable`, - `iterable AsScalaIterable` -} +import javax.ws.rs._ +import scala. jdk. CollectionConverters._ import scala.collection.mutable import scala.collection.mutable.ListBuffer -import scala.jdk.CollectionConverters._ import scala.jdk.OptionConverters._ -import scala.util.Using import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} +import scala.util.{Failure, Success, Try, Using} object DatasetResource { val DATASET_IS_PUBLIC: Byte = 1; @@ -282,7 +220,7 @@ object DatasetResource { .foreach(pathStr => { // TODO: refactor this part val (_, _, _, fileRelativePath) = FileResolver.parseFileNameForDataset(context, pathStr) - fileRelativePath + fileRelativePath.asScala .map { path => filesToRemove += datasetPath .resolve(path) // When path exists, resolve it and add to filesToRemove @@ -352,7 +290,7 @@ object DatasetResource { .orderBy(DATASET_VERSION.CREATION_TIME.desc()) // or .asc() for ascending .fetchInto(classOf[DatasetVersion]) - result.toList + result.asScala.toList } // apply the dataset operation to create a new dataset version @@ -420,7 +358,7 @@ object 
DatasetResource { .into(classOf[DatasetVersion]), DatasetFileNode.fromPhysicalFileNodes( Map( - (ownerEmail, datasetName, versionName) -> physicalFileNodes.toList + (ownerEmail, datasetName, versionName) -> physicalFileNodes.asScala.toList ) ) ) @@ -808,7 +746,7 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }) + }).asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist @@ -851,7 +789,7 @@ class DatasetResource { PathUtils.getDatasetPath(did), version.getVersionHash ) - .toList) + .asScala) } DashboardDatasetVersion( version, @@ -918,7 +856,7 @@ class DatasetResource { datasetPath, latestVersion.getVersionHash ) - .toList + .asScala.toList ) ) .head @@ -951,7 +889,7 @@ class DatasetResource { val size = calculateDatasetVersionSize(did, dvid) val ownerFileNode = DatasetFileNode .fromPhysicalFileNodes( - Map((dataset.ownerEmail, datasetName, datasetVersion.getName) -> fileNodes.toList) + Map((dataset.ownerEmail, datasetName, datasetVersion.getName) -> fileNodes.asScala.toList) ) .head @@ -1003,7 +941,7 @@ class DatasetResource { val streamingOutput = new StreamingOutput() { override def write(output: OutputStream): Unit = { - fileRelativePath + fileRelativePath.asScala .foreach { path => GitVersionControlLocalFileStorage.retrieveFileContentOfVersion( targetDatasetPath, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 8908970c522..5d06175ad25 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -1,24 +1,18 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction - -import java.nio.file.{Files, Path, Paths} -import 
edu.uci.ics.amber.engine.common.storage.{ - DatasetFileDocument, - ReadonlyLocalFileDocument, - ReadonlyVirtualDocument, - VirtualDocument -} +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} import edu.uci.ics.texera.web.SqlServer -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET -import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION +import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import org.apache.commons.vfs2.FileNotFoundException import org.jooq.DSLContext -import java.net.{URI, URLDecoder, URLEncoder} +import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder} import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path, Paths} import scala.util.{Success, Try} object FileResolver { @@ -34,10 +28,10 @@ object FileResolver { * @return Either[String, DatasetFileDocument] - the resolved path as a String or a DatasetFileDocument */ def resolve(fileName: String): URI = { - val resolvers: List[String => URI] = List(localResolveFunc, datasetResolveFunc) + val resolvers: Seq[String => URI] = Seq(localResolveFunc, datasetResolveFunc) // Try each resolver function in sequence - resolvers.iterator + resolvers .map(resolver => Try(resolver(fileName))) .collectFirst { case Success(output) => output @@ -85,11 +79,10 @@ object FileResolver { */ private def localResolveFunc(fileName: String): URI = { val filePath = Paths.get(fileName) - if (Files.exists(filePath)) { - filePath.toUri // File exists locally, return the path as a string in the Left - } else { + if (!Files.exists(filePath)) { throw new FileNotFoundException(s"Local file $fileName does not exist") } 
+ filePath.toUri } /** @@ -104,12 +97,6 @@ object FileResolver { * @return Either[String, DatasetFileDocument] - Right(document) if creation succeeds * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ - - import java.net.{URI, URISyntaxException, URLEncoder} - import java.nio.charset.StandardCharsets - import java.nio.file.Path - import org.apache.commons.vfs2.FileNotFoundException - private def datasetResolveFunc(fileName: String): URI = { withTransaction(SqlServer.createDSLContext()) { ctx => val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) From bb576ffb1daae6dc0e1edb14fab2314af56cfb6c Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 19:11:44 -0700 Subject: [PATCH 08/18] save working version --- .../common/storage/DatasetFileDocument.scala | 23 +++- .../user/dataset/DatasetResource.scala | 104 ++++++--------- .../workflow/common/storage/FileOpener.scala | 23 ++++ .../common/storage/FileResolver.scala | 121 ++++++------------ .../source/scan/FileScanSourceOpExec.scala | 4 +- .../source/scan/ScanSourceOpDesc.scala | 5 +- .../source/scan/csv/CSVScanSourceOpDesc.scala | 4 +- .../source/scan/csv/CSVScanSourceOpExec.scala | 4 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 6 +- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 4 +- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 11 +- .../scan/json/JSONLScanSourceOpDesc.scala | 6 +- .../scan/json/JSONLScanSourceOpExec.scala | 4 +- 13 files changed, 140 insertions(+), 179 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 22d7672d1d4..6c2c8cd38cf 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -5,11 +5,22 @@ import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import org.jooq.types.UInteger import java.io.{File, FileOutputStream, InputStream} -import java.net.URI -import java.nio.file.{Files, Path} +import java.net.{URI, URLDecoder} +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} -class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath: Path) +class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { + // Extract path components and decode them + private val pathParts = uri.getPath + .stripPrefix("/") + .split("/") + .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + + private val did = pathParts(0).toInt + private val datasetVersionHash = pathParts(1) + private val fileRelativePath = Paths.get(pathParts.drop(2).head, pathParts.drop(2).tail: _*) + private var tempFile: Option[File] = None override def getURI: URI = @@ -53,9 +64,15 @@ class DatasetFileDocument(did: Int, datasetVersionHash: String, fileRelativePath } override def remove(): Unit = { + // first remove the temporary file tempFile match { case Some(file) => Files.delete(file.toPath) case None => // Do nothing } + // then remove the dataset file + GitVersionControlLocalFileStorage.removeFileFromRepo( + PathUtils.getDatasetPath(UInteger.valueOf(did)), + PathUtils.getDatasetPath(UInteger.valueOf(did)).resolve(fileRelativePath) + ) } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 752278f0526..bf84370cd8b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -1,6 +1,7 @@ package edu.uci.ics.texera.web.resource.dashboard.user.dataset import edu.uci.ics.amber.engine.common.Utils.withTransaction +import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.jooq.generated.enums.DatasetUserAccessPrivilege @@ -24,7 +25,7 @@ import org.jooq.{DSLContext, EnumType} import play.api.libs.json.Json import java.io.{IOException, InputStream, OutputStream} -import java.net.URLDecoder +import java.net.{URI, URLDecoder} import java.nio.charset.StandardCharsets import java.nio.file.Files import java.util @@ -33,7 +34,7 @@ import java.util.zip.{ZipEntry, ZipOutputStream} import javax.annotation.security.RolesAllowed import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import javax.ws.rs._ -import scala. jdk. CollectionConverters._ +import scala.jdk.CollectionConverters._ import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.jdk.OptionConverters._ @@ -182,7 +183,7 @@ object DatasetResource { // DatasetOperation defines the operations that will be applied when creating a new dataset version private case class DatasetOperation( filesToAdd: Map[java.nio.file.Path, InputStream], - filesToRemove: List[java.nio.file.Path] + filesToRemove: List[URI] ) private def parseUserUploadedFormToDatasetOperations( @@ -193,7 +194,7 @@ object DatasetResource { // Mutable collections for constructing DatasetOperation val filesToAdd = mutable.Map[java.nio.file.Path, InputStream]() - val filesToRemove = mutable.ListBuffer[java.nio.file.Path]() + val filesToRemove = mutable.ListBuffer[URI]() val fields = multiPart.getFields.keySet.iterator() // Get all field names @@ -218,13 +219,7 @@ object DatasetResource { .parse(filePathsValue) .as[List[String]] .foreach(pathStr => { - // TODO: 
refactor this part - val (_, _, _, fileRelativePath) = FileResolver.parseFileNameForDataset(context, pathStr) - fileRelativePath.asScala - .map { path => - filesToRemove += datasetPath - .resolve(path) // When path exists, resolve it and add to filesToRemove - } + filesToRemove += FileResolver.resolve(pathStr) }) } } @@ -328,11 +323,8 @@ object DatasetResource { GitVersionControlLocalFileStorage.writeFileToRepo(datasetPath, filePath, fileStream) } - datasetOperation.filesToRemove.foreach { filePath => - GitVersionControlLocalFileStorage.removeFileFromRepo( - datasetPath, - filePath - ) + datasetOperation.filesToRemove.foreach { fileUri => + new DatasetFileDocument(fileUri).remove() } } ) @@ -690,18 +682,16 @@ class DatasetResource { @Auth user: SessionUser, @QueryParam("includeVersions") includeVersions: Boolean = false, @QueryParam("includeFileNodes") includeFileNodes: Boolean = false, - @QueryParam("path") filePathStr: String + @QueryParam("did") datasetId: UInteger, ): ListDatasetsResponse = { val uid = user.getUid withTransaction(context)(ctx => { var accessibleDatasets: ListBuffer[DashboardDataset] = ListBuffer() - if (filePathStr != null && filePathStr.nonEmpty) { - // if the file path is given, then only fetch the dataset and version this file is belonging to - val decodedPathStr = URLDecoder.decode(filePathStr, StandardCharsets.UTF_8.name()) - val (ownerEmail, dataset, version, _) = - FileResolver.parseFileNameForDataset(ctx, decodedPathStr) - val accessPrivilege = getDatasetUserAccessPrivilege(ctx, dataset.getDid, uid) + if (datasetId != null) { + // if dataset id is given, retrieve only one dataset + val dataset = getDatasetByID(ctx, datasetId) + val accessPrivilege = getDatasetUserAccessPrivilege(ctx, datasetId, uid) if ( accessPrivilege == DatasetUserAccessPrivilege.NONE && dataset.getIsPublic == DATASET_IS_PRIVATE ) { @@ -709,15 +699,10 @@ class DatasetResource { } accessibleDatasets = accessibleDatasets :+ DashboardDataset( dataset = dataset, - 
ownerEmail = ownerEmail, + ownerEmail = getOwner(ctx, datasetId).getEmail, accessPrivilege = accessPrivilege, isOwner = dataset.getOwnerUid == uid, - versions = List( - DashboardDatasetVersion( - datasetVersion = version, - fileNodes = List() - ) - ), + versions = List(), size = calculateLatestDatasetVersionSize(dataset.getDid) ) } else { @@ -789,7 +774,7 @@ class DatasetResource { PathUtils.getDatasetPath(did), version.getVersionHash ) - .asScala) + .asScala.toList) } DashboardDatasetVersion( version, @@ -926,30 +911,20 @@ class DatasetResource { val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) withTransaction(context)(ctx => { - val (_, dataset, dsVersion, fileRelativePath) = - FileResolver.parseFileNameForDataset(ctx, decodedPathStr) - - val did = dataset.getDid - val dvid = dsVersion.getDvid - - if (!userHasReadAccess(ctx, dataset.getDid, uid)) { - throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) - } - - val targetDatasetPath = PathUtils.getDatasetPath(did) - val datasetVersion = getDatasetVersionByID(ctx, dvid) - + val fileUri = FileResolver.resolve(decodedPathStr) val streamingOutput = new StreamingOutput() { override def write(output: OutputStream): Unit = { - fileRelativePath.asScala - .foreach { path => - GitVersionControlLocalFileStorage.retrieveFileContentOfVersion( - targetDatasetPath, - datasetVersion.getVersionHash, - targetDatasetPath.resolve(path), - output - ) + val inputStream = new DatasetFileDocument(fileUri).asInputStream() + try { + val buffer = new Array[Byte](8192) // buffer size + var bytesRead = inputStream.read(buffer) + while (bytesRead != -1) { + output.write(buffer, 0, bytesRead) + bytesRead = inputStream.read(buffer) } + } finally { + inputStream.close() + } } } @@ -987,16 +962,21 @@ class DatasetResource { @GET @Path("/version-zip") def retrieveDatasetVersionZip( - @QueryParam("path") pathStr: String, - @QueryParam("getLatest") getLatest: Boolean, @QueryParam("did") did: UInteger, 
+ @QueryParam("dvid") dvid: UInteger, @Auth user: SessionUser ): Response = { - val (dataset, version) = if (getLatest) { + val (dataset, version) = if (dvid == null) { getLatestVersionInfo(did, user) } else { - resolveAndValidatePath(pathStr, user) + withTransaction(context) {ctx => + if (!userHasReadAccess(ctx, did, dvid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + (getDatasetByID(ctx, did), getDatasetVersionByID(ctx, dvid)) + } } + val targetDatasetPath = PathUtils.getDatasetPath(dataset.getDid) val fileNodes = GitVersionControlLocalFileStorage.retrieveRootFileNodesOfVersion( targetDatasetPath, @@ -1053,18 +1033,6 @@ class DatasetResource { .build() } - private def resolveAndValidatePath( - pathStr: String, - user: SessionUser - ): (Dataset, DatasetVersion) = { - val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) - val (_, dataset, dsVersion, _) = - FileResolver.parseFileNameForDataset(context, decodedPathStr) - - validateUserAccess(dataset.getDid, user.getUid) - (dataset, dsVersion) - } - private def getLatestVersionInfo(did: UInteger, user: SessionUser): (Dataset, DatasetVersion) = { validateUserAccess(did, user.getUid) val dataset = getDatasetByID(context, did) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala new file mode 100644 index 00000000000..08de3b603c5 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala @@ -0,0 +1,23 @@ +package edu.uci.ics.texera.workflow.common.storage + +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME + +import java.net.URI + +object FileOpener { + type FileHandle = ReadonlyVirtualDocument[_] + def openFile(fileUri: 
URI): FileHandle = { + fileUri.getScheme match { + case DATASET_FILE_URI_SCHEME => + new DatasetFileDocument(fileUri) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index 5d06175ad25..c8bee7d7669 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -1,24 +1,20 @@ package edu.uci.ics.texera.workflow.common.storage import edu.uci.ics.amber.engine.common.Utils.withTransaction -import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} import edu.uci.ics.texera.web.SqlServer import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import org.apache.commons.vfs2.FileNotFoundException -import org.jooq.DSLContext -import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder} +import java.net.{URI, URLEncoder} import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Path, Paths} +import java.nio.file.{Files, Paths} import scala.util.{Success, Try} object FileResolver { - type FileHandle = ReadonlyVirtualDocument[_] - - private val DatasetFileUriScheme = "vfs" + val DATASET_FILE_URI_SCHEME = "vfs" /** * Attempts to resolve the given fileName using a list of resolver functions. 
@@ -39,39 +35,6 @@ object FileResolver { .getOrElse(throw new FileNotFoundException(fileName)) } - /** - * Open a file handle for the given fileUri - * @param fileUri the uri pointing to the file - * @return - */ - def open(fileUri: URI): FileHandle = { - fileUri.getScheme match { - case DatasetFileUriScheme => - // Extract path components and decode them - val pathParts = fileUri.getPath - .stripPrefix("/") - .split("/") - .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) - - if (pathParts.length < 3) { - throw new RuntimeException(s"Invalid dataset URI format: ${fileUri.toString}") - } - - new DatasetFileDocument( - did = pathParts(0).toInt, - datasetVersionHash = pathParts(1), - fileRelativePath = Paths.get(pathParts.drop(2).mkString("/")) - ) - - case "file" => - // For local files, create a ReadonlyLocalFileDocument - new ReadonlyLocalFileDocument(fileUri) - - case _ => - throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") - } - } - /** * Attempts to resolve a local file path. * @throws FileNotFoundException if the local file does not exist @@ -90,7 +53,7 @@ object FileResolver { * * The fileName format should be: /ownerEmail/datasetName/versionName/fileRelativePath * e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv - * The output dataset URI format is: {DatasetFileUriScheme}:///{did}/{versionHash}/file-path + * The output dataset URI format is: {DATASET_FILE_URI_SCHEME}:///{did}/{versionHash}/file-path * e.g. 
vfs:///15/adeq233td/some/dir/file.txt * * @param fileName the name of the file to attempt resolving as a DatasetFileDocument @@ -98,54 +61,50 @@ object FileResolver { * @throws FileNotFoundException if the dataset file does not exist or cannot be created */ private def datasetResolveFunc(fileName: String): URI = { - withTransaction(SqlServer.createDSLContext()) { ctx => - val (_, dataset, datasetVersion, fileRelativePath) = parseFileNameForDataset(ctx, fileName) + val filePath = Paths.get(fileName) + val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray - if (dataset == null || datasetVersion == null) { - throw new FileNotFoundException(s"Dataset file $fileName not found.") + // extract info from the user-given fileName + val ownerEmail = pathSegments(0) + val datasetName = pathSegments(1) + val versionName = pathSegments(2) + val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + + // fetch the dataset and version from DB to get dataset ID and version hash + val (dataset, datasetVersion) = + withTransaction(SqlServer.createDSLContext()) { ctx => + // fetch the dataset from DB + val dataset = ctx + .select(DATASET.fields: _*) + .from(DATASET) + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + .where(USER.EMAIL.eq(ownerEmail)) + .and(DATASET.NAME.eq(datasetName)) + .fetchOneInto(classOf[Dataset]) + + // fetch the dataset version from DB + val datasetVersion = ctx + .selectFrom(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(dataset.getDid)) + .and(DATASET_VERSION.NAME.eq(versionName)) + .fetchOneInto(classOf[DatasetVersion]) + + if (dataset == null || datasetVersion == null) { + throw new FileNotFoundException(s"Dataset file $fileName not found.") + } + (dataset, datasetVersion) } // Construct path as /{did}/{versionHash}/file-path - val did = dataset.getDid.intValue() - val versionHash = datasetVersion.getVersionHash val encodedPath = - 
s"/$did/$versionHash/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" + s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, StandardCharsets.UTF_8)).mkString("/")}" try { - new URI(DatasetFileUriScheme, null, encodedPath, null) + new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) } catch { - case e: URISyntaxException => - throw e + case e: Exception => + throw new FileNotFoundException(s"Dataset file $fileName not found.") } } - } - - def parseFileNameForDataset( - ctx: DSLContext, - fileName: String - ): (String, Dataset, DatasetVersion, Path) = { - val filePath = Paths.get(fileName) - val pathSegments = (0 until filePath.getNameCount).map(filePath.getName(_).toString).toArray - - val ownerEmail = pathSegments(0) - val datasetName = pathSegments(1) - val versionName = pathSegments(2) - val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) - - val dataset = ctx - .select(DATASET.fields: _*) - .from(DATASET) - .leftJoin(USER) - .on(USER.UID.eq(DATASET.OWNER_UID)) - .where(USER.EMAIL.eq(ownerEmail)) - .and(DATASET.NAME.eq(datasetName)) - .fetchOneInto(classOf[Dataset]) - - val datasetVersion = ctx - .selectFrom(DATASET_VERSION) - .where(DATASET_VERSION.DID.eq(dataset.getDid)) - .and(DATASET_VERSION.NAME.eq(versionName)) - .fetchOneInto(classOf[DatasetVersion]) - (ownerEmail, dataset, datasetVersion, fileRelativePath) - } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 5756252995d..8878d9f2092 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -4,7 +4,7 @@ import 
edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray @@ -27,7 +27,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = FileResolver.open(new URI(fileUri)).asInputStream() + val is = FileOpener.openFile(new URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 82817358304..c638abfd701 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -5,17 +5,16 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.Schema -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} import edu.uci.ics.amber.engine.common.workflow.OutputPort 
import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME import org.apache.commons.lang3.builder.EqualsBuilder import java.net.URI abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { - /** in the case we do not want to read the entire large file, but only * the first a few lines of it to do the type inference. */ diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 2cc906bce4d..9227fd887d2 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, FileInputStream, IOException, InputStreamReader} @@ -74,7 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() + val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() val inputReader = new 
InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index 05942761d7b..7213bb0456a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -6,7 +6,7 @@ import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader @@ -70,7 +70,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - FileResolver.open(new URI(fileUri)).asInputStream(), + FileOpener.openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 66ecde61726..534fe28a154 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import 
edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} @@ -41,7 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val file = FileResolver.open(new URI(fileUri.get)).asFile() + val file = FileOpener.openFile(new URI(fileUri.get)).asFile() val totalBytes: Long = file.length() PhysicalOp @@ -82,7 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileResolver.open(new URI(fileUri.get)).asFile() + val file = FileOpener.openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 5524e6ee51a..d5a32cfa8a9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import 
edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, IOException} @@ -72,7 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileResolver.open(new URI(fileUri.get)).asFile() + val file = FileOpener.openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 7c11005d643..fb764793b8f 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,13 +2,8 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{ - Attribute, - AttributeTypeUtils, - Schema, - TupleLike -} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.net.URI @@ -53,7 +48,7 @@ class 
CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } - val filePath = FileResolver.open(new URI(fileUri)).asFile().toPath + val filePath = FileOpener.openFile(new URI(fileUri)).asFile().toPath reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 99547e0182a..79542ffee07 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, Workf import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -39,7 +39,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() + val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each 
worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -85,7 +85,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val stream = FileResolver.open(new URI(fileUri.get)).asInputStream() + val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index f00a07802fc..43c4cdfff87 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} -import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -44,7 +44,7 @@ class JSONLScanSourceOpExec private[json] ( schema = schemaFunc() reader = new BufferedReader( new InputStreamReader( - FileResolver.open(new URI(fileUri)).asInputStream(), + FileOpener.openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) ) From 4fa96d131eeca2696135e107db11326f358b7769 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 27 Oct 2024 19:12:19 -0700 Subject: [PATCH 09/18] fmt 
--- .../common/storage/DatasetFileDocument.scala | 3 +- .../user/dataset/DatasetResource.scala | 31 ++++++++++++++----- .../workflow/common/storage/FileOpener.scala | 6 +++- .../common/storage/FileResolver.scala | 21 +++++++------ .../source/scan/ScanSourceOpDesc.scala | 7 ++++- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 7 ++++- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 6c2c8cd38cf..8b02580c00d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -9,8 +9,7 @@ import java.net.{URI, URLDecoder} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} -class DatasetFileDocument(uri: URI) - extends VirtualDocument[Nothing] { +class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { // Extract path components and decode them private val pathParts = uri.getPath .stripPrefix("/") diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index bf84370cd8b..d39e4395ca5 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -9,11 +9,23 @@ import edu.uci.ics.texera.web.model.jooq.generated.tables.Dataset.DATASET import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS import edu.uci.ics.texera.web.model.jooq.generated.tables.DatasetVersion.DATASET_VERSION import 
edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER -import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, DatasetVersionDao} -import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetUserAccess, DatasetVersion, User} +import edu.uci.ics.texera.web.model.jooq.generated.tables.daos.{ + DatasetDao, + DatasetUserAccessDao, + DatasetVersionDao +} +import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{ + Dataset, + DatasetUserAccess, + DatasetVersion, + User +} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource._ import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{context, _} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{DatasetFileNode, PhysicalFileNode} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ + DatasetFileNode, + PhysicalFileNode +} import edu.uci.ics.texera.web.resource.dashboard.user.dataset.service.GitVersionControlLocalFileStorage import edu.uci.ics.texera.web.resource.dashboard.user.dataset.utils.PathUtils import edu.uci.ics.texera.workflow.common.storage.FileResolver @@ -682,7 +694,7 @@ class DatasetResource { @Auth user: SessionUser, @QueryParam("includeVersions") includeVersions: Boolean = false, @QueryParam("includeFileNodes") includeFileNodes: Boolean = false, - @QueryParam("did") datasetId: UInteger, + @QueryParam("did") datasetId: UInteger ): ListDatasetsResponse = { val uid = user.getUid withTransaction(context)(ctx => { @@ -731,7 +743,8 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }).asScala + }) + .asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist @@ -774,7 +787,8 @@ class DatasetResource { PathUtils.getDatasetPath(did), version.getVersionHash ) - .asScala.toList) + .asScala + .toList) } DashboardDatasetVersion( version, @@ -841,7 +855,8 
@@ class DatasetResource { datasetPath, latestVersion.getVersionHash ) - .asScala.toList + .asScala + .toList ) ) .head @@ -969,7 +984,7 @@ class DatasetResource { val (dataset, version) = if (dvid == null) { getLatestVersionInfo(did, user) } else { - withTransaction(context) {ctx => + withTransaction(context) { ctx => if (!userHasReadAccess(ctx, did, dvid)) { throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala index 08de3b603c5..308d25b33ec 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala @@ -1,6 +1,10 @@ package edu.uci.ics.texera.workflow.common.storage -import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} +import edu.uci.ics.amber.engine.common.storage.{ + DatasetFileDocument, + ReadonlyLocalFileDocument, + ReadonlyVirtualDocument +} import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME import java.net.URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index c8bee7d7669..fb4f5b70967 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -96,15 +96,18 @@ object FileResolver { (dataset, datasetVersion) } - // Construct path as /{did}/{versionHash}/file-path - val encodedPath = - s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString.split("/").map(URLEncoder.encode(_, 
StandardCharsets.UTF_8)).mkString("/")}" + // Construct path as /{did}/{versionHash}/file-path + val encodedPath = + s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString + .split("/") + .map(URLEncoder.encode(_, StandardCharsets.UTF_8)) + .mkString("/")}" - try { - new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) - } catch { - case e: Exception => - throw new FileNotFoundException(s"Dataset file $fileName not found.") - } + try { + new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) + } catch { + case e: Exception => + throw new FileNotFoundException(s"Dataset file $fileName not found.") } + } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index c638abfd701..3b353b1adfe 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -5,7 +5,11 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.Schema -import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, ReadonlyLocalFileDocument, ReadonlyVirtualDocument} +import edu.uci.ics.amber.engine.common.storage.{ + DatasetFileDocument, + ReadonlyLocalFileDocument, + ReadonlyVirtualDocument +} import edu.uci.ics.amber.engine.common.workflow.OutputPort import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor @@ -15,6 +19,7 @@ import org.apache.commons.lang3.builder.EqualsBuilder import java.net.URI abstract class ScanSourceOpDesc 
extends SourceOperatorDescriptor { + /** in the case we do not want to read the entire large file, but only * the first a few lines of it to do the type inference. */ diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index fb764793b8f..80eed48ea76 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -2,7 +2,12 @@ package edu.uci.ics.texera.workflow.operators.source.scan.csvOld import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike} +import edu.uci.ics.amber.engine.common.model.tuple.{ + Attribute, + AttributeTypeUtils, + Schema, + TupleLike +} import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod From e84070b92cc06e732a4018070ce21fc3b2430b5b Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 13:14:16 -0700 Subject: [PATCH 10/18] merge dataset resource --- .../web/resource/dashboard/user/dataset/DatasetResource.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 49e5c5ccecf..b0d83ab5b21 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -724,7 +724,7 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }) + }).asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist From 402f413c424029fbc797082f9426a223a8a38d30 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 13:35:29 -0700 Subject: [PATCH 11/18] move open to virtual document --- .../common/storage/VirtualDocument.scala | 20 ++++++++++++++ .../user/dataset/DatasetResource.scala | 3 ++- .../workflow/common/storage/FileOpener.scala | 27 ------------------- .../source/scan/FileScanSourceOpExec.scala | 5 ++-- .../source/scan/csv/CSVScanSourceOpDesc.scala | 4 +-- .../source/scan/csv/CSVScanSourceOpExec.scala | 5 ++-- .../csv/ParallelCSVScanSourceOpDesc.scala | 6 ++--- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 6 ++--- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 4 +-- .../scan/json/JSONLScanSourceOpDesc.scala | 6 ++--- .../scan/json/JSONLScanSourceOpExec.scala | 5 ++-- 11 files changed, 41 insertions(+), 50 deletions(-) delete mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index 0fb0c1e7897..b586958d674 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -1,8 +1,28 @@ package edu.uci.ics.amber.engine.common.storage +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME + import java.io.{File, InputStream} import java.net.URI +object VirtualDocument { + type FileHandle = ReadonlyVirtualDocument[_] + + def 
openFile(fileUri: URI): FileHandle = { + fileUri.getScheme match { + case DATASET_FILE_URI_SCHEME => + new DatasetFileDocument(fileUri) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } +} + /** * TODO: break this base definition into more self-contained pieces, including Writeonly, IteratorBased * VirtualDocument provides the abstraction of doing read/write/copy/delete operations over a single resource in Texera system. diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index b0d83ab5b21..742545af59e 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -724,7 +724,8 @@ class DatasetResource { ownerEmail = ownerEmail, size = calculateLatestDatasetVersionSize(dataset.getDid) ) - }).asScala + }) + .asScala ) // then we fetch the public datasets and merge it as a part of the result if not exist diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala deleted file mode 100644 index 308d25b33ec..00000000000 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileOpener.scala +++ /dev/null @@ -1,27 +0,0 @@ -package edu.uci.ics.texera.workflow.common.storage - -import edu.uci.ics.amber.engine.common.storage.{ - DatasetFileDocument, - ReadonlyLocalFileDocument, - ReadonlyVirtualDocument -} -import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME - -import java.net.URI - -object 
FileOpener { - type FileHandle = ReadonlyVirtualDocument[_] - def openFile(fileUri: URI): FileHandle = { - fileUri.getScheme match { - case DATASET_FILE_URI_SCHEME => - new DatasetFileDocument(fileUri) - - case "file" => - // For local files, create a ReadonlyLocalFileDocument - new ReadonlyLocalFileDocument(fileUri) - - case _ => - throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") - } - } -} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 8878d9f2092..702c3f8a0db 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -2,9 +2,8 @@ package edu.uci.ics.texera.workflow.operators.source.scan import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray @@ -27,7 +26,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = FileOpener.openFile(new URI(fileUri)).asInputStream() + val is = openFile(new URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new 
BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 9227fd887d2..47489ea64d6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{File, FileInputStream, IOException, InputStreamReader} @@ -74,7 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() + val stream = openFile(new URI(fileUri.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index 7213bb0456a..0117cf02ba0 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -5,8 +5,7 @@ import 
edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader @@ -70,7 +69,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - FileOpener.openFile(new URI(fileUri)).asInputStream(), + openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 534fe28a154..6c589169104 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import 
java.io.{File, IOException} @@ -41,7 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val file = FileOpener.openFile(new URI(fileUri.get)).asFile() + val file = openFile(new URI(fileUri.get)).asFile() val totalBytes: Long = file.length() PhysicalOp @@ -82,7 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileOpener.openFile(new URI(fileUri.get)).asFile() + val file = openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index d5a32cfa8a9..7498877171b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,10 +9,10 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc -import java.io.{File, IOException} +import java.io.IOException import java.net.URI class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { @@ 
-72,7 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty) { return null } - val file = FileOpener.openFile(new URI(fileUri.get)).asFile() + val file = openFile(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index 80eed48ea76..cf455e66d70 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -8,7 +8,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{ Schema, TupleLike } -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.net.URI @@ -53,7 +53,7 @@ class CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } - val filePath = FileOpener.openFile(new URI(fileUri)).asFile().toPath + val filePath = openFile(new URI(fileUri)).asFile().toPath reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 
79542ffee07..6a1eeae9bc8 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, Workf import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -39,7 +39,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() + val stream = openFile(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -85,7 +85,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val stream = FileOpener.openFile(new URI(fileUri.get)).asInputStream() + val stream = openFile(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index 43c4cdfff87..5553f676123 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -1,11 +1,10 @@ package edu.uci.ics.texera.workflow.operators.source.scan.json import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} -import edu.uci.ics.texera.workflow.common.storage.{FileOpener, FileResolver} +import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -44,7 +43,7 @@ class JSONLScanSourceOpExec private[json] ( schema = schemaFunc() reader = new BufferedReader( new InputStreamReader( - FileOpener.openFile(new URI(fileUri)).asInputStream(), + openFile(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) ) From 4d8e72bc8c6f6e7b3ef6b38178b343329f4de8ab Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 17:32:47 -0700 Subject: [PATCH 12/18] fix tests --- .../source/scan/csv/CSVScanSourceOpDesc.scala | 2 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 2 +- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 2 +- .../scan/json/JSONLScanSourceOpDesc.scala | 3 +++ .../scan/csv/CSVScanSourceOpDescSpec.scala | 11 +++++++++ .../scan/text/FileScanSourceOpDescSpec.scala | 24 ++++++++----------- 6 files changed, 27 insertions(+), 17 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 47489ea64d6..d61ab6f7fcb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -70,7 +70,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - if (customDelimiter.isEmpty) { + if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 6c589169104..57af21d4a47 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -79,7 +79,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - if (customDelimiter.isEmpty) { + if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } val file = openFile(new URI(fileUri.get)).asFile() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 7498877171b..8bc67f5629d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -69,7 +69,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): 
Schema = { - if (customDelimiter.isEmpty) { + if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } val file = openFile(new URI(fileUri.get)).asFile() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 6a1eeae9bc8..066203e115e 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -85,6 +85,9 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { + if (fileUri.isEmpty) { + return null + } val stream = openFile(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala index 21ce3de60a0..858b2188309 100644 --- a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDescSpec.scala @@ -4,6 +4,7 @@ import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.{AttributeType, Schema} import edu.uci.ics.amber.engine.common.workflow.PortIdentity import WorkflowContext.{DEFAULT_EXECUTION_ID, DEFAULT_WORKFLOW_ID} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -27,6 +28,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with 
BeforeAndAfter { parallelCsvScanSourceOpDesc.customDelimiter = Some(",") parallelCsvScanSourceOpDesc.hasHeader = true parallelCsvScanSourceOpDesc.setContext(workflowContext) + parallelCsvScanSourceOpDesc.setFileUri( + FileResolver.resolve(parallelCsvScanSourceOpDesc.fileName.get) + ) val inferredSchema: Schema = parallelCsvScanSourceOpDesc.inferSchema() assert(inferredSchema.getAttributes.length == 14) @@ -42,6 +46,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { parallelCsvScanSourceOpDesc.customDelimiter = Some(",") parallelCsvScanSourceOpDesc.hasHeader = false parallelCsvScanSourceOpDesc.setContext(workflowContext) + parallelCsvScanSourceOpDesc.setFileUri( + FileResolver.resolve(parallelCsvScanSourceOpDesc.fileName.get) + ) val inferredSchema: Schema = parallelCsvScanSourceOpDesc.inferSchema() @@ -56,6 +63,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(",") csvScanSourceOpDesc.hasHeader = true csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() @@ -70,6 +78,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(",") csvScanSourceOpDesc.hasHeader = false csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() @@ -85,6 +94,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(";") csvScanSourceOpDesc.hasHeader = false csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() @@ -100,6 +110,7 @@ class 
CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.customDelimiter = Some(";") csvScanSourceOpDesc.hasHeader = false csvScanSourceOpDesc.setContext(workflowContext) + csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) assert( !csvScanSourceOpDesc diff --git a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala index 7c16a38d05c..de1d60c51b8 100644 --- a/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/texera/workflow/operators/source/scan/text/FileScanSourceOpDescSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.texera.workflow.operators.source.scan.text import edu.uci.ics.amber.engine.common.model.tuple.{AttributeType, Schema, SchemaEnforceable, Tuple} +import edu.uci.ics.texera.workflow.common.storage.FileResolver import edu.uci.ics.texera.workflow.operators.source.scan.{ FileAttributeType, FileDecodingMethod, @@ -18,7 +19,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { before { fileScanSourceOpDesc = new FileScanSourceOpDesc() - fileScanSourceOpDesc.fileUri = Left(TestTextFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestTextFilePath).toASCIIString) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.UTF_8 } @@ -61,8 +62,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -87,13 +87,12 @@ class FileScanSourceOpDescSpec extends 
AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with CRLF separators into corresponding output tuples" in { - fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestCRLFTextFilePath).toASCIIString) fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -121,8 +120,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -148,12 +146,11 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text into corresponding output INTEGER tuples" in { - fileScanSourceOpDesc.fileUri = Left(TestNumbersFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestNumbersFilePath).toASCIIString) fileScanSourceOpDesc.attributeType = FileAttributeType.INTEGER fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, @@ -178,14 +175,13 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with US_ASCII encoding" in { - 
fileScanSourceOpDesc.fileUri = Left(TestCRLFTextFilePath) + fileScanSourceOpDesc.fileUri = Some(FileResolver.resolve(TestCRLFTextFilePath).toASCIIString) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.ASCII fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.left.getOrElse(""), - null, + fileScanSourceOpDesc.fileUri.getOrElse(""), fileScanSourceOpDesc.attributeType, fileScanSourceOpDesc.fileEncoding, fileScanSourceOpDesc.extract, From 5fa9ca7377b17cf96ae86c1b8b58565fa9de2a15 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Mon, 28 Oct 2024 17:40:33 -0700 Subject: [PATCH 13/18] fmt --- .../engine/common/executor/SourceOperatorExecutor.scala | 3 --- .../workflow/operators/source/scan/ScanSourceOpDesc.scala | 6 ------ .../operators/source/scan/csv/CSVScanSourceOpDesc.scala | 2 +- .../source/scan/csv/ParallelCSVScanSourceOpDesc.scala | 2 +- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala index d4c92b19a2a..453a41c5ac6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/executor/SourceOperatorExecutor.scala @@ -1,11 +1,8 @@ package edu.uci.ics.amber.engine.common.executor import edu.uci.ics.amber.engine.common.model.tuple.{Tuple, TupleLike} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument import edu.uci.ics.amber.engine.common.workflow.PortIdentity -import java.io.{FileInputStream, InputStream} - trait SourceOperatorExecutor extends OperatorExecutor { override def open(): Unit = {} diff --git 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala index 3b353b1adfe..46ff9196ef0 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/ScanSourceOpDesc.scala @@ -5,15 +5,9 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.engine.common.model.WorkflowContext import edu.uci.ics.amber.engine.common.model.tuple.Schema -import edu.uci.ics.amber.engine.common.storage.{ - DatasetFileDocument, - ReadonlyLocalFileDocument, - ReadonlyVirtualDocument -} import edu.uci.ics.amber.engine.common.workflow.OutputPort import edu.uci.ics.texera.workflow.common.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.texera.workflow.common.operators.source.SourceOperatorDescriptor -import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME import org.apache.commons.lang3.builder.EqualsBuilder import java.net.URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index d61ab6f7fcb..97eeab439e3 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -12,7 +12,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Sc import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc -import 
java.io.{File, FileInputStream, IOException, InputStreamReader} +import java.io.{IOException, InputStreamReader} import java.net.URI class CSVScanSourceOpDesc extends ScanSourceOpDesc { diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 57af21d4a47..76537550f4c 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -12,7 +12,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Sc import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc -import java.io.{File, IOException} +import java.io.IOException import java.net.URI class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { From 238b31cbd73d6618b87564a0d616469c5508e2da Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 15:03:12 -0700 Subject: [PATCH 14/18] fix naming --- .../common/storage/DocumentFactory.scala | 21 +++++++++++++++++++ .../common/storage/VirtualDocument.scala | 18 ---------------- .../source/scan/FileScanSourceOpExec.scala | 4 ++-- .../source/scan/csv/CSVScanSourceOpDesc.scala | 4 ++-- .../source/scan/csv/CSVScanSourceOpExec.scala | 4 ++-- .../csv/ParallelCSVScanSourceOpDesc.scala | 6 +++--- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 4 ++-- .../scan/csvOld/CSVOldScanSourceOpExec.scala | 4 ++-- .../scan/json/JSONLScanSourceOpDesc.scala | 7 +++---- .../scan/json/JSONLScanSourceOpExec.scala | 4 ++-- 10 files changed, 39 insertions(+), 37 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala diff --git 
a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala new file mode 100644 index 00000000000..f216504d096 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DocumentFactory.scala @@ -0,0 +1,21 @@ +package edu.uci.ics.amber.engine.common.storage + +import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME + +import java.net.URI + +object DocumentFactory { + def newReadonlyDocument(fileUri: URI): ReadonlyVirtualDocument[_] = { + fileUri.getScheme match { + case DATASET_FILE_URI_SCHEME => + new DatasetFileDocument(fileUri) + + case "file" => + // For local files, create a ReadonlyLocalFileDocument + new ReadonlyLocalFileDocument(fileUri) + + case _ => + throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") + } + } +} diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index b586958d674..7fbd5050556 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -5,24 +5,6 @@ import edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_ import java.io.{File, InputStream} import java.net.URI -object VirtualDocument { - type FileHandle = ReadonlyVirtualDocument[_] - - def openFile(fileUri: URI): FileHandle = { - fileUri.getScheme match { - case DATASET_FILE_URI_SCHEME => - new DatasetFileDocument(fileUri) - - case "file" => - // For local files, create a ReadonlyLocalFileDocument - new ReadonlyLocalFileDocument(fileUri) - - case _ => - throw new UnsupportedOperationException(s"Unsupported URI scheme: ${fileUri.getScheme}") - } - } -} - /** * TODO: break this base 
definition into more self-contained pieces, including Writeonly, IteratorBased * VirtualDocument provides the abstraction of doing read/write/copy/delete operations over a single resource in Texera system. diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala index 702c3f8a0db..e7c7be0768a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/FileScanSourceOpExec.scala @@ -3,7 +3,7 @@ package edu.uci.ics.texera.workflow.operators.source.scan import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.model.tuple.TupleLike import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory} import org.apache.commons.io.IOUtils.toByteArray @@ -26,7 +26,7 @@ class FileScanSourceOpExec private[scan] ( override def produceTuple(): Iterator[TupleLike] = { var filenameIt: Iterator[String] = Iterator.empty val fileEntries: Iterator[InputStream] = { - val is = openFile(new URI(fileUri)).asInputStream() + val is = DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream() if (extract) { val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream( new BufferedInputStream(is) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala index 97eeab439e3..5e5ccdfd26b 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.{IOException, InputStreamReader} @@ -74,7 +74,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { return null } - val stream = openFile(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala index 0117cf02ba0..245e67e1dfe 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/CSVScanSourceOpExec.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.workflow.PortIdentity import edu.uci.ics.amber.engine.common.{CheckpointState, CheckpointSupport} import edu.uci.ics.amber.engine.common.model.tuple.{AttributeTypeUtils, Schema, TupleLike} -import 
edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.io.InputStreamReader @@ -69,7 +69,7 @@ class CSVScanSourceOpExec private[csv] ( override def open(): Unit = { inputReader = new InputStreamReader( - openFile(new URI(fileUri)).asInputStream(), + DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 76537550f4c..4496fd3fd18 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.{DocumentFactory, VirtualDocument} import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.IOException @@ -41,7 +41,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here // TODO: consider a better way - val file = openFile(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() val 
totalBytes: Long = file.length() PhysicalOp @@ -82,7 +82,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } - val file = openFile(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 8bc67f5629d..38625b62b0f 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.IOException @@ -72,7 +72,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { if (customDelimiter.isEmpty || fileUri.isEmpty) { return null } - val file = openFile(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } diff --git 
a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala index cf455e66d70..f6b2bd96c16 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csvOld/CSVOldScanSourceOpExec.scala @@ -8,7 +8,7 @@ import edu.uci.ics.amber.engine.common.model.tuple.{ Schema, TupleLike } -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import java.net.URI @@ -53,7 +53,7 @@ class CSVOldScanSourceOpExec private[csvOld] ( implicit object CustomFormat extends DefaultCSVFormat { override val delimiter: Char = customDelimiter.get.charAt(0) } - val filePath = openFile(new URI(fileUri)).asFile().toPath + val filePath = DocumentFactory.newReadonlyDocument(new URI(fileUri)).asFile().toPath reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala index 066203e115e..c622a6e8853 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpDesc.scala @@ -4,12 +4,11 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} 
import com.fasterxml.jackson.databind.JsonNode import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.OpExecInitInfo import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} -import edu.uci.ics.amber.engine.common.storage.DatasetFileDocument +import edu.uci.ics.amber.engine.common.storage.{DatasetFileDocument, DocumentFactory} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, Schema} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -39,7 +38,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = openFile(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() // count lines and partition the task to each worker val reader = new BufferedReader( new InputStreamReader(stream, fileEncoding.getCharset) @@ -88,7 +87,7 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { if (fileUri.isEmpty) { return null } - val stream = openFile(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset)) var fieldNames = Set[String]() diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala index 5553f676123..f58f9adcb8d 100644 
--- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/json/JSONLScanSourceOpExec.scala @@ -4,7 +4,7 @@ import edu.uci.ics.amber.engine.common.executor.SourceOperatorExecutor import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.parseField import edu.uci.ics.amber.engine.common.model.tuple.{Schema, TupleLike} -import edu.uci.ics.amber.engine.common.storage.VirtualDocument.openFile +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.FileDecodingMethod import edu.uci.ics.texera.workflow.operators.source.scan.json.JSONUtil.JSONToMap @@ -43,7 +43,7 @@ class JSONLScanSourceOpExec private[json] ( schema = schemaFunc() reader = new BufferedReader( new InputStreamReader( - openFile(new URI(fileUri)).asInputStream(), + DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream(), fileEncoding.getCharset ) ) From ecad8e33b88839fee1d436efdcecad08f1f36d8a Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 16:08:38 -0700 Subject: [PATCH 15/18] fmt --- .../common/storage/DatasetFileDocument.scala | 32 +++++++++++-------- .../common/storage/FileResolver.scala | 29 ++++++++++++----- .../common/workflow/LogicalPlan.scala | 4 +++ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala index 8b02580c00d..fa4d740b5c6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/DatasetFileDocument.scala @@ -7,25 +7,31 @@ import org.jooq.types.UInteger import java.io.{File, 
FileOutputStream, InputStream} import java.net.{URI, URLDecoder} import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Paths} +import java.nio.file.{Files, Path, Paths} +import scala.jdk.CollectionConverters.IteratorHasAsScala class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { - // Extract path components and decode them - private val pathParts = uri.getPath - .stripPrefix("/") - .split("/") - .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + // Utility function to parse and decode URI segments into individual components + private def parseUri(uri: URI): (Int, String, Path) = { + val segments = Paths.get(uri.getPath).iterator().asScala.map(_.toString).toArray + if (segments.length < 3) + throw new IllegalArgumentException("URI format is incorrect") - private val did = pathParts(0).toInt - private val datasetVersionHash = pathParts(1) - private val fileRelativePath = Paths.get(pathParts.drop(2).head, pathParts.drop(2).tail: _*) + val did = segments(0).toInt + val datasetVersionHash = URLDecoder.decode(segments(1), StandardCharsets.UTF_8) + val decodedRelativeSegments = + segments.drop(2).map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + val fileRelativePath = Paths.get(decodedRelativeSegments.head, decodedRelativeSegments.tail: _*) + + (did, datasetVersionHash, fileRelativePath) + } + + // Extract components from URI using the utility function + private val (did, datasetVersionHash, fileRelativePath) = parseUri(uri) private var tempFile: Option[File] = None - override def getURI: URI = - throw new UnsupportedOperationException( - "The URI cannot be acquired because the file is not physically located" - ) + override def getURI: URI = uri override def asInputStream(): InputStream = { val datasetAbsolutePath = PathUtils.getDatasetPath(UInteger.valueOf(did)) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala 
b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala index fb4f5b70967..af5eb26a47c 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/storage/FileResolver.scala @@ -8,9 +8,11 @@ import edu.uci.ics.texera.web.model.jooq.generated.tables.User.USER import edu.uci.ics.texera.web.model.jooq.generated.tables.pojos.{Dataset, DatasetVersion} import org.apache.commons.vfs2.FileNotFoundException +import java.io.File import java.net.{URI, URLEncoder} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} +import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.util.{Success, Try} object FileResolver { @@ -68,7 +70,7 @@ object FileResolver { val ownerEmail = pathSegments(0) val datasetName = pathSegments(1) val versionName = pathSegments(2) - val fileRelativePath = Paths.get(pathSegments.drop(3).mkString("/")) + val fileRelativePath = Paths.get(pathSegments.drop(3).head, pathSegments.drop(3).tail: _*) // fetch the dataset and version from DB to get dataset ID and version hash val (dataset, datasetVersion) = @@ -96,15 +98,26 @@ object FileResolver { (dataset, datasetVersion) } - // Construct path as /{did}/{versionHash}/file-path - val encodedPath = - s"/${dataset.getDid.intValue()}/${datasetVersion.getVersionHash}/${fileRelativePath.toString - .split("/") - .map(URLEncoder.encode(_, StandardCharsets.UTF_8)) - .mkString("/")}" + // Convert each segment of fileRelativePath to an encoded String + val encodedFileRelativePath = fileRelativePath + .iterator() + .asScala + .map { segment => + URLEncoder.encode(segment.toString, StandardCharsets.UTF_8) + } + .toArray + + // Prepend did and versionHash to the encoded path segments + val allPathSegments = Array( + dataset.getDid.intValue().toString, + datasetVersion.getVersionHash + ) ++ encodedFileRelativePath + + // Build the format 
/{did}/{versionHash}/{fileRelativePath} + val encodedPath = Paths.get(File.separator, allPathSegments: _*) try { - new URI(DATASET_FILE_URI_SCHEME, null, encodedPath, null) + new URI(DATASET_FILE_URI_SCHEME, "", encodedPath.toString, null) } catch { case e: Exception => throw new FileNotFoundException(s"Dataset file $fileName not found.") diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala index 8af7d60fd05..1574f1cd373 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/LogicalPlan.scala @@ -146,6 +146,10 @@ case class LogicalPlan( .toMap } + /** + * Resolve all user-given filenames for the scan source operators to URIs, and call op.setFileUri to set the URI + * @param errorList if given, put errors encountered during resolving into it + */ def resolveScanSourceOpFileName( errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] ): Unit = { From a6d2cef7a2e0435d0ec871895d9f05dc2288fd0c Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 16:12:55 -0700 Subject: [PATCH 16/18] fix fmt --- .../uci/ics/amber/engine/common/storage/VirtualDocument.scala | 2 -- .../operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala index 7fbd5050556..0fb0c1e7897 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/storage/VirtualDocument.scala @@ -1,7 +1,5 @@ package edu.uci.ics.amber.engine.common.storage -import 
edu.uci.ics.texera.workflow.common.storage.FileResolver.DATASET_FILE_URI_SCHEME - import java.io.{File, InputStream} import java.net.URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 4496fd3fd18..2978cb56992 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -9,7 +9,7 @@ import edu.uci.ics.amber.engine.common.model.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.engine.common.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.engine.common.model.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.engine.common.model.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.engine.common.storage.{DocumentFactory, VirtualDocument} +import edu.uci.ics.amber.engine.common.storage.DocumentFactory import edu.uci.ics.texera.workflow.operators.source.scan.ScanSourceOpDesc import java.io.IOException From 3cd305127ffe754d3698ad8cf094030854358c4a Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 16:59:55 -0700 Subject: [PATCH 17/18] fix test --- .../edu/uci/ics/amber/engine/e2e/TestOperators.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala index 2dda4568e49..cfb382e7544 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala @@ -1,10 +1,7 @@ package edu.uci.ics.amber.engine.e2e -import 
edu.uci.ics.texera.workflow.operators.aggregate.{ - AggregateOpDesc, - AggregationFunction, - AggregationOperation -} +import edu.uci.ics.texera.workflow.common.storage.FileResolver +import edu.uci.ics.texera.workflow.operators.aggregate.{AggregateOpDesc, AggregationFunction, AggregationOperation} import edu.uci.ics.texera.workflow.operators.hashJoin.HashJoinOpDesc import edu.uci.ics.texera.workflow.operators.keywordSearch.KeywordSearchOpDesc import edu.uci.ics.texera.workflow.operators.sink.managed.ProgressiveSinkOpDesc @@ -48,6 +45,7 @@ object TestOperators { csvHeaderlessOp.fileName = Some(fileName) csvHeaderlessOp.customDelimiter = Some(",") csvHeaderlessOp.hasHeader = header + csvHeaderlessOp.setFileUri(FileResolver.resolve(fileName)) csvHeaderlessOp } @@ -56,6 +54,7 @@ object TestOperators { val jsonlOp = new JSONLScanSourceOpDesc jsonlOp.fileName = Some(fileName) jsonlOp.flatten = flatten + jsonlOp.setFileUri(FileResolver.resolve(fileName)) jsonlOp } From 5a75f26e10186704e6de6e648b45051d38d60408 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 29 Oct 2024 17:55:43 -0700 Subject: [PATCH 18/18] fmt --- .../scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala index cfb382e7544..0b891472dcb 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/e2e/TestOperators.scala @@ -1,7 +1,11 @@ package edu.uci.ics.amber.engine.e2e import edu.uci.ics.texera.workflow.common.storage.FileResolver -import edu.uci.ics.texera.workflow.operators.aggregate.{AggregateOpDesc, AggregationFunction, AggregationOperation} +import edu.uci.ics.texera.workflow.operators.aggregate.{ + AggregateOpDesc, + AggregationFunction, + AggregationOperation +} import 
edu.uci.ics.texera.workflow.operators.hashJoin.HashJoinOpDesc import edu.uci.ics.texera.workflow.operators.keywordSearch.KeywordSearchOpDesc import edu.uci.ics.texera.workflow.operators.sink.managed.ProgressiveSinkOpDesc