diff --git a/core/amber/src/main/python/pytexera/__init__.py b/core/amber/src/main/python/pytexera/__init__.py index 28ed737411c..a4a7a9c5ce7 100644 --- a/core/amber/src/main/python/pytexera/__init__.py +++ b/core/amber/src/main/python/pytexera/__init__.py @@ -3,6 +3,7 @@ from typing import Iterator, Optional, Union from pyamber import * +from .storage.dataset_file_document import DatasetFileDocument from .udf.udf_operator import ( UDFOperatorV2, UDFTableOperator, @@ -22,6 +23,7 @@ "UDFTableOperator", "UDFBatchOperator", "UDFSourceOperator", + "DatasetFileDocument", # export external tools to be used "overrides", "logger", diff --git a/core/amber/src/main/python/pytexera/storage/__init__.py b/core/amber/src/main/python/pytexera/storage/__init__.py new file mode 100644 index 00000000000..da91bba69d6 --- /dev/null +++ b/core/amber/src/main/python/pytexera/storage/__init__.py @@ -0,0 +1,3 @@ +from .dataset_file_document import DatasetFileDocument + +__all__ = ["DatasetFileDocument"] diff --git a/core/amber/src/main/python/pytexera/storage/dataset_file_document.py b/core/amber/src/main/python/pytexera/storage/dataset_file_document.py new file mode 100644 index 00000000000..f4496c78b94 --- /dev/null +++ b/core/amber/src/main/python/pytexera/storage/dataset_file_document.py @@ -0,0 +1,81 @@ +import os +import io +import requests +import urllib.parse + + +class DatasetFileDocument: + def __init__(self, file_path: str): + """ + Parses the file path into dataset metadata. + + :param file_path: + Expected format - "/ownerEmail/datasetName/versionName/fileRelativePath" + Example: "/bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv" + """ + parts = file_path.strip("/").split("/") + if len(parts) < 4: + raise ValueError( + "Invalid file path format. " + "Expected: /ownerEmail/datasetName/versionName/fileRelativePath" + ) + + self.owner_email = parts[0] + self.dataset_name = parts[1] + self.version_name = parts[2] + self.file_relative_path = "/".join(parts[3:]) + + self.jwt_token = os.getenv("USER_JWT_TOKEN") + self.presign_endpoint = os.getenv("PRESIGN_API_ENDPOINT") + + if not self.jwt_token: + raise ValueError( + "JWT token is required but not set in environment variables." + ) + if not self.presign_endpoint: + self.presign_endpoint = "http://localhost:9092/api/dataset/presign-download" + + def get_presigned_url(self) -> str: + """ + Requests a presigned URL from the API. + + :return: The presigned URL as a string. + :raises: RuntimeError if the request fails. + """ + headers = {"Authorization": f"Bearer {self.jwt_token}"} + encoded_file_path = urllib.parse.quote( + f"/{self.owner_email}" + f"/{self.dataset_name}" + f"/{self.version_name}" + f"/{self.file_relative_path}" + ) + + params = {"filePath": encoded_file_path} + + response = requests.get(self.presign_endpoint, headers=headers, params=params) + + if response.status_code != 200: + raise RuntimeError( + f"Failed to get presigned URL: " + f"{response.status_code} {response.text}" + ) + + return response.json().get("presignedUrl") + + def read_file(self) -> io.BytesIO: + """ + Reads the file content from the presigned URL. + + :return: A file-like object. + :raises: RuntimeError if the retrieval fails. 
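+
+        Example (hypothetical usage; assumes USER_JWT_TOKEN is set in the environment):
+            doc = DatasetFileDocument(
+                "/bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv"
+            )
+            csv_bytes = doc.read_file().read()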
+ """ + presigned_url = self.get_presigned_url() + response = requests.get(presigned_url) + + if response.status_code != 200: + raise RuntimeError( + f"Failed to retrieve file content: " + f"{response.status_code} {response.text}" + ) + + return io.BytesIO(response.content) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWebApplication.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWebApplication.scala index be2e9691eeb..7c237ad7f90 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWebApplication.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWebApplication.scala @@ -1,6 +1,5 @@ package edu.uci.ics.texera.web -import com.fasterxml.jackson.databind.module.SimpleModule import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.github.dirkraft.dropwizard.fileassets.FileAssetsBundle import com.typesafe.scalalogging.LazyLogging @@ -17,14 +16,7 @@ import edu.uci.ics.texera.web.resource.dashboard.DashboardResource import edu.uci.ics.texera.web.resource.dashboard.admin.execution.AdminExecutionResource import edu.uci.ics.texera.web.resource.dashboard.admin.user.AdminUserResource import edu.uci.ics.texera.web.resource.dashboard.hub.HubResource -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.`type`.{ - DatasetFileNode, - DatasetFileNodeSerializer -} -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.{ - DatasetAccessResource, - DatasetResource -} +import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource import edu.uci.ics.texera.web.resource.dashboard.user.project.{ ProjectAccessResource, ProjectResource, @@ -89,11 +81,6 @@ class TexeraWebApplication bootstrap.addBundle(new WebsocketBundle(classOf[CollaborationResource])) // register scala module to dropwizard default object mapper bootstrap.getObjectMapper.registerModule(DefaultScalaModule) - - // register a new custom module and add the custom serializer into it - val customSerializerModule = new SimpleModule("CustomSerializers") - customSerializerModule.addSerializer(classOf[DatasetFileNode], new DatasetFileNodeSerializer()) - bootstrap.getObjectMapper.registerModule(customSerializerModule) } override def run(configuration: TexeraWebConfiguration, environment: Environment): Unit = { @@ -146,7 +133,6 @@ class TexeraWebApplication environment.jersey.register(classOf[ResultResource]) environment.jersey.register(classOf[HubResource]) environment.jersey.register(classOf[WorkflowVersionResource]) - environment.jersey.register(classOf[DatasetResource]) environment.jersey.register(classOf[DatasetAccessResource]) environment.jersey.register(classOf[ProjectResource]) environment.jersey.register(classOf[ProjectAccessResource]) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala index 27bbedc2247..485c98d9623 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/dashboard/user/dataset/DatasetResource.scala @@ -22,8 +22,7 @@ import edu.uci.ics.texera.dao.jooq.generated.tables.daos.{ import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.{ Dataset, DatasetUserAccess, - DatasetVersion, - User + DatasetVersion } import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetAccessResource._ import 
edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.{context, _} @@ -200,28 +199,6 @@ object DatasetResource { DatasetOperation(filesToAdd.toMap, filesToRemove.toList) } - /** - * Create a new dataset version by adding new files - * @param did the target dataset id - * @param user the user submitting the request - * @param filesToAdd the map containing the files to add - * @return the created dataset version - */ - def createNewDatasetVersionByAddingFiles( - did: Integer, - user: User, - filesToAdd: Map[java.nio.file.Path, InputStream] - ): Option[DashboardDatasetVersion] = { - applyDatasetOperationToCreateNewVersion( - context, - did, - user.getUid, - user.getEmail, - "", - DatasetOperation(filesToAdd, List()) - ) - } - // apply the dataset operation to create a new dataset version // it returns the created dataset version if creation succeed, else return None // concurrency control is performed here: the thread has to have the lock in order to create the new version diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/service/ResultExportService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/service/ResultExportService.scala index f0a16ef37fc..b9998f6f3c9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/service/ResultExportService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/service/ResultExportService.scala @@ -6,18 +6,17 @@ import edu.uci.ics.amber.core.storage.model.VirtualDocument import edu.uci.ics.amber.core.tuple.Tuple import edu.uci.ics.amber.core.virtualidentity.{OperatorIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.PortIdentity -import edu.uci.ics.amber.util.{ArrowUtils, PathUtils} +import edu.uci.ics.amber.util.ArrowUtils import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.User import edu.uci.ics.texera.web.model.websocket.request.ResultExportRequest import edu.uci.ics.texera.web.model.websocket.response.ResultExportResponse -import edu.uci.ics.texera.web.resource.dashboard.user.dataset.DatasetResource.createNewDatasetVersionByAddingFiles import edu.uci.ics.texera.web.resource.dashboard.user.workflow.{ WorkflowExecutionsResource, WorkflowVersionResource } import edu.uci.ics.texera.web.service.WorkflowExecutionService.getLatestExecutionId -import java.io.{FilterOutputStream, IOException, OutputStream, PipedInputStream, PipedOutputStream} +import java.io.{FilterOutputStream, IOException, OutputStream} import java.nio.channels.Channels import java.nio.charset.StandardCharsets import java.time.LocalDateTime @@ -33,6 +32,10 @@ import org.apache.arrow.vector.ipc.ArrowFileWriter import org.apache.commons.lang3.StringUtils import javax.ws.rs.WebApplicationException import javax.ws.rs.core.StreamingOutput +import edu.uci.ics.texera.web.auth.JwtAuth +import edu.uci.ics.texera.web.auth.JwtAuth.{TOKEN_EXPIRE_TIME_IN_DAYS, dayToMin, jwtClaims} + +import java.net.{HttpURLConnection, URL, URLEncoder} /** * A simple wrapper that ignores 'close()' calls on the underlying stream. 
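 * A minimal sketch of the assumed shape (the class body is outside this hunk's context):
 *   class NonClosingOutputStream(out: OutputStream) extends FilterOutputStream(out) {
 *     override def close(): Unit = flush() // ignore close(); keep the HTTP output stream open
 *   }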
@@ -52,6 +55,14 @@ object ResultExportService { // Matches the remote's approach for a thread pool final private val pool: ThreadPoolExecutor = Executors.newFixedThreadPool(3).asInstanceOf[ThreadPoolExecutor] + + lazy val fileServiceUploadOneFileToDatasetEndpoint: String = + sys.env + .getOrElse( + "FILE_SERVICE_UPLOAD_ONE_FILE_TO_DATASET_ENDPOINT", + "http://localhost:9092/api/dataset/did/upload" + ) + .trim } class ResultExportService(workflowIdentity: WorkflowIdentity) { @@ -156,23 +167,22 @@ class ResultExportService(workflowIdentity: WorkflowIdentity) { results: Iterable[Tuple], headers: List[String] ): (Option[String], Option[String]) = { + val fileName = generateFileName(request, operatorId, "csv") try { - val pipedOutputStream = new PipedOutputStream() - val pipedInputStream = new PipedInputStream(pipedOutputStream) - pool.submit(new Runnable { - override def run(): Unit = { - val writer = CSVWriter.open(pipedOutputStream) + saveToDatasets( + request, + user, + outputStream => { + val writer = CSVWriter.open(outputStream) writer.writeRow(headers) results.foreach { tuple => writer.writeRow(tuple.getFields.toIndexedSeq) } writer.close() - } - }) - - val fileName = generateFileName(request, operatorId, "csv") - saveToDatasets(request, user, pipedInputStream, fileName) + }, + fileName + ) (Some(s"CSV export done for operator $operatorId -> file: $fileName"), None) } catch { case ex: Exception => @@ -202,17 +212,15 @@ class ResultExportService(workflowIdentity: WorkflowIdentity) { val field = selectedRow.getField(columnIndex) val dataBytes: Array[Byte] = convertFieldToBytes(field) - val pipedOutputStream = new PipedOutputStream() - val pipedInputStream = new PipedInputStream(pipedOutputStream) - - pool.submit(new Runnable { - override def run(): Unit = { - pipedOutputStream.write(dataBytes) - pipedOutputStream.close() - } - }) - - saveToDatasets(request, user, pipedInputStream, fileName) + saveToDatasets( + request, + user, + outputStream => { + outputStream.write(dataBytes) + outputStream.close() + }, + fileName + ) (Some(s"Data export done for operator $operatorId -> file: $fileName"), None) } catch { case ex: Exception => @@ -242,24 +250,24 @@ class ResultExportService(workflowIdentity: WorkflowIdentity) { } try { - val pipedOutputStream = new PipedOutputStream() - val pipedInputStream = new PipedInputStream(pipedOutputStream) - val allocator = new RootAllocator() - - pool.submit(() => { - Using.Manager { use => - val (writer, root) = createArrowWriter(results, allocator, pipedOutputStream) - use(writer) - use(root) - use(allocator) - use(pipedOutputStream) - - writeArrowData(writer, root, results) - } - }) - val fileName = generateFileName(request, operatorId, "arrow") - saveToDatasets(request, user, pipedInputStream, fileName) + + saveToDatasets( + request, + user, + outputStream => { + val allocator = new RootAllocator() + Using.Manager { use => + val (writer, root) = createArrowWriter(results, allocator, outputStream) + use(writer) + use(root) + use(allocator) + + writeArrowData(writer, root, results) + } + }, + fileName + ) (Some(s"Arrow file export done for operator $operatorId -> file: $fileName"), None) } catch { @@ -333,17 +341,47 @@ class ResultExportService(workflowIdentity: WorkflowIdentity) { private def saveToDatasets( request: ResultExportRequest, user: User, - pipedInputStream: PipedInputStream, + fileWriter: OutputStream => Unit, // Pass function that writes data fileName: String ): Unit = { request.datasetIds.foreach { did => - val datasetPath = 
PathUtils.getDatasetPath(did) - val filePath = datasetPath.resolve(fileName) - createNewDatasetVersionByAddingFiles( - did, - user, - Map(filePath -> pipedInputStream) + val encodedFilePath = URLEncoder.encode(fileName, StandardCharsets.UTF_8.name()) + val message = URLEncoder.encode( + s"Export from workflow ${request.workflowName}", + StandardCharsets.UTF_8.name() ) + + val uploadUrl = s"$fileServiceUploadOneFileToDatasetEndpoint" + .replace("did", did.toString) + s"?filePath=$encodedFilePath&message=$message" + + var connection: HttpURLConnection = null + try { + val url = new URL(uploadUrl) + connection = url.openConnection().asInstanceOf[HttpURLConnection] + connection.setDoOutput(true) + connection.setRequestMethod("POST") + connection.setRequestProperty("Content-Type", "application/octet-stream") + connection.setRequestProperty( + "Authorization", + s"Bearer ${JwtAuth.jwtToken(jwtClaims(user, dayToMin(TOKEN_EXPIRE_TIME_IN_DAYS)))}" + ) + + // Get output stream from connection + val outputStream = connection.getOutputStream + fileWriter(outputStream) // Write directly to HTTP request output stream + outputStream.close() + + // Check response + val responseCode = connection.getResponseCode + if (responseCode != HttpURLConnection.HTTP_OK) { + throw new RuntimeException(s"Failed to upload file. Server responded with: $responseCode") + } + } catch { + case e: Exception => + throw new RuntimeException(s"Error uploading file to dataset $did: ${e.getMessage}", e) + } finally { + if (connection != null) connection.disconnect() + } } } diff --git a/core/build.sbt b/core/build.sbt index 2f0ec2d44fe..74e873ce5a4 100644 --- a/core/build.sbt +++ b/core/build.sbt @@ -3,6 +3,17 @@ lazy val WorkflowCore = (project in file("workflow-core")) .dependsOn(DAO) .configs(Test) .dependsOn(DAO % "test->test") // test scope dependency +lazy val FileService = (project in file("file-service")) + .dependsOn(WorkflowCore) + .settings( + dependencyOverrides ++= Seq( + // override it as io.dropwizard 4 require 2.16.1 or higher + "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.16.1", + "com.fasterxml.jackson.core" % "jackson-databind" % "2.16.1", + "org.glassfish.jersey.core" % "jersey-common" % "3.0.12" + ) + ) + lazy val WorkflowOperator = (project in file("workflow-operator")).dependsOn(WorkflowCore) lazy val WorkflowCompilingService = (project in file("workflow-compiling-service")) .dependsOn(WorkflowOperator) @@ -26,14 +37,17 @@ lazy val WorkflowExecutionService = (project in file("amber")) "org.eclipse.jetty" % "jetty-server" % "9.4.20.v20190813", "org.eclipse.jetty" % "jetty-servlet" % "9.4.20.v20190813", "org.eclipse.jetty" % "jetty-http" % "9.4.20.v20190813", - ) + ), + libraryDependencies ++= Seq( + "com.squareup.okhttp3" % "okhttp" % "4.10.0" force(), // Force usage of OkHttp 4.10.0 + ), ) .configs(Test) .dependsOn(DAO % "test->test") // test scope dependency // root project definition lazy val CoreProject = (project in file(".")) - .aggregate(DAO, WorkflowCore, WorkflowOperator, WorkflowCompilingService, WorkflowExecutionService) + .aggregate(DAO, WorkflowCore, FileService, WorkflowOperator, WorkflowCompilingService, WorkflowExecutionService) .settings( name := "core", version := "0.1.0", diff --git a/core/file-service/build.sbt b/core/file-service/build.sbt new file mode 100644 index 00000000000..984de4d3ba5 --- /dev/null +++ b/core/file-service/build.sbt @@ -0,0 +1,68 @@ +import scala.collection.Seq + +name := "file-service" +organization := "edu.uci.ics" +version := "0.1.0" +scalaVersion 
:= "2.13.12" + +enablePlugins(JavaAppPackaging) + +// Enable semanticdb for Scalafix +ThisBuild / semanticdbEnabled := true +ThisBuild / semanticdbVersion := scalafixSemanticdb.revision + +// Manage dependency conflicts by always using the latest revision +ThisBuild / conflictManager := ConflictManager.latestRevision + +// Restrict parallel execution of tests to avoid conflicts +Global / concurrentRestrictions += Tags.limit(Tags.Test, 1) + +///////////////////////////////////////////////////////////////////////////// +// Compiler Options +///////////////////////////////////////////////////////////////////////////// + +// Scala compiler options +Compile / scalacOptions ++= Seq( + "-Xelide-below", "WARNING", // Turn on optimizations with "WARNING" as the threshold + "-feature", // Check feature warnings + "-deprecation", // Check deprecation warnings + "-Ywarn-unused:imports" // Check for unused imports +) + +///////////////////////////////////////////////////////////////////////////// +// Version Variables +///////////////////////////////////////////////////////////////////////////// + +val dropwizardVersion = "4.0.7" +val mockitoVersion = "5.4.0" +val assertjVersion = "3.24.2" + +///////////////////////////////////////////////////////////////////////////// +// Test-related Dependencies +///////////////////////////////////////////////////////////////////////////// + +libraryDependencies ++= Seq( + "org.scalamock" %% "scalamock" % "5.2.0" % Test, // ScalaMock + "org.scalatest" %% "scalatest" % "3.2.17" % Test, // ScalaTest + "io.dropwizard" % "dropwizard-testing" % dropwizardVersion % Test, // Dropwizard Testing + "org.mockito" % "mockito-core" % mockitoVersion % Test, // Mockito for mocking + "org.assertj" % "assertj-core" % assertjVersion % Test, // AssertJ for assertions + "com.novocode" % "junit-interface" % "0.11" % Test // SBT interface for JUnit +) + +///////////////////////////////////////////////////////////////////////////// +// Dependencies +///////////////////////////////////////////////////////////////////////////// + +// Core Dependencies +libraryDependencies ++= Seq( + "io.dropwizard" % "dropwizard-core" % dropwizardVersion, + "io.dropwizard" % "dropwizard-auth" % dropwizardVersion, // Dropwizard Authentication module + "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.15.2", + "jakarta.ws.rs" % "jakarta.ws.rs-api" % "3.1.0", // Ensure Jakarta JAX-RS API is available + "org.bitbucket.b_c" % "jose4j" % "0.9.6", + "org.playframework" %% "play-json" % "3.1.0-M1", + "software.amazon.awssdk" % "s3" % "2.29.51", + "software.amazon.awssdk" % "auth" % "2.29.51", + "software.amazon.awssdk" % "regions" % "2.29.51", +) diff --git a/core/file-service/src/main/resources/auth-config.yaml b/core/file-service/src/main/resources/auth-config.yaml new file mode 100644 index 00000000000..a9cd08a7b16 --- /dev/null +++ b/core/file-service/src/main/resources/auth-config.yaml @@ -0,0 +1,6 @@ +auth: + jwt: + exp-in-days: 30 + # generate the secret again for each deployment using the following: + # 'openssl rand -hex 16' or 'xxd -l16 -ps /dev/urandom' + 256-bit-secret: "8a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d" \ No newline at end of file diff --git a/core/file-service/src/main/resources/docker-compose.yml b/core/file-service/src/main/resources/docker-compose.yml new file mode 100644 index 00000000000..b50ab5f1a7d --- /dev/null +++ b/core/file-service/src/main/resources/docker-compose.yml @@ -0,0 +1,75 @@ +version: "3.5" +name: texera-lakefs +services: + minio: + image: 
minio/minio:RELEASE.2025-02-28T09-55-16Z + container_name: minio + ports: + - "9000:9000" + - "9001:9001" + environment: + - MINIO_ROOT_USER=texera_minio + - MINIO_ROOT_PASSWORD=password + command: server --console-address ":9001" /data +# Below lines are recommended to uncomment in order to persist your data even if the container dies +# volumes: +# - /path/to/your/local/directory:/data + + postgres: + image: postgres:15 + container_name: postgres + restart: always + environment: + - POSTGRES_DB=texera_lakefs + - POSTGRES_USER=texera_lakefs_admin + - POSTGRES_PASSWORD=password + healthcheck: + test: ["CMD", "pg_isready", "-U", "texera_lakefs_admin"] + interval: 10s + retries: 5 + start_period: 5s +# Ditto +# volumes: +# - /path/to/your/local/directory:/var/lib/postgresql/data + + lakefs: + image: treeverse/lakefs:1.51 + container_name: lakefs + depends_on: + postgres: + condition: service_healthy + minio: + condition: service_started + ports: + - "8000:8000" + environment: + - LAKEFS_BLOCKSTORE_TYPE=s3 + - LAKEFS_BLOCKSTORE_S3_FORCE_PATH_STYLE=true + - LAKEFS_BLOCKSTORE_S3_ENDPOINT=http://minio:9000 + - LAKEFS_BLOCKSTORE_S3_PRE_SIGNED_ENDPOINT=http://localhost:9000 + - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID=texera_minio + - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_SECRET_ACCESS_KEY=password + - LAKEFS_AUTH_ENCRYPT_SECRET_KEY=random_string_for_lakefs + - LAKEFS_LOGGING_LEVEL=INFO + - LAKEFS_STATS_ENABLED=1 + - LAKEFS_DATABASE_TYPE=postgres + - LAKEFS_DATABASE_POSTGRES_CONNECTION_STRING=postgres://texera_lakefs_admin:password@postgres:5432/texera_lakefs?sslmode=disable + - LAKEFS_INSTALLATION_USER_NAME=texera-admin + - LAKEFS_INSTALLATION_ACCESS_KEY_ID=AKIAIOSFOLKFSSAMPLES + - LAKEFS_INSTALLATION_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + entrypoint: ["/bin/sh", "-c"] + command: + - | + lakefs setup --user-name "$$LAKEFS_INSTALLATION_USER_NAME" --access-key-id "$$LAKEFS_INSTALLATION_ACCESS_KEY_ID" --secret-access-key "$$LAKEFS_INSTALLATION_SECRET_ACCESS_KEY" || true + lakefs run & + echo "---- lakeFS Web UI ----" + echo "http://127.0.0.1:8000/" + echo "" + echo "Access Key ID : $$LAKEFS_INSTALLATION_ACCESS_KEY_ID" + echo "Secret Access Key: $$LAKEFS_INSTALLATION_SECRET_ACCESS_KEY" + echo "" + wait + +networks: + default: + name: texera-lakefs \ No newline at end of file diff --git a/core/file-service/src/main/resources/file-service-web-config.yaml b/core/file-service/src/main/resources/file-service-web-config.yaml new file mode 100644 index 00000000000..01396a8910e --- /dev/null +++ b/core/file-service/src/main/resources/file-service-web-config.yaml @@ -0,0 +1,22 @@ +server: + applicationConnectors: + - type: http + port: 9092 + adminConnectors: [] + +logging: + level: INFO + loggers: + "io.dropwizard": INFO + appenders: + - type: console + - type: file + currentLogFilename: log/file-service.log + threshold: ALL + queueSize: 512 + discardingThreshold: 0 + archive: true + archivedLogFilenamePattern: log/file-service-%d{yyyy-MM-dd}.log.gz + archivedFileCount: 7 + bufferSize: 8KiB + immediateFlush: true \ No newline at end of file diff --git a/core/file-service/src/main/resources/minio-config.yml b/core/file-service/src/main/resources/minio-config.yml new file mode 100644 index 00000000000..a4ee5aace8c --- /dev/null +++ b/core/file-service/src/main/resources/minio-config.yml @@ -0,0 +1,15 @@ +version: '3.8' + +services: + minio: + image: minio/minio:latest + container_name: minio + ports: + - "9500:9000" # MinIO API + - "9501:9001" # MinIO Console UI + environment: + - 
MINIO_ROOT_USER=texera_minio + - MINIO_ROOT_PASSWORD=password + volumes: + - /Users/baijiadong/Desktop/chenlab/texera/core/file-service/src/main/user-resources/minio:/data + command: server --console-address ":9001" /data \ No newline at end of file diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/FileService.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/FileService.scala new file mode 100644 index 00000000000..de9a40727be --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/FileService.scala @@ -0,0 +1,75 @@ +package edu.uci.ics.texera.service + +import com.fasterxml.jackson.databind.module.SimpleModule +import io.dropwizard.core.Application +import io.dropwizard.core.setup.{Bootstrap, Environment} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import edu.uci.ics.amber.core.storage.StorageConfig +import edu.uci.ics.amber.core.storage.util.LakeFSStorageClient +import edu.uci.ics.amber.util.PathUtils.fileServicePath +import edu.uci.ics.texera.dao.SqlServer +import edu.uci.ics.texera.service.`type`.DatasetFileNode +import edu.uci.ics.texera.service.`type`.serde.DatasetFileNodeSerializer +import edu.uci.ics.texera.service.auth.{JwtAuthFilter, SessionUser} +import edu.uci.ics.texera.service.resource.{DatasetAccessResource, DatasetResource} +import edu.uci.ics.texera.service.util.{S3StorageClient} +import io.dropwizard.auth.AuthDynamicFeature +import org.eclipse.jetty.server.session.SessionHandler + +class FileService extends Application[FileServiceConfiguration] { + override def initialize(bootstrap: Bootstrap[FileServiceConfiguration]): Unit = { + // Register Scala module to Dropwizard default object mapper + bootstrap.getObjectMapper.registerModule(DefaultScalaModule) + + // register a new custom module just for DatasetFileNode serde/deserde + val customSerializerModule = new SimpleModule("CustomSerializers") + customSerializerModule.addSerializer(classOf[DatasetFileNode], new DatasetFileNodeSerializer()) + bootstrap.getObjectMapper.registerModule(customSerializerModule) + } + + override def run(configuration: FileServiceConfiguration, environment: Environment): Unit = { + // Serve backend at /api + environment.jersey.setUrlPattern("/api/*") + SqlServer.initConnection( + StorageConfig.jdbcUrl, + StorageConfig.jdbcUsername, + StorageConfig.jdbcPassword + ) + + // check if the texera dataset bucket exists, if not create it + S3StorageClient.createBucketIfNotExist(StorageConfig.lakefsBlockStorageBucketName) + // check if we can connect to the lakeFS service + LakeFSStorageClient.healthCheck() + + environment.jersey.register(classOf[SessionHandler]) + environment.servlets.setSessionHandler(new SessionHandler) + + // Register JWT authentication filter + environment.jersey.register(new AuthDynamicFeature(classOf[JwtAuthFilter])) + + // Enable @Auth annotation for injecting SessionUser + environment.jersey.register( + new io.dropwizard.auth.AuthValueFactoryProvider.Binder(classOf[SessionUser]) + ) + + // Register multipart feature for file uploads + environment.jersey.register(classOf[DatasetResource]) + environment.jersey.register(classOf[DatasetAccessResource]) + } +} + +object FileService { + def main(args: Array[String]): Unit = { + // Set the configuration file's path + val configFilePath = fileServicePath + .resolve("src") + .resolve("main") + .resolve("resources") + .resolve("file-service-web-config.yaml") + .toAbsolutePath + .toString + + // Start the Dropwizard application + new 
FileService().run("server", configFilePath) + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/FileServiceConfiguration.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/FileServiceConfiguration.scala new file mode 100644 index 00000000000..be3a700ce97 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/FileServiceConfiguration.scala @@ -0,0 +1,5 @@ +package edu.uci.ics.texera.service + +import io.dropwizard.core.Configuration + +class FileServiceConfiguration extends Configuration {} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/AuthConfig.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/AuthConfig.scala new file mode 100644 index 00000000000..2e6e78192ed --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/AuthConfig.scala @@ -0,0 +1,34 @@ +package edu.uci.ics.texera.service.auth + +import org.yaml.snakeyaml.Yaml + +import java.util.{Map => JMap} +import scala.jdk.CollectionConverters._ + +object AuthConfig { + private val conf: Map[String, Any] = { + val yaml = new Yaml() + val inputStream = getClass.getClassLoader.getResourceAsStream("auth-config.yaml") + val javaConf = yaml.load(inputStream).asInstanceOf[JMap[String, Any]].asScala.toMap + + val authMap = javaConf("auth").asInstanceOf[JMap[String, Any]].asScala.toMap + val jwtMap = authMap("jwt").asInstanceOf[JMap[String, Any]].asScala.toMap + + javaConf.updated( + "auth", + authMap.updated("jwt", jwtMap) + ) + } + + // Read JWT expiration time + val jwtExpirationDays: Int = conf("auth") + .asInstanceOf[Map[String, Any]]("jwt") + .asInstanceOf[Map[String, Any]]("exp-in-days") + .asInstanceOf[Int] + + // Read JWT secret key + val jwtSecretKey: String = conf("auth") + .asInstanceOf[Map[String, Any]]("jwt") + .asInstanceOf[Map[String, Any]]("256-bit-secret") + .asInstanceOf[String] +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/JwtAuthFilter.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/JwtAuthFilter.scala new file mode 100644 index 00000000000..a6a153ea9bc --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/JwtAuthFilter.scala @@ -0,0 +1,40 @@ +package edu.uci.ics.texera.service.auth + +import jakarta.ws.rs.container.{ContainerRequestContext, ContainerRequestFilter} +import jakarta.ws.rs.core.{HttpHeaders, SecurityContext} +import jakarta.ws.rs.ext.Provider +import jakarta.ws.rs.container.ResourceInfo +import jakarta.ws.rs.core.Context + +import java.security.Principal +import com.typesafe.scalalogging.LazyLogging +import edu.uci.ics.texera.dao.jooq.generated.enums.UserRoleEnum + +@Provider +class JwtAuthFilter extends ContainerRequestFilter with LazyLogging { + + @Context + private var resourceInfo: ResourceInfo = _ + + override def filter(requestContext: ContainerRequestContext): Unit = { + val authHeader = requestContext.getHeaderString(HttpHeaders.AUTHORIZATION) + + if (authHeader != null && authHeader.startsWith("Bearer ")) { + val token = authHeader.substring(7) // Remove "Bearer " prefix + val userOpt = JwtParser.parseToken(token) + + if (userOpt.isPresent) { + val user = userOpt.get() + requestContext.setSecurityContext(new SecurityContext { + override def getUserPrincipal: Principal = user + override def isUserInRole(role: String): Boolean = + user.isRoleOf(UserRoleEnum.valueOf(role)) + override def isSecure: Boolean = false + override def getAuthenticationScheme: String = 
"Bearer" + }) + } else { + logger.warn("Invalid JWT: Unable to parse token") + } + } + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/JwtParser.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/JwtParser.scala new file mode 100644 index 00000000000..97732a51928 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/JwtParser.scala @@ -0,0 +1,58 @@ +package edu.uci.ics.texera.service.auth + +import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.User +import org.jose4j.jwt.consumer.{JwtConsumer, JwtConsumerBuilder} +import org.jose4j.keys.HmacKey +import org.jose4j.lang.UnresolvableKeyException +import com.typesafe.scalalogging.LazyLogging +import edu.uci.ics.texera.dao.jooq.generated.enums.UserRoleEnum +import org.jose4j.jwt.JwtClaims + +import java.nio.charset.StandardCharsets +import java.util.Optional + +object JwtParser extends LazyLogging { + + private val TOKEN_SECRET = AuthConfig.jwtSecretKey.toLowerCase() match { + case "random" => getRandomHexString + case _ => AuthConfig.jwtSecretKey + } + + private val jwtConsumer: JwtConsumer = new JwtConsumerBuilder() + .setAllowedClockSkewInSeconds(30) + .setRequireExpirationTime() + .setRequireSubject() + .setVerificationKey(new HmacKey(TOKEN_SECRET.getBytes(StandardCharsets.UTF_8))) + .setRelaxVerificationKeyValidation() + .build() + + def parseToken(token: String): Optional[SessionUser] = { + try { + val jwtClaims: JwtClaims = jwtConsumer.processToClaims(token) + val userName = jwtClaims.getSubject + val email = jwtClaims.getClaimValue("email", classOf[String]) + val userId = jwtClaims.getClaimValue("userId").asInstanceOf[Long].toInt + val role = UserRoleEnum.valueOf(jwtClaims.getClaimValue("role").asInstanceOf[String]) + val googleId = jwtClaims.getClaimValue("googleId", classOf[String]) + + val user = new User(userId, userName, email, null, googleId, null, role) + Optional.of(new SessionUser(user)) + } catch { + case _: UnresolvableKeyException => + logger.error("Invalid JWT Signature") + Optional.empty() + case e: Exception => + logger.error(s"Failed to parse JWT: ${e.getMessage}") + Optional.empty() + } + } + + private def getRandomHexString: String = { + val bytes = 32 + val r = new scala.util.Random() + val sb = new StringBuilder + while (sb.length < bytes) + sb.append(Integer.toHexString(r.nextInt())) + sb.toString.substring(0, bytes) + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/SessionUser.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/SessionUser.scala new file mode 100644 index 00000000000..79a05eccbda --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/auth/SessionUser.scala @@ -0,0 +1,20 @@ +package edu.uci.ics.texera.service.auth + +import edu.uci.ics.texera.dao.jooq.generated.enums.UserRoleEnum +import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.User + +import java.security.Principal + +class SessionUser(val user: User) extends Principal { + def getUser: User = user + + override def getName: String = user.getName + + def getUid: Integer = user.getUid + + def getEmail: String = user.getEmail + + def getGoogleId: String = user.getGoogleId + + def isRoleOf(role: UserRoleEnum): Boolean = user.getRole == role +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/resource/DatasetAccessResource.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/resource/DatasetAccessResource.scala new file mode 100644 
index 00000000000..3242a260267 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/resource/DatasetAccessResource.scala @@ -0,0 +1,192 @@ +package edu.uci.ics.texera.service.resource + +import edu.uci.ics.texera.dao.SqlServer +import edu.uci.ics.texera.dao.SqlServer.withTransaction +import edu.uci.ics.texera.dao.jooq.generated.Tables.USER +import edu.uci.ics.texera.dao.jooq.generated.enums.PrivilegeEnum +import edu.uci.ics.texera.dao.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS +import edu.uci.ics.texera.dao.jooq.generated.tables.daos.{DatasetDao, DatasetUserAccessDao, UserDao} +import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.{DatasetUserAccess, User} +import edu.uci.ics.texera.service.resource.DatasetAccessResource.{AccessEntry, context, getOwner} +import org.jooq.{DSLContext, EnumType} + +import java.util +import javax.annotation.security.RolesAllowed +import javax.ws.rs._ +import javax.ws.rs.core.{MediaType, Response} + +object DatasetAccessResource { + private lazy val context: DSLContext = SqlServer + .getInstance() + .createDSLContext() + + def isDatasetPublic(ctx: DSLContext, did: Integer): Boolean = { + val datasetDao = new DatasetDao(ctx.configuration()) + Option(datasetDao.fetchOneByDid(did)) + .flatMap(dataset => Option(dataset.getIsPublic)) + .contains(true) + } + + def userHasReadAccess(ctx: DSLContext, did: Integer, uid: Integer): Boolean = { + isDatasetPublic(ctx, did) || + userHasWriteAccess(ctx, did, uid) || + getDatasetUserAccessPrivilege(ctx, did, uid) == PrivilegeEnum.READ + } + + def userOwnDataset(ctx: DSLContext, did: Integer, uid: Integer): Boolean = { + val datasetDao = new DatasetDao(ctx.configuration()) + + Option(datasetDao.fetchOneByDid(did)) + .exists(_.getOwnerUid == uid) + } + + def userHasWriteAccess(ctx: DSLContext, did: Integer, uid: Integer): Boolean = { + userOwnDataset(ctx, did, uid) || + getDatasetUserAccessPrivilege(ctx, did, uid) == PrivilegeEnum.WRITE + } + + def getDatasetUserAccessPrivilege( + ctx: DSLContext, + did: Integer, + uid: Integer + ): PrivilegeEnum = { + Option( + ctx + .select(DATASET_USER_ACCESS.PRIVILEGE) + .from(DATASET_USER_ACCESS) + .where( + DATASET_USER_ACCESS.DID + .eq(did) + .and(DATASET_USER_ACCESS.UID.eq(uid)) + ) + .fetchOneInto(classOf[PrivilegeEnum]) + ).getOrElse(PrivilegeEnum.NONE) + } + + def getOwner(ctx: DSLContext, did: Integer): User = { + val datasetDao = new DatasetDao(ctx.configuration()) + val userDao = new UserDao(ctx.configuration()) + + Option(datasetDao.fetchOneByDid(did)) + .flatMap(dataset => Option(dataset.getOwnerUid)) + .map(ownerUid => userDao.fetchOneByUid(ownerUid)) + .orNull + } + + case class AccessEntry(email: String, name: String, privilege: EnumType) {} + +} + +@Produces(Array(MediaType.APPLICATION_JSON)) +@RolesAllowed(Array("REGULAR", "ADMIN")) +@Path("/access/dataset") +class DatasetAccessResource { + + /** + * This method returns the owner of a dataset + * + * @param did , dataset id + * @return ownerEmail, the owner's email + */ + @GET + @Path("/owner/{did}") + def getOwnerEmailOfDataset(@PathParam("did") did: Integer): String = { + var email = "" + withTransaction(context) { ctx => + val owner = getOwner(ctx, did) + if (owner != null) { + email = owner.getEmail + } + } + email + } + + /** + * Returns information about all current shared access of the given dataset + * + * @param did dataset id + * @return a List of email/name/permission + */ + @GET + @Path("/list/{did}") + def getAccessList( + @PathParam("did") did: Integer + ): 
util.List[AccessEntry] = { + withTransaction(context) { ctx => + val datasetDao = new DatasetDao(ctx.configuration()) + ctx + .select( + USER.EMAIL, + USER.NAME, + DATASET_USER_ACCESS.PRIVILEGE + ) + .from(DATASET_USER_ACCESS) + .join(USER) + .on(USER.UID.eq(DATASET_USER_ACCESS.UID)) + .where( + DATASET_USER_ACCESS.DID + .eq(did) + .and(DATASET_USER_ACCESS.UID.notEqual(datasetDao.fetchOneByDid(did).getOwnerUid)) + ) + .fetchInto(classOf[AccessEntry]) + } + } + + /** + * This method shares a dataset to a user with a specific access type + * + * @param did the given dataset + * @param email the email which the access is given to + * @param privilege the type of Access given to the target user + * @return rejection if user not permitted to share the workflow or Success Message + */ + @PUT + @Path("/grant/{did}/{email}/{privilege}") + def grantAccess( + @PathParam("did") did: Integer, + @PathParam("email") email: String, + @PathParam("privilege") privilege: String + ): Response = { + withTransaction(context) { ctx => + val datasetUserAccessDao = new DatasetUserAccessDao(ctx.configuration()) + val userDao = new UserDao(ctx.configuration()) + datasetUserAccessDao.merge( + new DatasetUserAccess( + did, + userDao.fetchOneByEmail(email).getUid, + PrivilegeEnum.valueOf(privilege) + ) + ) + Response.ok().build() + } + } + + /** + * This method revoke the user's access of the given dataset + * + * @param did the given dataset + * @param email the email of the use whose access is about to be removed + * @return message indicating a success message + */ + @DELETE + @Path("/revoke/{did}/{email}") + def revokeAccess( + @PathParam("did") did: Integer, + @PathParam("email") email: String + ): Response = { + withTransaction(context) { ctx => + val userDao = new UserDao(ctx.configuration()) + + ctx + .delete(DATASET_USER_ACCESS) + .where( + DATASET_USER_ACCESS.UID + .eq(userDao.fetchOneByEmail(email).getUid) + .and(DATASET_USER_ACCESS.DID.eq(did)) + ) + .execute() + + Response.ok().build() + } + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/resource/DatasetResource.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/resource/DatasetResource.scala new file mode 100644 index 00000000000..0ae21b13341 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/resource/DatasetResource.scala @@ -0,0 +1,1006 @@ +package edu.uci.ics.texera.service.resource + +import edu.uci.ics.amber.core.storage.model.OnDataset +import edu.uci.ics.amber.core.storage.util.LakeFSStorageClient +import edu.uci.ics.amber.core.storage.{DocumentFactory, FileResolver, StorageConfig} +import edu.uci.ics.texera.dao.SqlServer +import edu.uci.ics.texera.dao.SqlServer.withTransaction +import edu.uci.ics.texera.dao.jooq.generated.enums.PrivilegeEnum +import edu.uci.ics.texera.dao.jooq.generated.tables.User.USER +import edu.uci.ics.texera.dao.jooq.generated.tables.Dataset.DATASET +import edu.uci.ics.texera.dao.jooq.generated.tables.DatasetUserAccess.DATASET_USER_ACCESS +import edu.uci.ics.texera.dao.jooq.generated.tables.DatasetVersion.DATASET_VERSION +import edu.uci.ics.texera.dao.jooq.generated.tables.daos.{ + DatasetDao, + DatasetUserAccessDao, + DatasetVersionDao +} +import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.{ + Dataset, + DatasetUserAccess, + DatasetVersion +} +import edu.uci.ics.texera.service.`type`.DatasetFileNode +import edu.uci.ics.texera.service.auth.SessionUser +import edu.uci.ics.texera.service.resource.DatasetAccessResource.{ + 
getDatasetUserAccessPrivilege, + getOwner, + isDatasetPublic, + userHasReadAccess, + userHasWriteAccess, + userOwnDataset +} +import edu.uci.ics.texera.service.resource.DatasetResource.{ + CreateDatasetRequest, + DashboardDataset, + DashboardDatasetVersion, + DatasetDescriptionModification, + DatasetVersionRootFileNodesResponse, + Diff, + context, + getDatasetByID, + getDatasetVersionByID, + getLatestDatasetVersion +} +import edu.uci.ics.texera.service.util.S3StorageClient +import io.dropwizard.auth.Auth +import jakarta.annotation.security.RolesAllowed +import jakarta.ws.rs._ +import jakarta.ws.rs.core.{MediaType, Response, StreamingOutput} +import org.jooq.{DSLContext, EnumType} + +import java.io.{InputStream, OutputStream} +import java.net.URLDecoder +import java.nio.charset.StandardCharsets +import java.util.Optional +import scala.collection.mutable.ListBuffer +import scala.jdk.CollectionConverters._ +import scala.jdk.OptionConverters._ + +object DatasetResource { + private val context = SqlServer + .getInstance() + .createDSLContext() + + /** + * Helper function to get the dataset from DB using did + */ + private def getDatasetByID(ctx: DSLContext, did: Integer): Dataset = { + val datasetDao = new DatasetDao(ctx.configuration()) + val dataset = datasetDao.fetchOneByDid(did) + if (dataset == null) { + throw new NotFoundException(f"Dataset $did not found") + } + dataset + } + + /** + * Helper function to get the dataset version from DB using dvid + */ + private def getDatasetVersionByID( + ctx: DSLContext, + dvid: Integer + ): DatasetVersion = { + val datasetVersionDao = new DatasetVersionDao(ctx.configuration()) + val version = datasetVersionDao.fetchOneByDvid(dvid) + if (version == null) { + throw new NotFoundException("Dataset Version not found") + } + version + } + + /** + * Helper function to get the latest dataset version from the DB + */ + private def getLatestDatasetVersion( + ctx: DSLContext, + did: Integer + ): Option[DatasetVersion] = { + ctx + .selectFrom(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(did)) + .orderBy(DATASET_VERSION.CREATION_TIME.desc()) + .limit(1) + .fetchOptionalInto(classOf[DatasetVersion]) + .toScala + } + + case class DashboardDataset( + dataset: Dataset, + ownerEmail: String, + accessPrivilege: EnumType, + isOwner: Boolean + ) + case class DashboardDatasetVersion( + datasetVersion: DatasetVersion, + fileNodes: List[DatasetFileNode] + ) + + case class CreateDatasetRequest( + datasetName: String, + datasetDescription: String, + isDatasetPublic: Boolean + ) + + case class Diff( + path: String, + pathType: String, + diffType: String, // "added", "removed", "changed", etc. + sizeBytes: Option[Long] // Size of the changed file (None for directories) + ) + + case class DatasetDescriptionModification(did: Integer, description: String) + + case class DatasetVersionRootFileNodesResponse( + fileNodes: List[DatasetFileNode], + size: Long + ) +} + +@Produces(Array(MediaType.APPLICATION_JSON, "image/jpeg", "application/pdf")) +@Path("/dataset") +class DatasetResource { + private val ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE = "User has no access to this dataset" + private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" + private val ERR_DATASET_CREATION_FAILED_MESSAGE = + "Dataset creation is failed. 
Please make sure to upload files in order to create the initial version of dataset" + + /** + * Helper function to get the dataset from DB with additional information including user access privilege and owner email + */ + private def getDashboardDataset( + ctx: DSLContext, + did: Integer, + requesterUid: Option[Integer] + ): DashboardDataset = { + val targetDataset = getDatasetByID(ctx, did) + if (requesterUid.isDefined && !userHasReadAccess(ctx, did, requesterUid.get)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + val userAccessPrivilege = getDatasetUserAccessPrivilege(ctx, did, requesterUid.get) + + DashboardDataset( + targetDataset, + getOwner(ctx, did).getEmail, + userAccessPrivilege, + targetDataset.getOwnerUid == requesterUid.get + ) + } + + @POST + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/create") + @Consumes(Array(MediaType.APPLICATION_JSON)) + def createDataset( + request: CreateDatasetRequest, + @Auth user: SessionUser + ): DashboardDataset = { + + withTransaction(context) { ctx => + val uid = user.getUid + val datasetDao: DatasetDao = new DatasetDao(ctx.configuration()) + val datasetUserAccessDao: DatasetUserAccessDao = new DatasetUserAccessDao(ctx.configuration()) + + val datasetName = request.datasetName + val datasetDescription = request.datasetDescription + val isDatasetPublic = request.isDatasetPublic + + // Check if a dataset with the same name already exists + if (!datasetDao.fetchByName(datasetName).isEmpty) { + throw new BadRequestException("Dataset with the same name already exists") + } + + // Initialize the repository in LakeFS + try { + LakeFSStorageClient.initRepo(datasetName) + } catch { + case e: Exception => + throw new WebApplicationException( + s"Failed to create the dataset: ${e.getMessage}" + ) + } + + // Insert the dataset into the database + val dataset = new Dataset() + dataset.setName(datasetName) + dataset.setDescription(datasetDescription) + dataset.setIsPublic(isDatasetPublic) + dataset.setOwnerUid(uid) + + val createdDataset = ctx + .insertInto(DATASET) + .set(ctx.newRecord(DATASET, dataset)) + .returning() + .fetchOne() + + // Insert the requester as the WRITE access user for this dataset + val datasetUserAccess = new DatasetUserAccess() + datasetUserAccess.setDid(createdDataset.getDid) + datasetUserAccess.setUid(uid) + datasetUserAccess.setPrivilege(PrivilegeEnum.WRITE) + datasetUserAccessDao.insert(datasetUserAccess) + + DashboardDataset( + new Dataset( + createdDataset.getDid, + createdDataset.getOwnerUid, + createdDataset.getName, + createdDataset.getIsPublic, + createdDataset.getDescription, + createdDataset.getCreationTime + ), + user.getEmail, + PrivilegeEnum.WRITE, + isOwner = true + ) + } + } + + @POST + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/version/create") + @Consumes(Array(MediaType.TEXT_PLAIN)) + def createDatasetVersion( + versionName: String, + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDatasetVersion = { + val uid = user.getUid + withTransaction(context) { ctx => + if (!userHasWriteAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + val dataset = getDatasetByID(ctx, did) + val datasetName = dataset.getName + + // Check if there are any changes in LakeFS before creating a new version + val diffs = LakeFSStorageClient.retrieveUncommittedObjects(repoName = datasetName) + + if (diffs.isEmpty) { + throw new WebApplicationException( + "No changes detected in dataset. 
Version creation aborted.", + Response.Status.BAD_REQUEST + ) + } + + // Generate a new version name + val versionCount = ctx + .selectCount() + .from(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(did)) + .fetchOne(0, classOf[Int]) + + val sanitizedVersionName = Option(versionName).filter(_.nonEmpty).getOrElse("") + val newVersionName = if (sanitizedVersionName.isEmpty) { + s"v${versionCount + 1}" + } else { + s"v${versionCount + 1} - $sanitizedVersionName" + } + + // Create a commit in LakeFS + val commit = LakeFSStorageClient.createCommit( + repoName = datasetName, + branch = "main", + commitMessage = s"Created dataset version: $newVersionName" + ) + + if (commit == null || commit.getId == null) { + throw new WebApplicationException( + "Failed to create commit in LakeFS. Version creation aborted.", + Response.Status.INTERNAL_SERVER_ERROR + ) + } + + // Create a new dataset version entry in the database + val datasetVersion = new DatasetVersion() + datasetVersion.setDid(did) + datasetVersion.setCreatorUid(uid) + datasetVersion.setName(newVersionName) + datasetVersion.setVersionHash(commit.getId) // Store LakeFS version hash + + val insertedVersion = ctx + .insertInto(DATASET_VERSION) + .set(ctx.newRecord(DATASET_VERSION, datasetVersion)) + .returning() + .fetchOne() + .into(classOf[DatasetVersion]) + + // Retrieve committed file structure + val fileNodes = LakeFSStorageClient.retrieveObjectsOfVersion(datasetName, commit.getId) + + DashboardDatasetVersion( + insertedVersion, + DatasetFileNode + .fromLakeFSRepositoryCommittedObjects( + Map((user.getEmail, datasetName, newVersionName) -> fileNodes) + ) + ) + } + } + + @DELETE + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}") + def deleteDataset(@PathParam("did") did: Integer, @Auth user: SessionUser): Response = { + val uid = user.getUid + withTransaction(context) { ctx => + val datasetDao = new DatasetDao(ctx.configuration()) + val dataset = getDatasetByID(ctx, did) + if (!userOwnDataset(ctx, dataset.getDid, uid)) { + // throw the exception that user has no access to certain dataset + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + try { + LakeFSStorageClient.deleteRepo(dataset.getName) + } catch { + case e: Exception => + throw new WebApplicationException( + s"Failed to delete a repository in LakeFS: ${e.getMessage}", + e + ) + } + + // delete the directory on S3 + if ( + S3StorageClient.directoryExists(StorageConfig.lakefsBlockStorageBucketName, dataset.getName) + ) { + S3StorageClient.deleteDirectory(StorageConfig.lakefsBlockStorageBucketName, dataset.getName) + } + + // delete the dataset from the DB + datasetDao.deleteById(dataset.getDid) + + Response.ok().build() + } + } + + @POST + @Consumes(Array(MediaType.APPLICATION_JSON)) + @Produces(Array(MediaType.APPLICATION_JSON)) + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/update/description") + def updateDatasetDescription( + modificator: DatasetDescriptionModification, + @Auth sessionUser: SessionUser + ): Response = { + withTransaction(context) { ctx => + val uid = sessionUser.getUid + val datasetDao = new DatasetDao(ctx.configuration()) + val dataset = getDatasetByID(ctx, modificator.did) + if (!userHasWriteAccess(ctx, modificator.did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + dataset.setDescription(modificator.description) + datasetDao.update(dataset) + Response.ok().build() + } + } + + @POST + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/upload") + 
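+  // Example request (hypothetical file name and workflow name; ResultExportService.saveToDatasets
+  // is the in-code caller of this endpoint):
+  //   POST /api/dataset/{did}/upload?filePath=result.csv&message=Export%20from%20workflow%20w1
+  //   Authorization: Bearer <user JWT>
+  //   Content-Type: application/octet-stream
+  //   <raw file bytes as the request body>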
@Consumes(Array(MediaType.APPLICATION_OCTET_STREAM)) + def uploadOneFileToDataset( + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @QueryParam("message") message: String, + fileStream: InputStream, + @Auth user: SessionUser + ): Response = { + val uid = user.getUid + + withTransaction(context) { ctx => + // Verify the user has write access + if (!userHasWriteAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + // Retrieve dataset name + val dataset = getDatasetByID(ctx, did) + val datasetName = dataset.getName + // Decode file path + val filePath = URLDecoder.decode(encodedFilePath, StandardCharsets.UTF_8.name()) + // TODO: in the future consider using multipart to upload this stream more faster + LakeFSStorageClient.writeFileToRepo(datasetName, filePath, fileStream) + Response.ok(Map("message" -> "File uploaded successfully")).build() + } + } + + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/presign-download") + def getPresignedUrl( + @QueryParam("filePath") encodedUrl: String, + @QueryParam("datasetName") datasetName: String, + @QueryParam("commitHash") commitHash: String, + @Auth user: SessionUser + ): Response = { + val uid = user.getUid + val decodedPathStr = URLDecoder.decode(encodedUrl, StandardCharsets.UTF_8.name()) + + (Option(datasetName), Option(commitHash)) match { + case (Some(_), None) | (None, Some(_)) => + // Case 1: Only one parameter is provided (error case) + Response + .status(Response.Status.BAD_REQUEST) + .entity( + "Both datasetName and commitHash must be provided together, or neither should be provided." + ) + .build() + + case (Some(dsName), Some(commit)) => + // Case 2: datasetName and commitHash are provided, validate access + withTransaction(context) { ctx => + val datasetDao = new DatasetDao(ctx.configuration()) + val datasets = datasetDao.fetchByName(dsName).asScala.toList + + if (datasets.isEmpty || !userHasReadAccess(ctx, datasets.head.getDid, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + val url = LakeFSStorageClient.getFilePresignedUrl(dsName, commit, decodedPathStr) + Response.ok(Map("presignedUrl" -> url)).build() + } + + case (None, None) => + // Case 3: Neither datasetName nor commitHash are provided, resolve normally + withTransaction(context) { ctx => + val fileUri = FileResolver.resolve(decodedPathStr) + val document = DocumentFactory.openReadonlyDocument(fileUri).asInstanceOf[OnDataset] + val datasetDao = new DatasetDao(ctx.configuration()) + val datasets = datasetDao.fetchByName(document.getDatasetName()).asScala.toList + + if (datasets.isEmpty || !userHasReadAccess(ctx, datasets.head.getDid, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + Response + .ok( + Map( + "presignedUrl" -> LakeFSStorageClient.getFilePresignedUrl( + document.getDatasetName(), + document.getVersionHash(), + document.getFileRelativePath() + ) + ) + ) + .build() + } + } + } + + @DELETE + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/file") + @Consumes(Array(MediaType.APPLICATION_JSON)) + def deleteDatasetFile( + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @Auth user: SessionUser + ): Response = { + val uid = user.getUid + withTransaction(context) { ctx => + if (!userHasWriteAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + val datasetName = getDatasetByID(ctx, did).getName + + // 
Decode the file path + val filePath = URLDecoder.decode(encodedFilePath, StandardCharsets.UTF_8.name()) + // Try to initialize the repository in LakeFS + try { + LakeFSStorageClient.deleteObject(datasetName, filePath) + } catch { + case e: Exception => + throw new WebApplicationException( + s"Failed to delete the file from repo in LakeFS: ${e.getMessage}" + ) + } + + Response.ok().build() + } + } + + @POST + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/multipart-upload") + @Consumes(Array(MediaType.APPLICATION_JSON)) + def multipartUpload( + @QueryParam("datasetName") datasetName: String, + @QueryParam("type") operationType: String, + @QueryParam("filePath") encodedUrl: String, + @QueryParam("uploadId") uploadId: Optional[String], + @QueryParam("numParts") numParts: Optional[Integer], + payload: Map[ + String, + Any + ], // Expecting {"parts": [...], "physicalAddress": "s3://bucket/path"} + @Auth user: SessionUser + ): Response = { + val uid = user.getUid + + withTransaction(context) { ctx => + val datasetDao = new DatasetDao(ctx.configuration()) + val datasets = datasetDao.fetchByName(datasetName).asScala.toList + if (datasets.isEmpty || !userHasWriteAccess(ctx, datasets.head.getDid, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + // Decode the file path + val filePath = URLDecoder.decode(encodedUrl, StandardCharsets.UTF_8.name()) + + operationType.toLowerCase match { + case "init" => + val numPartsValue = numParts.toScala.getOrElse( + throw new BadRequestException("numParts is required for initialization") + ) + + val presignedResponse = LakeFSStorageClient.initiatePresignedMultipartUploads( + datasetName, + filePath, + numPartsValue + ) + Response + .ok( + Map( + "uploadId" -> presignedResponse.getUploadId, + "presignedUrls" -> presignedResponse.getPresignedUrls, + "physicalAddress" -> presignedResponse.getPhysicalAddress + ) + ) + .build() + + case "finish" => + val uploadIdValue = uploadId.toScala.getOrElse( + throw new BadRequestException("uploadId is required for completion") + ) + + // Extract parts from payload + val partsList = payload.get("parts") match { + case Some(parts: List[Map[String, Any]]) => // Fix: Accept `Any` type for mixed values + parts.map { part => + val partNumber = part("PartNumber") match { + case i: Int => i + case s: String => s.toInt + case _ => throw new BadRequestException("Invalid PartNumber format") + } + val eTag = part("ETag") match { + case s: String => s + case _ => throw new BadRequestException("Invalid ETag format") + } + (partNumber, eTag) + } + case _ => throw new BadRequestException("Missing or invalid parts data for completion") + } + + // Extract physical address from payload + val physicalAddress = payload.get("physicalAddress") match { + case Some(address: String) => address + case _ => throw new BadRequestException("Missing physicalAddress in payload") + } + + // Complete the multipart upload with parts and physical address + val objectStats = LakeFSStorageClient.completePresignedMultipartUploads( + datasetName, + filePath, + uploadIdValue, + partsList, + physicalAddress + ) + + Response + .ok( + Map( + "message" -> "Multipart upload completed successfully", + "filePath" -> objectStats.getPath() + ) + ) + .build() + + case "abort" => + val uploadIdValue = uploadId.toScala.getOrElse( + throw new BadRequestException("uploadId is required for abortion") + ) + + // Extract physical address from payload + val physicalAddress = payload.get("physicalAddress") match { + case Some(address: String) => 
address + case _ => throw new BadRequestException("Missing physicalAddress in payload") + } + + // Abort the multipart upload + LakeFSStorageClient.abortPresignedMultipartUploads( + datasetName, + filePath, + uploadIdValue, + physicalAddress + ) + + Response.ok(Map("message" -> "Multipart upload aborted successfully")).build() + + case _ => + throw new BadRequestException("Invalid type parameter. Use 'init', 'finish', or 'abort'.") + } + } + } + + @POST + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/update/publicity") + def toggleDatasetPublicity( + @PathParam("did") did: Integer, + @Auth sessionUser: SessionUser + ): Response = { + withTransaction(context) { ctx => + val datasetDao = new DatasetDao(ctx.configuration()) + val uid = sessionUser.getUid + + if (!userHasWriteAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + val existedDataset = getDatasetByID(ctx, did) + existedDataset.setIsPublic(!existedDataset.getIsPublic) + + datasetDao.update(existedDataset) + Response.ok().build() + } + } + + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/diff") + def getDatasetDiff( + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): List[Diff] = { + val uid = user.getUid + withTransaction(context) { ctx => + if (!userHasReadAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + // Retrieve staged (uncommitted) changes from LakeFS + val dataset = getDatasetByID(ctx, did) + val lakefsDiffs = LakeFSStorageClient.retrieveUncommittedObjects(dataset.getName) + + // Convert LakeFS Diff objects to our custom Diff case class + lakefsDiffs.map(d => + new Diff( + d.getPath, + d.getPathType.getValue, + d.getType.getValue, + Option(d.getSizeBytes).map(_.longValue()) + ) + ) + } + } + + @PUT + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/diff") + @Consumes(Array(MediaType.APPLICATION_JSON)) + def resetDatasetFileDiff( + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @Auth user: SessionUser + ): Response = { + val uid = user.getUid + withTransaction(context) { ctx => + if (!userHasWriteAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + val datasetName = getDatasetByID(ctx, did).getName + + // Decode the file path + val filePath = URLDecoder.decode(encodedFilePath, StandardCharsets.UTF_8.name()) + // Try to reset the file change in LakeFS + try { + LakeFSStorageClient.resetObjectUploadOrDeletion(datasetName, filePath) + } catch { + case e: Exception => + throw new WebApplicationException( + s"Failed to reset the changes from repo in LakeFS: ${e.getMessage}" + ) + } + Response.ok().build() + } + } + + /** + * This method returns a list of DashboardDatasets objects that are accessible by current user. 
+ * + * @param user the session user + * @return list of user accessible DashboardDataset objects + */ + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/list") + def listDatasets( + @Auth user: SessionUser + ): List[DashboardDataset] = { + val uid = user.getUid + withTransaction(context)(ctx => { + var accessibleDatasets: ListBuffer[DashboardDataset] = ListBuffer() + // first fetch all datasets user have explicit access to + accessibleDatasets = ListBuffer.from( + ctx + .select() + .from( + DATASET + .leftJoin(DATASET_USER_ACCESS) + .on(DATASET_USER_ACCESS.DID.eq(DATASET.DID)) + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + ) + .where(DATASET_USER_ACCESS.UID.eq(uid)) + .fetch() + .map(record => { + val dataset = record.into(DATASET).into(classOf[Dataset]) + val datasetAccess = record.into(DATASET_USER_ACCESS).into(classOf[DatasetUserAccess]) + val ownerEmail = record.into(USER).getEmail + DashboardDataset( + isOwner = dataset.getOwnerUid == uid, + dataset = dataset, + accessPrivilege = datasetAccess.getPrivilege, + ownerEmail = ownerEmail + ) + }) + .asScala + ) + + // then we fetch the public datasets and merge it as a part of the result if not exist + val publicDatasets = ctx + .select() + .from( + DATASET + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + ) + .where(DATASET.IS_PUBLIC.eq(true)) + .fetch() + .map(record => { + val dataset = record.into(DATASET).into(classOf[Dataset]) + val ownerEmail = record.into(USER).getEmail + DashboardDataset( + isOwner = false, + dataset = dataset, + accessPrivilege = PrivilegeEnum.READ, + ownerEmail = ownerEmail + ) + }) + publicDatasets.forEach { publicDataset => + if (!accessibleDatasets.exists(_.dataset.getDid == publicDataset.dataset.getDid)) { + val dashboardDataset = DashboardDataset( + isOwner = false, + dataset = publicDataset.dataset, + ownerEmail = publicDataset.ownerEmail, + accessPrivilege = PrivilegeEnum.READ + ) + accessibleDatasets = accessibleDatasets :+ dashboardDataset + } + } + + accessibleDatasets.toList + }) + } + + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/version/list") + def getDatasetVersionList( + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): List[DatasetVersion] = { + val uid = user.getUid + withTransaction(context)(ctx => { + val dataset = getDatasetByID(ctx, did) + if (!userHasReadAccess(ctx, dataset.getDid, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + fetchDatasetVersions(ctx, dataset.getDid) + }) + } + + // TODO: change did to name + @GET + @Path("/{name}/publicVersion/list") + def getPublicDatasetVersionList( + @PathParam("name") did: Integer + ): List[DatasetVersion] = { + withTransaction(context)(ctx => { + if (!isDatasetPublic(ctx, did)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + fetchDatasetVersions(ctx, did) + }) + } + + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/version/latest") + def retrieveLatestDatasetVersion( + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDatasetVersion = { + val uid = user.getUid + withTransaction(context)(ctx => { + if (!userHasReadAccess(ctx, did, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + val dataset = getDatasetByID(ctx, did) + val latestVersion = getLatestDatasetVersion(ctx, did).getOrElse( + throw new NotFoundException(ERR_DATASET_VERSION_NOT_FOUND_MESSAGE) + ) + + val ownerNode = DatasetFileNode + .fromLakeFSRepositoryCommittedObjects( + Map( + 
(user.getEmail, dataset.getName, latestVersion.getName) -> + LakeFSStorageClient + .retrieveObjectsOfVersion(dataset.getName, latestVersion.getVersionHash) + ) + ) + .head + + DashboardDatasetVersion( + latestVersion, + ownerNode.children.get + .find(_.getName == dataset.getName) + .head + .children + .get + .find(_.getName == latestVersion.getName) + .head + .children + .get + ) + }) + } + + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}/version/{dvid}/rootFileNodes") + def retrieveDatasetVersionRootFileNodes( + @PathParam("did") did: Integer, + @PathParam("dvid") dvid: Integer, + @Auth user: SessionUser + ): DatasetVersionRootFileNodesResponse = { + val uid = user.getUid + withTransaction(context)(ctx => fetchDatasetVersionRootFileNodes(ctx, did, dvid, Some(uid))) + } + + @GET + @Path("/{did}/publicVersion/{dvid}/rootFileNodes") + def retrievePublicDatasetVersionRootFileNodes( + @PathParam("did") did: Integer, + @PathParam("dvid") dvid: Integer + ): DatasetVersionRootFileNodesResponse = { + withTransaction(context)(ctx => fetchDatasetVersionRootFileNodes(ctx, did, dvid, None)) + } + + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{did}") + def getDataset( + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDataset = { + val uid = user.getUid + withTransaction(context)(ctx => getDashboardDataset(ctx, did, Some(uid))) + } + + @GET + @Path("/public/{did}") + def getPublicDataset( + @PathParam("did") did: Integer + ): DashboardDataset = { + withTransaction(context)(ctx => getDashboardDataset(ctx, did, None)) + } + + @GET + @Path("/file") + def retrieveDatasetSingleFile( + @QueryParam("path") pathStr: String + ): Response = { + val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) + + withTransaction(context)(ctx => { + val fileUri = FileResolver.resolve(decodedPathStr) + val streamingOutput = new StreamingOutput() { + override def write(output: OutputStream): Unit = { + val inputStream = DocumentFactory.openReadonlyDocument(fileUri).asInputStream() + try { + val buffer = new Array[Byte](8192) // buffer size + var bytesRead = inputStream.read(buffer) + while (bytesRead != -1) { + output.write(buffer, 0, bytesRead) + bytesRead = inputStream.read(buffer) + } + } finally { + inputStream.close() + } + } + } + + val contentType = decodedPathStr.split("\\.").lastOption.map(_.toLowerCase) match { + case Some("jpg") | Some("jpeg") => "image/jpeg" + case Some("png") => "image/png" + case Some("csv") => "text/csv" + case Some("md") => "text/markdown" + case Some("txt") => "text/plain" + case Some("html") | Some("htm") => "text/html" + case Some("json") => "application/json" + case Some("pdf") => "application/pdf" + case Some("doc") | Some("docx") => "application/msword" + case Some("xls") | Some("xlsx") => "application/vnd.ms-excel" + case Some("ppt") | Some("pptx") => "application/vnd.ms-powerpoint" + case Some("mp4") => "video/mp4" + case Some("mp3") => "audio/mpeg" + case _ => "application/octet-stream" // default binary format + } + + Response.ok(streamingOutput).`type`(contentType).build() + }) + } + + @GET + @Path("/datasetUserAccess") + def datasetUserAccess( + @QueryParam("did") did: Integer + ): java.util.List[Integer] = { + val records = context + .select(DATASET_USER_ACCESS.UID) + .from(DATASET_USER_ACCESS) + .where(DATASET_USER_ACCESS.DID.eq(did)) + .fetch() + + records.getValues(DATASET_USER_ACCESS.UID) + } + + private def fetchDatasetVersions(ctx: DSLContext, did: Integer): List[DatasetVersion] = { + ctx + 
.selectFrom(DATASET_VERSION) + .where(DATASET_VERSION.DID.eq(did)) + .orderBy(DATASET_VERSION.CREATION_TIME.desc()) // Change to .asc() for ascending order + .fetchInto(classOf[DatasetVersion]) + .asScala + .toList + } + + private def fetchDatasetVersionRootFileNodes( + ctx: DSLContext, + did: Integer, + dvid: Integer, + uid: Option[Integer] + ): DatasetVersionRootFileNodesResponse = { + val dataset = getDashboardDataset(ctx, did, uid) + val datasetVersion = getDatasetVersionByID(ctx, dvid) + val datasetName = dataset.dataset.getName + + val ownerFileNode = DatasetFileNode + .fromLakeFSRepositoryCommittedObjects( + Map( + (dataset.ownerEmail, datasetName, datasetVersion.getName) -> LakeFSStorageClient + .retrieveObjectsOfVersion(datasetName, datasetVersion.getVersionHash) + ) + ) + .head + + DatasetVersionRootFileNodesResponse( + ownerFileNode.children.get + .find(_.getName == datasetName) + .head + .children + .get + .find(_.getName == datasetVersion.getName) + .head + .children + .get, + DatasetFileNode.calculateTotalSize(List(ownerFileNode)) + ) + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/type/dataset/DatasetFileNode.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/type/dataset/DatasetFileNode.scala new file mode 100644 index 00000000000..10d89f8e235 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/type/dataset/DatasetFileNode.scala @@ -0,0 +1,228 @@ +package edu.uci.ics.texera.service.`type` + +import edu.uci.ics.amber.core.storage.util.dataset.PhysicalFileNode +import io.lakefs.clients.sdk.model.ObjectStats + +import java.util +import scala.collection.mutable + +// DatasetFileNode represents a unique file in dataset, its full path is in the format of: +// /ownerEmail/datasetName/versionName/fileRelativePath +// e.g. /bob@texera.com/twitterDataset/v1/california/irvine/tw1.csv +// ownerName is bob@texera.com; datasetName is twitterDataset, versionName is v1, fileRelativePath is california/irvine/tw1.csv +class DatasetFileNode( + val name: String, // direct name of this node + val nodeType: String, // "file" or "directory" + val parent: DatasetFileNode, // the parent node + val ownerEmail: String, + val size: Option[Long] = None, // size of the file in bytes, None if directory + var children: Option[List[DatasetFileNode]] = None // Only populated if 'type' is 'directory' +) { + + // Ensure the type is either "file" or "directory" + require(nodeType == "file" || nodeType == "directory", "type must be 'file' or 'directory'") + + // Getters for the parameters + def getName: String = name + + def getNodeType: String = nodeType + + def getParent: DatasetFileNode = parent + + def getOwnerEmail: String = ownerEmail + + def getSize: Option[Long] = size + + def getChildren: List[DatasetFileNode] = children.getOrElse(List()) + + // Method to get the full file path + def getFilePath: String = { + val pathComponents = new mutable.ArrayBuffer[String]() + var currentNode: DatasetFileNode = this + while (currentNode != null) { + if (currentNode.parent != null) { // Skip the root node to avoid double slashes + pathComponents.prepend(currentNode.name) + } + currentNode = currentNode.parent + } + "/" + pathComponents.mkString("/") + } +} + +object DatasetFileNode { + + /** + * Converts a map of LakeFS committed objects into a structured dataset file node tree. + * + * @param map A mapping from `(ownerEmail, datasetName, versionName)` to a list of committed objects. + * @return A list of root-level dataset file nodes. 
+ */ + def fromLakeFSRepositoryCommittedObjects( + map: Map[(String, String, String), List[ObjectStats]] + ): List[DatasetFileNode] = { + val rootNode = new DatasetFileNode("/", "directory", null, "") + + // Owner level nodes map + val ownerNodes = mutable.Map[String, DatasetFileNode]() + + map.foreach { + case ((ownerEmail, datasetName, versionName), objects) => + val ownerNode = ownerNodes.getOrElseUpdate( + ownerEmail, { + val newNode = new DatasetFileNode(ownerEmail, "directory", rootNode, ownerEmail) + rootNode.children = Some(rootNode.getChildren :+ newNode) + newNode + } + ) + + val datasetNode = ownerNode.getChildren.find(_.getName == datasetName).getOrElse { + val newNode = new DatasetFileNode(datasetName, "directory", ownerNode, ownerEmail) + ownerNode.children = Some(ownerNode.getChildren :+ newNode) + newNode + } + + val versionNode = datasetNode.getChildren.find(_.getName == versionName).getOrElse { + val newNode = new DatasetFileNode(versionName, "directory", datasetNode, ownerEmail) + datasetNode.children = Some(datasetNode.getChildren :+ newNode) + newNode + } + + // Directory map for efficient lookups + val directoryMap = mutable.Map[String, DatasetFileNode]() + directoryMap("") = versionNode // Root of the dataset version + + // Process each object (file or directory) from LakeFS + objects.foreach { obj => + val pathParts = obj.getPath.split("/").toList + var currentPath = "" + var parentNode: DatasetFileNode = versionNode + + pathParts.foreach { part => + currentPath = if (currentPath.isEmpty) part else s"$currentPath/$part" + + val isFile = pathParts.last == part + val nodeType = if (isFile) "file" else "directory" + val fileSize = if (isFile) Some(obj.getSizeBytes.longValue()) else None + + val existingNode = directoryMap.get(currentPath) + + val node = existingNode.getOrElse { + val newNode = new DatasetFileNode(part, nodeType, parentNode, ownerEmail, fileSize) + parentNode.children = Some(parentNode.getChildren :+ newNode) + if (!isFile) directoryMap(currentPath) = newNode + newNode + } + + parentNode = node // Move parent reference deeper for next iteration + } + } + } + + // Sorting function to sort children of a node alphabetically in descending order + def sortChildren(node: DatasetFileNode): Unit = { + node.children = Some(node.getChildren.sortBy(_.getName)(Ordering.String.reverse)) + node.getChildren.foreach(sortChildren) + } + + // Apply the sorting to the root node + sortChildren(rootNode) + + rootNode.getChildren + } + + def fromPhysicalFileNodes( + map: Map[(String, String, String), List[PhysicalFileNode]] + ): List[DatasetFileNode] = { + val rootNode = new DatasetFileNode("/", "directory", null, "") + val ownerNodes = mutable.Map[String, DatasetFileNode]() + + map.foreach { + case ((ownerEmail, datasetName, versionName), physicalNodes) => + val ownerNode = ownerNodes.getOrElseUpdate( + ownerEmail, { + val newNode = new DatasetFileNode(ownerEmail, "directory", rootNode, ownerEmail) + rootNode.children = Some(rootNode.getChildren :+ newNode) + newNode + } + ) + + val datasetNode = ownerNode.getChildren.find(_.getName == datasetName).getOrElse { + val newNode = new DatasetFileNode(datasetName, "directory", ownerNode, ownerEmail) + ownerNode.children = Some(ownerNode.getChildren :+ newNode) + newNode + } + + val versionNode = datasetNode.getChildren.find(_.getName == versionName).getOrElse { + val newNode = new DatasetFileNode(versionName, "directory", datasetNode, ownerEmail) + datasetNode.children = Some(datasetNode.getChildren :+ newNode) + newNode + } + + 
physicalNodes.foreach(node => addNodeToTree(versionNode, node, ownerEmail)) + } + + // Sorting function to sort children of a node alphabetically in descending order + def sortChildren(node: DatasetFileNode): Unit = { + node.children = Some(node.getChildren.sortBy(_.getName)(Ordering.String.reverse)) + node.getChildren.foreach(sortChildren) + } + + // Apply the sorting to the root node + sortChildren(rootNode) + + rootNode.getChildren + } + + private def addNodeToTree( + parentNode: DatasetFileNode, + physicalNode: PhysicalFileNode, + ownerEmail: String + ): Unit = { + val queue = new util.LinkedList[(DatasetFileNode, PhysicalFileNode)]() + queue.add((parentNode, physicalNode)) + + while (!queue.isEmpty) { + val (currentParent, currentPhysicalNode) = queue.poll() + val relativePath = currentPhysicalNode.getRelativePath.toString.split("/").toList + val nodeName = relativePath.last + + val fileType = + if (currentPhysicalNode.isDirectory) "directory" else "file" + val fileSize = + if (fileType == "file") Some(currentPhysicalNode.getSize) else None + val existingNode = currentParent.getChildren.find(child => + child.getName == nodeName && child.getNodeType == fileType + ) + val fileNode = existingNode.getOrElse { + val newNode = new DatasetFileNode( + nodeName, + fileType, + currentParent, + ownerEmail, + fileSize + ) + currentParent.children = Some(currentParent.getChildren :+ newNode) + newNode + } + + // Add children of the current physical node to the queue + currentPhysicalNode.getChildren.forEach(child => queue.add((fileNode, child))) + } + } + + /** + * Traverses a given list of DatasetFileNode and returns the total size of all files. + * + * @param nodes List of root-level DatasetFileNode. + * @return Total size in bytes. + */ + def calculateTotalSize(nodes: List[DatasetFileNode]): Long = { + def traverse(node: DatasetFileNode): Long = { + val fileSize = node.getSize.getOrElse(0L) + val childrenSize = node.getChildren.map(traverse).sum + fileSize + childrenSize + } + + nodes.map(traverse).sum + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/type/serde/DatasetFileNodeSerializer.java b/core/file-service/src/main/scala/edu/uci/ics/texera/service/type/serde/DatasetFileNodeSerializer.java new file mode 100644 index 00000000000..2d2da04fb14 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/type/serde/DatasetFileNodeSerializer.java @@ -0,0 +1,44 @@ +package edu.uci.ics.texera.service.type.serde; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import edu.uci.ics.texera.service.type.DatasetFileNode; +import scala.collection.JavaConverters; +import scala.collection.immutable.List; + +import java.io.IOException; + +// this class is used to serialize the FileNode as JSON. So that FileNodes can be inspected by the frontend through JSON. 
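+// Illustrative only (not part of this PR): for the sample path used in DatasetFileNode's
+// class comment, a committed file would serialize roughly as
+//   {"name": "tw1.csv", "type": "file",
+//    "parentDir": "/bob@texera.com/twitterDataset/v1/california/irvine",
+//    "ownerEmail": "bob@texera.com", "size": 1024}
+// while directory nodes omit "size" and instead carry a "children" array of the same shape.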
+public class DatasetFileNodeSerializer extends StdSerializer { + + public DatasetFileNodeSerializer() { + this(null); + } + + public DatasetFileNodeSerializer(Class t) { + super(t); + } + + @Override + public void serialize(DatasetFileNode value, JsonGenerator gen, SerializerProvider provider) throws IOException { + gen.writeStartObject(); + gen.writeStringField("name", value.getName()); + gen.writeStringField("type", value.getNodeType()); + gen.writeStringField("parentDir", value.getParent().getFilePath()); + gen.writeStringField("ownerEmail", value.getOwnerEmail()); + if (value.getNodeType().equals("file")) { + gen.writeObjectField("size", value.getSize()); + } + if (value.getNodeType().equals("directory")) { + gen.writeFieldName("children"); + gen.writeStartArray(); + List children = value.getChildren(); + for (DatasetFileNode child : JavaConverters.seqAsJavaList(children)) { + serialize(child, gen, provider); // Recursively serialize children + } + gen.writeEndArray(); + } + gen.writeEndObject(); + } +} diff --git a/core/file-service/src/main/scala/edu/uci/ics/texera/service/util/S3StorageClient.scala b/core/file-service/src/main/scala/edu/uci/ics/texera/service/util/S3StorageClient.scala new file mode 100644 index 00000000000..de1e70bbc76 --- /dev/null +++ b/core/file-service/src/main/scala/edu/uci/ics/texera/service/util/S3StorageClient.scala @@ -0,0 +1,120 @@ +package edu.uci.ics.texera.service.util + +import edu.uci.ics.amber.core.storage.StorageConfig +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.s3.{S3Client, S3Configuration} +import software.amazon.awssdk.services.s3.model._ + +import java.security.MessageDigest +import scala.jdk.CollectionConverters._ + +/** + * S3Storage provides an abstraction for S3-compatible storage (e.g., MinIO). + * - Uses credentials and endpoint from StorageConfig. + * - Supports object upload, download, listing, and deletion. + */ +object S3StorageClient { + // Initialize MinIO-compatible S3 Client + private lazy val s3Client: S3Client = { + val credentials = AwsBasicCredentials.create(StorageConfig.s3Username, StorageConfig.s3Password) + S3Client + .builder() + .credentialsProvider(StaticCredentialsProvider.create(credentials)) + .region(Region.of(StorageConfig.s3Region)) + .endpointOverride(java.net.URI.create(StorageConfig.s3Endpoint)) // MinIO URL + .serviceConfiguration( + S3Configuration.builder().pathStyleAccessEnabled(true).build() + ) + .build() + } + + /** + * Checks if a directory (prefix) exists within an S3 bucket. + * + * @param bucketName The bucket name. + * @param directoryPrefix The directory (prefix) to check (must end with `/`). + * @return True if the directory contains at least one object, False otherwise. + */ + def directoryExists(bucketName: String, directoryPrefix: String): Boolean = { + // Ensure the prefix ends with `/` to correctly match directories + val normalizedPrefix = + if (directoryPrefix.endsWith("/")) directoryPrefix else directoryPrefix + "/" + + val listRequest = ListObjectsV2Request + .builder() + .bucket(bucketName) + .prefix(normalizedPrefix) + .maxKeys(1) // Only check if at least one object exists + .build() + + val listResponse = s3Client.listObjectsV2(listRequest) + !listResponse.contents().isEmpty // If contents exist, directory exists + } + + /** + * Creates an S3 bucket if it does not already exist. + * + * @param bucketName The name of the bucket to create. 
+ */ + def createBucketIfNotExist(bucketName: String): Unit = { + try { + // Check if the bucket already exists + s3Client.headBucket(HeadBucketRequest.builder().bucket(bucketName).build()) + } catch { + case _: NoSuchBucketException | _: S3Exception => + // If the bucket does not exist, create it + val createBucketRequest = CreateBucketRequest.builder().bucket(bucketName).build() + s3Client.createBucket(createBucketRequest) + println(s"Bucket '$bucketName' created successfully.") + } + } + + /** + * Deletes a directory (all objects under a given prefix) from a bucket. + * + * @param bucketName Target S3/MinIO bucket. + * @param directoryPrefix The directory to delete (must end with `/`). + */ + def deleteDirectory(bucketName: String, directoryPrefix: String): Unit = { + // Ensure the directory prefix ends with `/` to avoid accidental deletions + val prefix = if (directoryPrefix.endsWith("/")) directoryPrefix else directoryPrefix + "/" + + // List objects under the given prefix + val listRequest = ListObjectsV2Request + .builder() + .bucket(bucketName) + .prefix(prefix) + .build() + + val listResponse = s3Client.listObjectsV2(listRequest) + + // Extract object keys + val objectKeys = listResponse.contents().asScala.map(_.key()) + + if (objectKeys.nonEmpty) { + val objectsToDelete = + objectKeys.map(key => ObjectIdentifier.builder().key(key).build()).asJava + + val deleteRequest = Delete + .builder() + .objects(objectsToDelete) + .build() + + // Compute MD5 checksum for MinIO if required + val md5Hash = MessageDigest + .getInstance("MD5") + .digest(deleteRequest.toString.getBytes("UTF-8")) + + // Convert object keys to S3 DeleteObjectsRequest format + val deleteObjectsRequest = DeleteObjectsRequest + .builder() + .bucket(bucketName) + .delete(deleteRequest) + .build() + + // Perform batch deletion + s3Client.deleteObjects(deleteObjectsRequest) + } + } +} diff --git a/core/gui/package.json b/core/gui/package.json index 59a9aa1b645..20b02f092b6 100644 --- a/core/gui/package.json +++ b/core/gui/package.json @@ -80,7 +80,6 @@ "read-excel-file": "5.7.1", "ring-buffer-ts": "1.0.3", "rxjs": "7.8.1", - "sanitize-filename": "1.6.3", "tinyqueue": "2.0.3", "ts-proto": "2.2.0", "tslib": "2.3.1", diff --git a/core/gui/proxy.config.json b/core/gui/proxy.config.json index 29ce58446f0..813650d2973 100755 --- a/core/gui/proxy.config.json +++ b/core/gui/proxy.config.json @@ -4,6 +4,11 @@ "secure": false, "changeOrigin": true }, + "/api/dataset": { + "target": "http://localhost:9092", + "secure": false, + "changeOrigin": true + }, "/api": { "target": "http://localhost:8080", "secure": false, diff --git a/core/gui/src/app/app.module.ts b/core/gui/src/app/app.module.ts index 38f59a986a1..dd5a27f80ee 100644 --- a/core/gui/src/app/app.module.ts +++ b/core/gui/src/app/app.module.ts @@ -142,6 +142,10 @@ import { SocialLoginModule, SocialAuthServiceConfig, GoogleSigninButtonModule } import { GoogleLoginProvider } from "@abacritt/angularx-social-login"; import { lastValueFrom } from "rxjs"; import { HubSearchResultComponent } from "./hub/component/hub-search-result/hub-search-result.component"; +import { UserDatasetStagedObjectsListComponent } from "./dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component"; +import { NzEmptyModule } from "ng-zorro-antd/empty"; +import { NzDividerModule } from "ng-zorro-antd/divider"; +import { NzProgressModule } from "ng-zorro-antd/progress"; registerLocaleData(en); @@ -201,6 +205,7 @@ 
registerLocaleData(en); UserDatasetVersionFiletreeComponent, UserDatasetListItemComponent, UserDatasetFileRendererComponent, + UserDatasetStagedObjectsListComponent, NzModalCommentBoxComponent, LeftPanelComponent, LocalLoginComponent, @@ -291,6 +296,9 @@ registerLocaleData(en); TreeModule, SocialLoginModule, GoogleSigninButtonModule, + NzEmptyModule, + NzDividerModule, + NzProgressModule, ], providers: [ provideNzI18n(en_US), diff --git a/core/gui/src/app/common/type/dataset-staged-object.ts b/core/gui/src/app/common/type/dataset-staged-object.ts new file mode 100644 index 00000000000..1d3d16dd16e --- /dev/null +++ b/core/gui/src/app/common/type/dataset-staged-object.ts @@ -0,0 +1,7 @@ +// Represents a staged dataset object change, corresponding to backend Diff +export interface DatasetStagedObject { + path: string; + pathType: "file" | "directory"; + diffType: "added" | "removed" | "changed"; + sizeBytes?: number; // Optional, only present for files +} diff --git a/core/gui/src/app/common/type/dataset.ts b/core/gui/src/app/common/type/dataset.ts index be53032944f..195f76d6719 100644 --- a/core/gui/src/app/common/type/dataset.ts +++ b/core/gui/src/app/common/type/dataset.ts @@ -14,7 +14,7 @@ export interface Dataset { did: number | undefined; ownerUid: number | undefined; name: string; - isPublic: number; + isPublic: boolean; storagePath: string | undefined; description: string; creationTime: number | undefined; diff --git a/core/gui/src/app/common/type/datasetVersionFileTree.ts b/core/gui/src/app/common/type/datasetVersionFileTree.ts index f0cef32e12f..39509d6eefc 100644 --- a/core/gui/src/app/common/type/datasetVersionFileTree.ts +++ b/core/gui/src/app/common/type/datasetVersionFileTree.ts @@ -11,6 +11,22 @@ export function getFullPathFromDatasetFileNode(node: DatasetFileNode): string { return `${node.parentDir}/${node.name}`; } +/** + * Returns the relative path of a DatasetFileNode by stripping the first three segments. + * @param node The DatasetFileNode whose relative path is needed. + * @returns The relative path (without the first three segments and without a leading slash). + */ +export function getRelativePathFromDatasetFileNode(node: DatasetFileNode): string { + const fullPath = getFullPathFromDatasetFileNode(node); // Get the full path + const pathSegments = fullPath.split("/").filter(segment => segment.length > 0); // Split and remove empty segments + + if (pathSegments.length <= 3) { + return ""; // If there are 3 or fewer segments, return an empty string (no relative path exists) + } + + return pathSegments.slice(3).join("/"); // Join remaining segments as the relative path +} + export function getPathsUnderOrEqualDatasetFileNode(node: DatasetFileNode): string[] { // Helper function to recursively gather paths const gatherPaths = (node: DatasetFileNode): string[] => { diff --git a/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.html b/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.html index eb0f9f37edb..d558295c8cf 100644 --- a/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.html +++ b/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.html @@ -1,6 +1,6 @@
-
-

Previous Uploads

- - -
-
-

New Uploads

- -
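Reviewer note: with the "Previous Uploads" tree removed, the uploader above only emits the selected `FileUploadItem`s; the actual transfer goes through `DatasetService.multipartUpload` and the `multipart-upload` endpoint added to `DatasetResource` earlier in this diff. A minimal sketch of that three-phase contract, assuming `fetch`, a 16 MiB part size, and CORS exposing the S3 `ETag` header (this is not the real service code):

```typescript
// Response shape taken from the Scala handler's "init" branch; everything else here is assumed.
interface InitResponse {
  uploadId: string;
  presignedUrls: string[];   // one presigned PUT URL per part
  physicalAddress: string;   // s3://bucket/path, echoed back on finish/abort
}

async function uploadFileInParts(datasetName: string, filePath: string, file: File): Promise<void> {
  const partSize = 16 * 1024 * 1024; // assumed part size
  const numParts = Math.max(1, Math.ceil(file.size / partSize));
  const query = (type: string, extra = "") =>
    `/api/dataset/multipart-upload?type=${type}` +
    `&datasetName=${encodeURIComponent(datasetName)}` +
    `&filePath=${encodeURIComponent(filePath)}${extra}`;

  // 1) init: the backend asks LakeFS for presigned part URLs
  const initResp = await fetch(query("init", `&numParts=${numParts}`), {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: "{}",
  });
  const init: InitResponse = await initResp.json();

  // 2) upload each chunk straight to object storage; S3 returns each part's ETag header
  const parts: { PartNumber: number; ETag: string }[] = [];
  for (let i = 0; i < numParts; i++) {
    const chunk = file.slice(i * partSize, (i + 1) * partSize);
    const putResp = await fetch(init.presignedUrls[i], { method: "PUT", body: chunk });
    parts.push({ PartNumber: i + 1, ETag: putResp.headers.get("ETag") ?? "" });
  }

  // 3) finish: hand the collected parts and the physical address back to the backend
  await fetch(query("finish", `&uploadId=${encodeURIComponent(init.uploadId)}`), {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ parts, physicalAddress: init.physicalAddress }),
  });
}
```

An `abort` request uses the same query parameters plus `uploadId`, with a payload carrying only `physicalAddress`, matching the handler's third branch.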
diff --git a/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.ts b/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.ts index c97f1ae9438..272601508fb 100644 --- a/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.ts +++ b/core/gui/src/app/dashboard/component/user/files-uploader/files-uploader.component.ts @@ -16,17 +16,11 @@ import { NotificationService } from "../../../../common/service/notification/not }) export class FilesUploaderComponent { @Input() - previouslyUploadFiles: DatasetFileNode[] | undefined; - previouslyUploadFilesManager: DatasetVersionFileTreeManager | undefined; + showUploadAlert: boolean = false; @Output() uploadedFiles = new EventEmitter(); - // - @Output() - removingFilePaths = new EventEmitter(); - newUploadNodeToFileItems: Map = new Map(); - newUploadFileTreeManager: DatasetVersionFileTreeManager = new DatasetVersionFileTreeManager(); newUploadFileTreeNodes: DatasetFileNode[] = []; fileUploadingFinished: boolean = false; @@ -87,9 +81,9 @@ export class FilesUploaderComponent { .filter((item): item is FileUploadItem => item !== null); if (successfulUploads.length > 0) { - successfulUploads.forEach(fileUploadItem => { - this.addFileToNewUploadsFileTree(fileUploadItem.name, fileUploadItem); - }); + // successfulUploads.forEach(fileUploadItem => { + // this.addFileToNewUploadsFileTree(fileUploadItem.name, fileUploadItem); + // }); const successMessage = `${successfulUploads.length} file${successfulUploads.length > 1 ? "s" : ""} selected successfully!`; this.showFileUploadBanner("success", successMessage); } @@ -100,54 +94,10 @@ export class FilesUploaderComponent { this.showFileUploadBanner("error", errorMessage); } - this.uploadedFiles.emit(Array.from(this.newUploadNodeToFileItems.values())); + this.uploadedFiles.emit(successfulUploads); }) .catch(error => { this.showFileUploadBanner("error", `Unexpected error: ${error.message}`); }); } - - onPreviouslyUploadedFileDeleted(node: DatasetFileNode) { - this.removeFileTreeNode(node, true); - const paths = getPathsUnderOrEqualDatasetFileNode(node); - this.removingFilePaths.emit(paths); - } - - onNewUploadsFileDeleted(node: DatasetFileNode) { - this.removeFileTreeNode(node, false); - this.uploadedFiles.emit(Array.from(this.newUploadNodeToFileItems.values())); - } - - private removeFileTreeNode(node: DatasetFileNode, fromPreviouslyUploads: boolean) { - if (fromPreviouslyUploads) { - if (!this.previouslyUploadFilesManager) { - this.previouslyUploadFilesManager = new DatasetVersionFileTreeManager(this.previouslyUploadFiles); - } - if (this.previouslyUploadFilesManager) { - this.previouslyUploadFilesManager.removeNode(node); - this.previouslyUploadFiles = [...this.previouslyUploadFilesManager.getRootNodes()]; - } - } else { - // from new uploads - this.newUploadFileTreeManager.removeNode(node); - this.newUploadFileTreeNodes = [...this.newUploadFileTreeManager.getRootNodes()]; - this.removeNodeAndChildrenFromFileItemsMap(node); - } - } - - private removeNodeAndChildrenFromFileItemsMap(node: DatasetFileNode) { - this.newUploadNodeToFileItems.delete(node); - - // Recursively remove children if it's a directory - if (node.type === "directory" && node.children) { - node.children.forEach(child => this.removeNodeAndChildrenFromFileItemsMap(child)); - } - } - - private addFileToNewUploadsFileTree(path: string, fileUploadItem: FileUploadItem) { - const newNode = this.newUploadFileTreeManager.addNodeWithPath(path); - - 
this.newUploadFileTreeNodes = [...this.newUploadFileTreeManager.getRootNodes()]; - this.newUploadNodeToFileItems.set(newNode, fileUploadItem); - } } diff --git a/core/gui/src/app/dashboard/component/user/list-item/list-item.component.html b/core/gui/src/app/dashboard/component/user/list-item/list-item.component.html index 0a86c22b302..89b6f898c4e 100644 --- a/core/gui/src/app/dashboard/component/user/list-item/list-item.component.html +++ b/core/gui/src/app/dashboard/component/user/list-item/list-item.component.html @@ -116,11 +116,12 @@ nz-col nzFlex="75px" class="resource-info"> -
- Size: {{ formatSize(entry.size) }} -
+ + + + + +
{ - this.isPublic = dashboardDataset.dataset.isPublic === 1; + this.isPublic = dashboardDataset.dataset.isPublic; }); } } diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.html b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.html index bd134b605c3..0f8fa840c88 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.html +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.html @@ -1,4 +1,4 @@ -
+

Dataset: {{datasetName}}

Dataset: {{datasetName}}
- +

@@ -71,6 +71,7 @@

+ + nzTheme="outline">
-
-
-
Create a New Dataset
-
Create a New Version
-
-
- - -
-
-
-
-

Dataset Explorer

-
-
-
Choose a Version:
-
- - - +
+ + +
+
Choose a Version:
+
+ + + + + + + + + + + + + + + + +
+
+ + Version Size: {{ formatSize(currentDatasetVersionSize) }} +
+
+ + +
+
+ + + + + +
+
+ {{ uploadProgress.status }}: {{ uploadProgress.filePath }} + +
+ +
+ -
-
- - Version Size: {{ formatSize(currentDatasetVersionSize) }} -
-
- - > - - + +
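Note for reviewers: the explorer template above reflects the staging model introduced by this PR. Uploads and deletions land as uncommitted LakeFS changes, the staged-objects panel lists them, and "Create a New Dataset Version" commits them. A rough sketch of that sequence in terms of the service calls visible elsewhere in this diff (the interfaces are inferred shapes, not the real `DatasetService`):

```typescript
import { Observable } from "rxjs";
import { switchMap, tap } from "rxjs/operators";

// Inferred shapes, based only on call sites in this PR:
interface StagedObject { path: string; diffType: "added" | "removed" | "changed"; }
interface DatasetVersionLike { name: string; dvid?: number; }
interface DatasetServiceLike {
  deleteDatasetFile(did: number, relativePath: string): Observable<unknown>;
  getDatasetDiff(did: number): Observable<StagedObject[]>;
  createDatasetVersion(did: number, versionName: string): Observable<DatasetVersionLike>;
}

// Delete a file, inspect the staged diff, then commit it as a new version.
function deleteThenCommit(
  svc: DatasetServiceLike,
  did: number,
  relativePath: string,
  versionName: string
): Observable<DatasetVersionLike> {
  return svc.deleteDatasetFile(did, relativePath).pipe(           // stages a deletion in LakeFS
    switchMap(() => svc.getDatasetDiff(did)),                     // the uncommitted diff now lists the removal
    tap(staged => console.log(`${staged.length} staged change(s) pending`)),
    switchMap(() => svc.createDatasetVersion(did, versionName))   // commits all staged changes
  );
}
```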
diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.scss b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.scss index 7a57e8f5978..f2c6e53cfb9 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.scss +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.scss @@ -1,15 +1,17 @@ -.rounded-button { - display: inline-flex; /* Use flexbox for alignment */ +.create-dataset-version-button { + display: flex; /* Use flexbox for centering */ align-items: center; /* Center vertically */ justify-content: center; /* Center horizontally */ color: white; border: none; - padding: 10px 20px; /* Adjust padding as needed */ + padding: 12px 40px; /* Increase padding for a wider button */ border-radius: 25px; cursor: pointer; transition: background-color 0.3s; - margin-top: 50px; - margin-left: 20px; + margin: 50px auto 0 auto; /* Auto margins for horizontal centering */ + width: 200px; /* Adjust width as needed */ + font-size: 18px; /* Make text slightly bigger */ + font-weight: bold; /* Optional: Make text bold */ } .version-storage { @@ -139,3 +141,11 @@ nz-select { .liked { color: red; } + +.empty-version-indicator { + margin-top: 15%; +} + +.upload-progress-container { + margin-left: 20px; +} diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts index 81b192b8204..fd488d6a9ec 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts @@ -1,9 +1,14 @@ -import { Component, OnInit } from "@angular/core"; +import { Component, EventEmitter, OnInit, Output } from "@angular/core"; import { ActivatedRoute, Router } from "@angular/router"; import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; -import { DatasetService } from "../../../../service/user/dataset/dataset.service"; +import { DatasetService, MultipartUploadProgress } from "../../../../service/user/dataset/dataset.service"; import { NzResizeEvent } from "ng-zorro-antd/resizable"; -import { DatasetFileNode, getFullPathFromDatasetFileNode } from "../../../../../common/type/datasetVersionFileTree"; +import { + DatasetFileNode, + getFullPathFromDatasetFileNode, + getPathsUnderOrEqualDatasetFileNode, + getRelativePathFromDatasetFileNode, +} from "../../../../../common/type/datasetVersionFileTree"; import { DatasetVersion } from "../../../../../common/type/dataset"; import { switchMap, throttleTime } from "rxjs/operators"; import { NotificationService } from "../../../../../common/service/notification/notification.service"; @@ -13,6 +18,12 @@ import { DASHBOARD_USER_DATASET } from "../../../../../app-routing.constant"; import { UserService } from "../../../../../common/service/user/user.service"; import { isDefined } from "../../../../../common/util/predicate"; import { HubService } from "../../../../../hub/service/hub.service"; +import { FileUploadItem } from "../../../../type/dashboard-file.interface"; +import { file } from "jszip"; +import { DatasetStagedObject } from "../../../../../common/type/dataset-staged-object"; +import { NzModalService } from "ng-zorro-antd/modal"; +import 
{ UserDatasetVersionCreatorComponent } from "./user-dataset-version-creator/user-dataset-version-creator.component"; +import { DashboardDataset } from "../../../../type/dashboard-dataset.interface"; export const THROTTLE_TIME_MS = 1000; @@ -40,8 +51,6 @@ export class DatasetDetailComponent implements OnInit { public selectedVersion: DatasetVersion | undefined; public fileTreeNodeList: DatasetFileNode[] = []; - public isCreatingVersion: boolean = false; - public isCreatingDataset: boolean = false; public versionCreatorBaseVersion: DatasetVersion | undefined; public isLogin: boolean = this.userService.isLogin(); @@ -51,9 +60,14 @@ export class DatasetDetailComponent implements OnInit { public viewCount: number = 0; public displayPreciseViewCount = false; + userHasPendingChanges: boolean = false; + public uploadProgress: MultipartUploadProgress | null = null; + + @Output() userMakeChanges = new EventEmitter(); + constructor( private route: ActivatedRoute, - private router: Router, + private modalService: NzModalService, private datasetService: DatasetService, private notificationService: NotificationService, private downloadService: DownloadService, @@ -72,7 +86,7 @@ export class DatasetDetailComponent implements OnInit { // item for control the resizeable sider MAX_SIDER_WIDTH = 600; MIN_SIDER_WIDTH = 150; - siderWidth = 200; + siderWidth = 400; id = -1; onSideResize({ width }: NzResizeEvent): void { cancelAnimationFrame(this.id); @@ -85,15 +99,9 @@ export class DatasetDetailComponent implements OnInit { this.route.params .pipe( switchMap(params => { - const param = params["did"]; - if (param !== "create") { - this.did = param; - this.renderDatasetViewSider(); - this.retrieveDatasetInfo(); - this.retrieveDatasetVersionList(); - } else { - this.renderDatasetCreatorSider(); - } + this.did = params["did"]; + this.retrieveDatasetInfo(); + this.retrieveDatasetVersionList(); return this.route.data; // or some other observable }), untilDestroyed(this) @@ -131,55 +139,35 @@ export class DatasetDetailComponent implements OnInit { }); } - renderDatasetViewSider() { - this.isCreatingVersion = false; - this.isCreatingDataset = false; - } - renderDatasetCreatorSider() { - this.isCreatingVersion = false; - this.isCreatingDataset = true; - this.siderWidth = this.MAX_SIDER_WIDTH; - } - - renderVersionCreatorSider() { + public onClickOpenVersionCreator() { if (this.did) { - this.datasetService - .retrieveDatasetLatestVersion(this.did) - .pipe(untilDestroyed(this)) - .subscribe(latestVersion => { - this.versionCreatorBaseVersion = latestVersion; - this.isCreatingDataset = false; - this.isCreatingVersion = true; - this.siderWidth = this.MAX_SIDER_WIDTH; - }); - } - } - - public onCreationFinished(creationID: number) { - if (creationID != 0) { - // creation succeed - if (this.isCreatingVersion) { - this.retrieveDatasetVersionList(); - this.renderDatasetViewSider(); - } else { - this.router.navigate([`${DASHBOARD_USER_DATASET}/${creationID}`]); - } - } else { - // creation failed - if (this.isCreatingVersion) { - this.isCreatingVersion = false; - this.isCreatingDataset = false; - this.retrieveDatasetVersionList(); - } else { - this.router.navigate([DASHBOARD_USER_DATASET]); - } + const modal = this.modalService.create({ + nzTitle: "Create New Dataset Version", + nzContent: UserDatasetVersionCreatorComponent, + nzFooter: null, + nzData: { + isCreatingVersion: true, + did: this.did, + }, + nzBodyStyle: { + resize: "both", + overflow: "auto", + minHeight: "200px", + minWidth: "550px", + maxWidth: "90vw", + 
maxHeight: "80vh", + }, + nzWidth: "fit-content", + }); + modal.afterClose.pipe(untilDestroyed(this)).subscribe(result => { + if (result != null) { + this.retrieveDatasetVersionList(); + this.userMakeChanges.emit(); + } + }); } } - public onClickOpenVersionCreator() { - this.renderVersionCreatorSider(); - } - onPublicStatusChange(checked: boolean): void { // Handle the change in dataset public status if (this.did) { @@ -212,7 +200,7 @@ export class DatasetDetailComponent implements OnInit { this.datasetName = dataset.name; this.datasetDescription = dataset.description; this.userDatasetAccessLevel = dashboardDataset.accessPrivilege; - this.datasetIsPublic = dataset.isPublic === 1; + this.datasetIsPublic = dataset.isPublic; if (typeof dataset.creationTime === "number") { this.datasetCreationTime = new Date(dataset.creationTime).toString(); } @@ -229,8 +217,10 @@ export class DatasetDetailComponent implements OnInit { this.versions = versionNames; // by default, the selected version is the 1st element in the retrieved list // which is guaranteed(by the backend) to be the latest created version. - this.selectedVersion = this.versions[0]; - this.onVersionSelected(this.selectedVersion); + if (this.versions.length > 0) { + this.selectedVersion = this.versions[0]; + this.onVersionSelected(this.selectedVersion); + } }); } } @@ -246,15 +236,6 @@ export class DatasetDetailComponent implements OnInit { this.downloadService.downloadSingleFile(this.currentDisplayedFileName).pipe(untilDestroyed(this)).subscribe(); }; - onClickDownloadVersionAsZip = (): void => { - if (!this.did || !this.selectedVersion?.dvid) return; - - this.downloadService - .downloadDatasetVersion(this.did, this.selectedVersion.dvid, this.datasetName, this.selectedVersion.name) - .pipe(untilDestroyed(this)) - .subscribe(); - }; - onClickScaleTheView() { this.isMaximized = !this.isMaximized; } @@ -263,6 +244,10 @@ export class DatasetDetailComponent implements OnInit { this.isRightBarCollapsed = !this.isRightBarCollapsed; } + onStagedObjectsUpdated(stagedObjects: DatasetStagedObject[]) { + this.userHasPendingChanges = stagedObjects.length > 0; + } + onVersionSelected(version: DatasetVersion): void { this.selectedVersion = version; if (this.did && this.selectedVersion.dvid) @@ -284,14 +269,93 @@ export class DatasetDetailComponent implements OnInit { this.loadFileContent(node); } - isDisplayingDataset(): boolean { - return !this.isCreatingDataset && !this.isCreatingVersion; - } - userHasWriteAccess(): boolean { return this.userDatasetAccessLevel == "WRITE"; } + onNewUploadFilesChanged(files: FileUploadItem[]) { + if (this.did) { + const did = this.did; + files.forEach(file => { + this.datasetService + .multipartUpload(this.datasetName, file.name, file.file) + .pipe(untilDestroyed(this)) + .subscribe({ + next: res => { + this.uploadProgress = res; // Update the progress UI + }, + error: () => { + this.uploadProgress = { + filePath: file.name, + percentage: 100, + status: "aborted", + physicalAddress: "", + uploadId: "", + }; + setTimeout(() => (this.uploadProgress = null), 3000); // Auto-hide after 3s + }, + complete: () => { + this.uploadProgress = { + filePath: file.name, + percentage: 100, + status: "finished", + uploadId: "", + physicalAddress: "", + }; + this.userMakeChanges.emit(); + setTimeout(() => (this.uploadProgress = null), 3000); // Auto-hide after 3s + }, + }); + }); + } + } + + onClickAbortUploadProgress() { + if (this.uploadProgress) { + this.datasetService + .finalizeMultipartUpload( + this.datasetName, + 
this.uploadProgress.filePath, + this.uploadProgress.uploadId, + [], + this.uploadProgress.physicalAddress, + true + ) + .pipe(untilDestroyed(this)) + .subscribe(res => { + this.notificationService.info(`${this.uploadProgress?.filePath} uploading has been terminated`); + }); + } + this.uploadProgress = null; + } + + getUploadStatus(status: "initializing" | "uploading" | "finished" | "aborted"): "active" | "exception" | "success" { + return status === "uploading" || status === "initializing" + ? "active" + : status === "aborted" + ? "exception" + : "success"; + } + + onPreviouslyUploadedFileDeleted(node: DatasetFileNode) { + if (this.did) { + this.datasetService + .deleteDatasetFile(this.did, getRelativePathFromDatasetFileNode(node)) + .pipe(untilDestroyed(this)) + .subscribe({ + next: (res: Response) => { + this.notificationService.success( + `File ${node.name} is successfully deleted. You may finalize it or revert it at the "Create Version" panel` + ); + this.userMakeChanges.emit(); + }, + error: (err: unknown) => { + this.notificationService.error("Failed to delete the file"); + }, + }); + } + } + // alias for formatSize formatSize = formatSize; diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts index b46160732be..7f649b821bb 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts @@ -9,6 +9,7 @@ import { NotificationService } from "../../../../../../common/service/notificati export const MIME_TYPES = { JPEG: "image/jpeg", + JPG: "image/jpeg", PNG: "image/png", CSV: "text/csv", TXT: "text/plain", @@ -24,6 +25,13 @@ export const MIME_TYPES = { OCTET_STREAM: "application/octet-stream", // Default binary format }; +export function getMimeType(filename: string): string { + const extension = filename.split(".").pop()?.toUpperCase(); + return extension && MIME_TYPES[extension as keyof typeof MIME_TYPES] + ? MIME_TYPES[extension as keyof typeof MIME_TYPES] + : MIME_TYPES.OCTET_STREAM; +} + // the size limits for all preview-supported types export const MIME_TYPE_SIZE_LIMITS_MB = { [MIME_TYPES.JPEG]: 5 * 1024 * 1024, // 5 MB @@ -136,7 +144,7 @@ export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDe .subscribe({ next: blob => { this.isLoading = false; - const blobMimeType = blob.type; + const blobMimeType = getMimeType(this.filePath); if (!this.isPreviewSupported(blobMimeType)) { this.onFileTypePreviewUnsupported(); return; diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.html b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.html new file mode 100644 index 00000000000..f4ec6203ba6 --- /dev/null +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.html @@ -0,0 +1,37 @@ +
+ + + + {{ obj.diffType }} + + + {{ obj.path }} + + + + + + + + +
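Reviewer note: the rows above render `DatasetStagedObject`s, the frontend mirror of the backend `Diff` returned by `GET /{did}/diff`; reverting a row calls `resetDatasetFileDiff`, i.e. `PUT /{did}/diff?filePath=...`. A small illustrative helper showing the payload shape (the label format is mine, not part of this PR):

```typescript
// Mirrors core/gui/src/app/common/type/dataset-staged-object.ts added in this PR.
interface DatasetStagedObject {
  path: string;
  pathType: "file" | "directory";
  diffType: "added" | "removed" | "changed";
  sizeBytes?: number; // only present for files
}

// Turn one staged change into a human-readable row label.
function describeStagedObject(obj: DatasetStagedObject): string {
  const size = obj.sizeBytes === undefined ? "" : ` (${(obj.sizeBytes / 1024).toFixed(1)} KiB)`;
  return `${obj.diffType.toUpperCase()} ${obj.pathType} ${obj.path}${size}`;
}

// describeStagedObject({ path: "california/irvine/tw1.csv", pathType: "file", diffType: "added", sizeBytes: 2048 })
// -> "ADDED file california/irvine/tw1.csv (2.0 KiB)"
```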
diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.scss b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.scss new file mode 100644 index 00000000000..858dc36eae5 --- /dev/null +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.scss @@ -0,0 +1,24 @@ +/* Styles for the file tree container */ +.staged-object-list-container { + max-height: 200px; /* Adjust the max-height as needed */ + overflow-y: auto; /* Enables vertical scrolling when content exceeds max-height */ + overflow-x: auto; /* Prevents horizontal scrolling */ +} + +.truncate-file-path { + display: inline-block; + max-width: 250px; /* Adjust width as needed */ + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.delete-button { + width: 24px; /* Minimum width for button */ + height: 24px; /* Keep the button small */ + padding: 0; /* Remove extra padding */ + display: flex; + align-items: center; + justify-content: center; + min-width: unset; /* Prevents unnecessary expansion */ +} diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.ts b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.ts new file mode 100644 index 00000000000..50d0d02ce0d --- /dev/null +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-staged-objects-list/user-dataset-staged-objects-list.component.ts @@ -0,0 +1,65 @@ +import { Component, EventEmitter, Input, OnInit, Output } from "@angular/core"; +import { DatasetStagedObject } from "../../../../../../common/type/dataset-staged-object"; +import { DatasetService } from "../../../../../service/user/dataset/dataset.service"; +import { NotificationService } from "../../../../../../common/service/notification/notification.service"; +import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; + +@UntilDestroy() +@Component({ + selector: "texera-dataset-staged-objects-list", + templateUrl: "./user-dataset-staged-objects-list.component.html", + styleUrls: ["./user-dataset-staged-objects-list.component.scss"], +}) +export class UserDatasetStagedObjectsListComponent implements OnInit { + @Input() did?: number; // Dataset ID + @Input() set userMakeChangesEvent(event: EventEmitter) { + if (event) { + event.pipe(untilDestroyed(this)).subscribe(() => { + this.fetchDatasetStagedObjects(); + }); + } + } + + @Output() stagedObjectsChanged = new EventEmitter(); // Emits staged objects list + + datasetStagedObjects: DatasetStagedObject[] = []; + + constructor( + private datasetService: DatasetService, + private notificationService: NotificationService + ) {} + + ngOnInit(): void { + this.fetchDatasetStagedObjects(); + } + + private fetchDatasetStagedObjects(): void { + if (this.did != undefined) { + this.datasetService + .getDatasetDiff(this.did) + .pipe(untilDestroyed(this)) + .subscribe(diffs => { + this.datasetStagedObjects = diffs; + // Emit the updated staged objects list + this.stagedObjectsChanged.emit(this.datasetStagedObjects); + }); + } + } + + onObjectReverted(objDiff: DatasetStagedObject) { + if (this.did) { 
+ this.datasetService + .resetDatasetFileDiff(this.did, objDiff.path) + .pipe(untilDestroyed(this)) + .subscribe({ + next: (res: Response) => { + this.notificationService.success(`"${objDiff.diffType} ${objDiff.path}" is successfully reverted`); + this.fetchDatasetStagedObjects(); + }, + error: (err: unknown) => { + this.notificationService.error("Failed to delete the file"); + }, + }); + } + } +} diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.html b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.html index 5a9401b2be6..477d1ba61c5 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.html +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.html @@ -1,11 +1,11 @@
+ nzTip="Creating...">
+ [ngClass]="{'disabled-backdrop': isCreating}">
-
- - -
@@ -39,7 +32,7 @@ nzType="default" (click)="onClickCancel()" class="cancel-btn" - [disabled]="isUploading"> + [disabled]="isCreating"> Cancel
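Reviewer note: the creator component below now runs inside an NzModal, so callers pass `NZ_MODAL_DATA` instead of `@Input()`s and read the outcome from `afterClose` (the created object on success, `null` on cancel or failure). `DatasetDetailComponent` above shows the new-version case; a hedged sketch of the new-dataset case (the callback and import path are illustrative):

```typescript
import { NzModalService } from "ng-zorro-antd/modal";
// Path as used by DatasetDetailComponent in this PR; adjust it to the caller's location.
import { UserDatasetVersionCreatorComponent } from "./user-dataset-version-creator/user-dataset-version-creator.component";

function openDatasetCreator(modal: NzModalService, onCreated: (did: number) => void): void {
  const ref = modal.create({
    nzTitle: "Create New Dataset",
    nzContent: UserDatasetVersionCreatorComponent,
    nzFooter: null,
    nzData: { isCreatingVersion: false }, // no `did`: the component falls through to createDataset()
  });

  ref.afterClose.subscribe(result => {
    // On success the component closes with the created DashboardDataset; on cancel/error with null.
    if (result?.dataset?.did !== undefined) {
      onCreated(result.dataset.did);
    }
  });
}
```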
diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.scss b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.scss index b31ae1fd0f7..71ecd39e768 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.scss +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.scss @@ -27,6 +27,7 @@ cursor: pointer; margin-right: 15%; margin-left: 5%; + margin-top: 10%; width: 30%; text-align: center; &:hover { diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.ts b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.ts index bfa34d04623..d062264c815 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.ts +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component.ts @@ -1,13 +1,12 @@ -import { Component, EventEmitter, Input, OnInit, Output } from "@angular/core"; +import { Component, EventEmitter, inject, Input, OnInit, Output } from "@angular/core"; import { FormBuilder, FormGroup, Validators } from "@angular/forms"; import { FormlyFieldConfig } from "@ngx-formly/core"; import { DatasetService } from "../../../../../service/user/dataset/dataset.service"; -import { FileUploadItem } from "../../../../../type/dashboard-file.interface"; -import { Dataset, DatasetVersion } from "../../../../../../common/type/dataset"; +import { Dataset } from "../../../../../../common/type/dataset"; import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; import { NotificationService } from "../../../../../../common/service/notification/notification.service"; -import sanitize from "sanitize-filename"; import { HttpErrorResponse } from "@angular/common/http"; +import { NZ_MODAL_DATA, NzModalRef } from "ng-zorro-antd/modal"; @UntilDestroy() @Component({ @@ -16,22 +15,12 @@ import { HttpErrorResponse } from "@angular/common/http"; styleUrls: ["./user-dataset-version-creator.component.scss"], }) export class UserDatasetVersionCreatorComponent implements OnInit { - @Input() - isCreatingVersion: boolean = false; + readonly isCreatingVersion: boolean = inject(NZ_MODAL_DATA).isCreatingVersion; - @Input() - baseVersion: DatasetVersion | undefined; - - // this emits the ID of the newly created version/dataset, will emit 0 if creation is failed. - @Output() - datasetOrVersionCreationID: EventEmitter = new EventEmitter(); + readonly did: number = inject(NZ_MODAL_DATA)?.did ?? 
undefined; isCreateButtonDisabled: boolean = false; - newUploadFiles: FileUploadItem[] = []; - - removedFilePaths: string[] = []; - public form: FormGroup = new FormGroup({}); model: any = {}; fields: FormlyFieldConfig[] = []; @@ -41,9 +30,10 @@ export class UserDatasetVersionCreatorComponent implements OnInit { isDatasetNameSanitized: boolean = false; // boolean to control if is uploading - isUploading: boolean = false; + isCreating: boolean = false; constructor( + private modalRef: NzModalRef, private datasetService: DatasetService, private notificationService: NotificationService, private formBuilder: FormBuilder @@ -86,15 +76,6 @@ export class UserDatasetVersionCreatorComponent implements OnInit { label: "Description", }, }, - { - key: "versionDescription", - type: "input", - defaultValue: "", - templateOptions: { - label: "Version Description", - required: false, - }, - }, ]; } get formControlNames(): string[] { @@ -102,10 +83,16 @@ export class UserDatasetVersionCreatorComponent implements OnInit { } datasetNameSanitization(datasetName: string): string { - const sanitizedDatasetName = sanitize(datasetName); - if (sanitizedDatasetName != datasetName) { + // Remove leading spaces + let sanitizedDatasetName = datasetName.trimStart(); + + // Replace all characters that are not letters (a-z, A-Z), numbers (0-9) with a short dash "-" + sanitizedDatasetName = sanitizedDatasetName.replace(/[^a-zA-Z0-9]+/g, "-"); + + if (sanitizedDatasetName !== datasetName) { this.isDatasetNameSanitized = true; } + return sanitizedDatasetName; } @@ -117,7 +104,7 @@ export class UserDatasetVersionCreatorComponent implements OnInit { } onClickCancel() { - this.datasetOrVersionCreationID.emit(0); + this.modalRef.close(null); } onClickCreate() { @@ -128,61 +115,56 @@ export class UserDatasetVersionCreatorComponent implements OnInit { return; // Stop further execution if the form is not valid } - if (this.newUploadFiles.length == 0 && this.removedFilePaths.length == 0) { - this.notificationService.error( - `Please either upload new file(s) or remove old file(s) when creating a new ${this.isCreatingVersion ? "Version" : "Dataset"}` - ); - return; - } - - this.isUploading = true; - if (this.isCreatingVersion && this.baseVersion) { + this.isCreating = true; + if (this.isCreatingVersion && this.did) { const versionName = this.form.get("versionDescription")?.value; this.datasetService - .createDatasetVersion(this.baseVersion?.did, versionName, this.removedFilePaths, this.newUploadFiles) + .createDatasetVersion(this.did, versionName) .pipe(untilDestroyed(this)) .subscribe({ next: res => { this.notificationService.success("Version Created"); - this.datasetOrVersionCreationID.emit(res.dvid); - this.isUploading = false; + this.isCreating = false; + // creation succeed, emit created version + this.modalRef.close(res); }, error: (res: unknown) => { const err = res as HttpErrorResponse; this.notificationService.error(`Version creation failed: ${err.error.message}`); - this.isUploading = false; + this.isCreating = false; + // creation failed, emit null value + this.modalRef.close(null); }, }); } else { const ds: Dataset = { name: this.datasetNameSanitization(this.form.get("name")?.value), description: this.form.get("description")?.value, - isPublic: this.isDatasetPublic ? 
1 : 0, + isPublic: this.isDatasetPublic, did: undefined, ownerUid: undefined, storagePath: undefined, creationTime: undefined, versionHierarchy: undefined, }; - const initialVersionName = this.form.get("versionDescription")?.value; - - // do the name sanitization - this.datasetService - .createDataset(ds, initialVersionName, this.newUploadFiles) + .createDataset(ds) .pipe(untilDestroyed(this)) .subscribe({ next: res => { this.notificationService.success( `Dataset '${ds.name}' Created. ${this.isDatasetNameSanitized ? "We have sanitized your provided dataset name for the compatibility reason" : ""}` ); - this.datasetOrVersionCreationID.emit(res.dataset.did); - this.isUploading = false; + this.isCreating = false; + // if creation succeed, emit the created dashboard dataset + this.modalRef.close(res); }, error: (res: unknown) => { const err = res as HttpErrorResponse; this.notificationService.error(`Dataset ${ds.name} creation failed: ${err.error.message}`); - this.isUploading = false; + this.isCreating = false; + // if creation failed, emit null value + this.modalRef.close(null); }, }); } @@ -192,12 +174,4 @@ export class UserDatasetVersionCreatorComponent implements OnInit { // Handle the change in dataset public status this.isDatasetPublic = newValue; } - - onNewUploadFilesChanged(files: FileUploadItem[]) { - this.newUploadFiles = files; - } - - onRemovingFilePathsChanged(paths: string[]) { - this.removedFilePaths = this.removedFilePaths.concat(paths); - } } diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.scss b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.scss index fe9407f78a0..a20082ce2c7 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.scss +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.scss @@ -10,7 +10,7 @@ /* Styles for the file tree container */ .file-tree-container { - max-height: 500px; /* Adjust the max-height as needed */ + max-height: 200px; /* Adjust the max-height as needed */ overflow-y: auto; /* Enables vertical scrolling when content exceeds max-height */ overflow-x: auto; /* Prevents horizontal scrolling */ } diff --git a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset.component.ts b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset.component.ts index e1bbb7a67b4..ca1b69da633 100644 --- a/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset.component.ts +++ b/core/gui/src/app/dashboard/component/user/user-dataset/user-dataset.component.ts @@ -9,7 +9,12 @@ import { DashboardEntry, UserInfo } from "../../../type/dashboard-entry"; import { SearchResultsComponent } from "../search-results/search-results.component"; import { FiltersComponent } from "../filters/filters.component"; import { firstValueFrom } from "rxjs"; -import { DASHBOARD_USER_DATASET_CREATE } from "../../../../app-routing.constant"; +import { DASHBOARD_USER_DATASET, DASHBOARD_USER_DATASET_CREATE } from "../../../../app-routing.constant"; +import { NzModalService } from "ng-zorro-antd/modal"; +import { FileSelectionComponent } from "../../../../workspace/component/file-selection/file-selection.component"; +import { DatasetFileNode, 
getFullPathFromDatasetFileNode } from "../../../../common/type/datasetVersionFileTree"; +import { UserDatasetVersionCreatorComponent } from "./user-dataset-explorer/user-dataset-version-creator/user-dataset-version-creator.component"; +import { DashboardDataset } from "../../../type/dashboard-dataset.interface"; @UntilDestroy() @Component({ @@ -50,6 +55,7 @@ export class UserDatasetComponent implements AfterViewInit { private masterFilterList: ReadonlyArray | null = null; constructor( + private modalService: NzModalService, private userService: UserService, private router: Router, private searchService: SearchService, @@ -152,7 +158,30 @@ export class UserDatasetComponent implements AfterViewInit { } public onClickOpenDatasetAddComponent(): void { - this.router.navigate([DASHBOARD_USER_DATASET_CREATE]); + const modal = this.modalService.create({ + nzTitle: "Create New Dataset", + nzContent: UserDatasetVersionCreatorComponent, + nzFooter: null, + nzData: { + isCreatingVersion: false, + }, + nzBodyStyle: { + resize: "both", + overflow: "auto", + minHeight: "200px", + minWidth: "550px", + maxWidth: "90vw", + maxHeight: "80vh", + }, + nzWidth: "fit-content", + }); + // Handle the selection from the modal + modal.afterClose.pipe(untilDestroyed(this)).subscribe(result => { + if (result != null) { + const dashboardDataset: DashboardDataset = result as DashboardDataset; + this.router.navigate([`${DASHBOARD_USER_DATASET}/${dashboardDataset.dataset.did}`]); + } + }); } public deleteDataset(entry: DashboardEntry): void { @@ -160,7 +189,7 @@ export class UserDatasetComponent implements AfterViewInit { return; } this.datasetService - .deleteDatasets([entry.dataset.dataset.did]) + .deleteDatasets(entry.dataset.dataset.did) .pipe(untilDestroyed(this)) .subscribe(_ => { this.searchResultsComponent.entries = this.searchResultsComponent.entries.filter( diff --git a/core/gui/src/app/dashboard/service/user/dataset/dataset.service.ts b/core/gui/src/app/dashboard/service/user/dataset/dataset.service.ts index ddcef981906..58a98af3d56 100644 --- a/core/gui/src/app/dashboard/service/user/dataset/dataset.service.ts +++ b/core/gui/src/app/dashboard/service/user/dataset/dataset.service.ts @@ -1,12 +1,13 @@ import { Injectable } from "@angular/core"; import { HttpClient, HttpParams } from "@angular/common/http"; -import { map } from "rxjs/operators"; +import { catchError, map, mergeMap, switchMap, tap, toArray } from "rxjs/operators"; import { Dataset, DatasetVersion } from "../../../../common/type/dataset"; import { AppSettings } from "../../../../common/app-setting"; -import { Observable } from "rxjs"; +import { EMPTY, forkJoin, from, Observable, of, throwError } from "rxjs"; import { DashboardDataset } from "../../../type/dashboard-dataset.interface"; -import { FileUploadItem } from "../../../type/dashboard-file.interface"; import { DatasetFileNode } from "../../../../common/type/datasetVersionFileTree"; +import { DatasetStagedObject } from "../../../../common/type/dataset-staged-object"; +import { environment } from "../../../../../environments/environment"; export const DATASET_BASE_URL = "dataset"; export const DATASET_CREATE_URL = DATASET_BASE_URL + "/create"; @@ -26,28 +27,26 @@ export const DATASET_PUBLIC_VERSION_BASE_URL = "publicVersion"; export const DATASET_PUBLIC_VERSION_RETRIEVE_LIST_URL = DATASET_PUBLIC_VERSION_BASE_URL + "/list"; export const DATASET_GET_OWNERS_URL = DATASET_BASE_URL + "/datasetUserAccess"; +export interface MultipartUploadProgress { + filePath: string; + percentage: number; + status: 
"initializing" | "uploading" | "finished" | "aborted"; + uploadId: string; + physicalAddress: string; +} + @Injectable({ providedIn: "root", }) export class DatasetService { constructor(private http: HttpClient) {} - public createDataset( - dataset: Dataset, - initialVersionName: string, - filesToBeUploaded: FileUploadItem[] - ): Observable { - const formData = new FormData(); - formData.append("datasetName", dataset.name); - formData.append("datasetDescription", dataset.description); - formData.append("isDatasetPublic", dataset.isPublic.toString()); - formData.append("initialVersionName", initialVersionName); - - filesToBeUploaded.forEach(file => { - formData.append(`file:upload:${file.name}`, file.file); + public createDataset(dataset: Dataset): Observable { + return this.http.post(`${AppSettings.getApiEndpoint()}/${DATASET_CREATE_URL}`, { + datasetName: dataset.name, + datasetDescription: dataset.description, + isDatasetPublic: dataset.isPublic, }); - - return this.http.post(`${AppSettings.getApiEndpoint()}/${DATASET_CREATE_URL}`, formData); } public getDataset(did: number, isLogin: boolean = true): Observable { @@ -57,11 +56,23 @@ export class DatasetService { return this.http.get(apiUrl); } - public retrieveDatasetVersionSingleFile(path: string): Observable { - const encodedPath = encodeURIComponent(path); - return this.http.get(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/file?path=${encodedPath}`, { - responseType: "blob", - }); + /** + * Retrieves a single file from a dataset version using a pre-signed URL. + * @param filePath Relative file path within the dataset. + * @returns Observable + */ + public retrieveDatasetVersionSingleFile(filePath: string): Observable { + return this.http + .get<{ + presignedUrl: string; + }>( + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/presign-download?filePath=${encodeURIComponent(filePath)}` + ) + .pipe( + switchMap(({ presignedUrl }) => { + return this.http.get(presignedUrl, { responseType: "blob" }); + }) + ); } /** @@ -72,6 +83,7 @@ export class DatasetService { * @returns An Observable that emits a Blob containing the zip file */ public retrieveDatasetZip(options: { did: number; dvid?: number }): Observable { + // TODO: finish this let params = new HttpParams(); params = params.set("did", options.did.toString()); if (options.dvid) { @@ -85,31 +97,16 @@ export class DatasetService { } public retrieveAccessibleDatasets(): Observable { - return this.http.get(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}`); + return this.http.get(`${AppSettings.getApiEndpoint()}/${DATASET_LIST_URL}`); } - public createDatasetVersion( - did: number, - newVersion: string, - removedFilePaths: string[], - filesToBeUploaded: FileUploadItem[] - ): Observable { - const formData = new FormData(); - formData.append("versionName", newVersion); - - if (removedFilePaths.length > 0) { - const removedFilesString = JSON.stringify(removedFilePaths); - formData.append("file:remove", removedFilesString); - } - - filesToBeUploaded.forEach(file => { - formData.append(`file:upload:${file.name}`, file.file); - }); - + public createDatasetVersion(did: number, newVersion: string): Observable { return this.http .post<{ datasetVersion: DatasetVersion; fileNodes: DatasetFileNode[]; - }>(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/${did}/version/create`, formData) + }>(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/${did}/version/create`, newVersion, { + headers: { "Content-Type": "text/plain" }, + }) .pipe( map(response => { 
response.datasetVersion.fileNodes = response.fileNodes; @@ -118,6 +115,193 @@ export class DatasetService { ); } + /** + * Handles multipart upload for large files using RxJS, + * with a concurrency limit on how many parts we process in parallel. + */ + public multipartUpload(datasetName: string, filePath: string, file: File): Observable { + const partCount = Math.ceil(file.size / environment.multipartUploadChunkSizeByte); + const concurrencyLimit = environment.maxNumberOfConcurrentUploadingFileChunks; + + return new Observable(observer => { + this.initiateMultipartUpload(datasetName, filePath, partCount) + .pipe( + switchMap(initiateResponse => { + const { uploadId, presignedUrls, physicalAddress } = initiateResponse; + if (!uploadId) { + observer.error(new Error("Failed to initiate multipart upload")); + return EMPTY; + } + observer.next({ + filePath: filePath, + percentage: 0, + status: "initializing", + uploadId: uploadId, + physicalAddress: physicalAddress, + }); + + // Keep track of all uploaded parts + const uploadedParts: { PartNumber: number; ETag: string }[] = []; + let uploadedCount = 0; + + // 1) Convert presignedUrls into a stream of URLs + return from(presignedUrls).pipe( + // 2) Use mergeMap with concurrency limit to upload chunk by chunk + mergeMap((url, index) => { + const start = index * environment.multipartUploadChunkSizeByte; + const end = Math.min(start + environment.multipartUploadChunkSizeByte, file.size); + const chunk = file.slice(start, end); + + // Upload the chunk + return from(fetch(url, { method: "PUT", body: chunk })).pipe( + switchMap(response => { + if (!response.ok) { + return throwError(() => new Error(`Failed to upload part ${index + 1}`)); + } + const etag = response.headers.get("ETag")?.replace(/"/g, ""); + if (!etag) { + return throwError(() => new Error(`Missing ETag for part ${index + 1}`)); + } + + // Record the uploaded part + uploadedParts.push({ PartNumber: index + 1, ETag: etag }); + uploadedCount++; + + // Emit progress after each part + observer.next({ + filePath, + percentage: Math.round((uploadedCount / partCount) * 100), + status: "uploading", + uploadId: uploadId, + physicalAddress: physicalAddress, + }); + + return of(null); // indicate success + }) + ); + }, concurrencyLimit), + // 3) Collect results from all uploads (like forkJoin, but respects concurrency) + toArray(), + // 4) Finalize if all parts succeeded + switchMap(() => + this.finalizeMultipartUpload(datasetName, filePath, uploadId, uploadedParts, physicalAddress, false) + ), + tap(() => { + observer.next({ + filePath, + percentage: 100, + status: "finished", + uploadId: uploadId, + physicalAddress: physicalAddress, + }); + observer.complete(); + }), + catchError((error: unknown) => { + // If an error occurred, abort the upload + observer.next({ + filePath, + percentage: Math.round((uploadedCount / partCount) * 100), + status: "aborted", + uploadId: uploadId, + physicalAddress: physicalAddress, + }); + + return this.finalizeMultipartUpload( + datasetName, + filePath, + uploadId, + uploadedParts, + physicalAddress, + true + ).pipe(switchMap(() => throwError(() => error))); + }) + ); + }) + ) + .subscribe({ + error: (err: unknown) => observer.error(err), + }); + }); + } + + /** + * Initiates a multipart upload and retrieves presigned URLs for each part. 
+ * @param datasetName Dataset Name + * @param filePath File path within the dataset + * @param numParts Number of parts for the multipart upload + */ + private initiateMultipartUpload( + datasetName: string, + filePath: string, + numParts: number + ): Observable<{ uploadId: string; presignedUrls: string[]; physicalAddress: string }> { + const params = new HttpParams() + .set("type", "init") + .set("datasetName", datasetName) + .set("filePath", encodeURIComponent(filePath)) + .set("numParts", numParts.toString()); + + return this.http.post<{ uploadId: string; presignedUrls: string[]; physicalAddress: string }>( + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, + {}, + { params } + ); + } + + /** + * Completes or aborts a multipart upload, sending part numbers and ETags to the backend. + */ + public finalizeMultipartUpload( + datasetName: string, + filePath: string, + uploadId: string, + parts: { PartNumber: number; ETag: string }[], + physicalAddress: string, + isAbort: boolean + ): Observable { + const params = new HttpParams() + .set("type", isAbort ? "abort" : "finish") + .set("datasetName", datasetName) + .set("filePath", encodeURIComponent(filePath)) + .set("uploadId", uploadId); + + return this.http.post( + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, + { parts, physicalAddress }, + { params } + ); + } + + /** + * Resets a dataset file difference in LakeFS. + * @param did Dataset ID + * @param filePath File path to reset + */ + public resetDatasetFileDiff(did: number, filePath: string): Observable { + const params = new HttpParams().set("filePath", encodeURIComponent(filePath)); + + return this.http.put(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/${did}/diff`, {}, { params }); + } + + /** + * Deletes a dataset file from LakeFS. + * @param did Dataset ID + * @param filePath File path to delete + */ + public deleteDatasetFile(did: number, filePath: string): Observable { + const params = new HttpParams().set("filePath", encodeURIComponent(filePath)); + + return this.http.delete(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/${did}/file`, { params }); + } + + /** + * Retrieves the list of uncommitted dataset changes (diffs). + * @param did Dataset ID + */ + public getDatasetDiff(did: number): Observable { + return this.http.get(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/${did}/diff`); + } + /** * retrieve a list of versions of a dataset. The list is sorted so that the latest versions are at front. 
* @param did @@ -165,10 +349,8 @@ export class DatasetService { return this.http.get<{ fileNodes: DatasetFileNode[]; size: number }>(apiUrl); } - public deleteDatasets(dids: number[]): Observable { - return this.http.post(`${AppSettings.getApiEndpoint()}/${DATASET_DELETE_URL}`, { - dids: dids, - }); + public deleteDatasets(did: number): Observable { + return this.http.delete(`${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/${did}`); } public updateDatasetName(did: number, name: string): Observable { diff --git a/core/gui/src/app/workspace/component/input-autocomplete/input-autocomplete.component.ts b/core/gui/src/app/workspace/component/input-autocomplete/input-autocomplete.component.ts index 008f5c81b68..24bce4c1fa9 100644 --- a/core/gui/src/app/workspace/component/input-autocomplete/input-autocomplete.component.ts +++ b/core/gui/src/app/workspace/component/input-autocomplete/input-autocomplete.component.ts @@ -50,7 +50,7 @@ export class InputAutoCompleteComponent extends FieldType { } get isFileSelectionEnabled(): boolean { - return environment.userSystemEnabled; + return environment.userSystemEnabled && environment.selectingFilesFromDatasetsEnabled; } get selectedFilePath(): string | null { diff --git a/core/gui/src/environments/environment.default.ts b/core/gui/src/environments/environment.default.ts index 52560cdcfe0..8d708f011a6 100644 --- a/core/gui/src/environments/environment.default.ts +++ b/core/gui/src/environments/environment.default.ts @@ -28,6 +28,12 @@ export const defaultEnvironment = { */ userSystemEnabled: false, + /** + * whether files are selected from datasets instead of the local file system. + * The user system must be enabled for this flag to take effect. + */ + selectingFilesFromDatasetsEnabled: true, + /** * whether local login is enabled */ @@ -78,6 +84,17 @@ export const defaultEnvironment = { */ singleFileUploadMaximumSizeMB: 20, + /** + * the maximum number of file chunks that can be held in memory; + * you may increase this number if your deployment environment has enough memory. 
+ */ + maxNumberOfConcurrentUploadingFileChunks: 10, + + /** + * the size of each chunk during the multipart upload of file + */ + multipartUploadChunkSizeByte: 50 * 1024 * 1024, // 50 MB + /** * default data transfer batch size for workflows */ diff --git a/core/gui/yarn.lock b/core/gui/yarn.lock index 050ab8bfee1..eefc75f7e67 100644 --- a/core/gui/yarn.lock +++ b/core/gui/yarn.lock @@ -11191,7 +11191,6 @@ __metadata: ring-buffer-ts: "npm:1.0.3" rxjs: "npm:7.8.1" rxjs-marbles: "npm:7.0.1" - sanitize-filename: "npm:1.6.3" sass: "npm:1.71.1" style-loader: "npm:3.3.4" tinyqueue: "npm:2.0.3" @@ -16514,15 +16513,6 @@ __metadata: languageName: node linkType: hard -"sanitize-filename@npm:1.6.3": - version: 1.6.3 - resolution: "sanitize-filename@npm:1.6.3" - dependencies: - truncate-utf8-bytes: "npm:^1.0.0" - checksum: 10c0/16ff47556a6e54e228c28db096bedd303da67b030d4bea4925fd71324932d6b02c7b0446f00ad33987b25b6414f24ae968e01a1a1679ce599542e82c4b07eb1f - languageName: node - linkType: hard - "sass-loader@npm:13.3.2": version: 13.3.2 resolution: "sass-loader@npm:13.3.2" @@ -17763,15 +17753,6 @@ __metadata: languageName: node linkType: hard -"truncate-utf8-bytes@npm:^1.0.0": - version: 1.0.2 - resolution: "truncate-utf8-bytes@npm:1.0.2" - dependencies: - utf8-byte-length: "npm:^1.0.1" - checksum: 10c0/af2b431fc4314f119b551e5fccfad49d4c0ef82e13ba9ca61be6567801195b08e732ce9643542e8ad1b3df44f3df2d7345b3dd34f723954b6bb43a14584d6b3c - languageName: node - linkType: hard - "ts-api-utils@npm:^1.0.1, ts-api-utils@npm:^1.3.0": version: 1.3.0 resolution: "ts-api-utils@npm:1.3.0" @@ -18436,13 +18417,6 @@ __metadata: languageName: node linkType: hard -"utf8-byte-length@npm:^1.0.1": - version: 1.0.5 - resolution: "utf8-byte-length@npm:1.0.5" - checksum: 10c0/e69bda3299608f4cc75976da9fb74ac94801a58b9ca29fdad03a20ec952e7477d7f226c12716b5f36bd4cff8151d1d152d02ee1df3752f017d4b2c725ce3e47a - languageName: node - linkType: hard - "util-deprecate@npm:^1.0.1, util-deprecate@npm:^1.0.2, util-deprecate@npm:~1.0.1": version: 1.0.2 resolution: "util-deprecate@npm:1.0.2" diff --git a/core/scripts/build-services.sh b/core/scripts/build-services.sh index 47f273640b1..b18585ea795 100755 --- a/core/scripts/build-services.sh +++ b/core/scripts/build-services.sh @@ -2,5 +2,8 @@ sbt clean dist unzip workflow-compiling-service/target/universal/workflow-compiling-service-0.1.0.zip -d target/ rm workflow-compiling-service/target/universal/workflow-compiling-service-0.1.0.zip +unzip file-service/target/universal/file-service-0.1.0.zip -d target/ +rm file-service/target/universal/file-service-0.1.0.zip + unzip amber/target/universal/texera-0.1-SNAPSHOT.zip -d amber/target/ rm amber/target/universal/texera-0.1-SNAPSHOT.zip diff --git a/core/scripts/deploy-daemon.sh b/core/scripts/deploy-daemon.sh index 0e17773f7a0..ae8f5f996fe 100755 --- a/core/scripts/deploy-daemon.sh +++ b/core/scripts/deploy-daemon.sh @@ -42,6 +42,15 @@ done echo "${green}WorkflowCompilingService launched at $(pgrep -f TexeraWorkflowCompilingService)${reset}" echo +echo "${green}Starting FileService in daemon...${reset}" +setsid nohup ./scripts/file-service.sh >/dev/null 2>&1 & +echo "${green}Waiting FileService to launch on 9092...${reset}" +while ! 
nc -z localhost 9092; do + sleep 0.1 # wait 100ms before check again +done +echo "${green}FileService launched at $(pgrep -f FileService)${reset}" +echo + echo "${green}Starting WorkflowComputingUnit in daemon...${reset}" setsid nohup ./scripts/workflow-computing-unit.sh >/dev/null 2>&1 & echo "${green}Waiting WorkflowComputingUnit to launch on 8085...${reset}" diff --git a/core/scripts/file-service.sh b/core/scripts/file-service.sh new file mode 100755 index 00000000000..c3b75743c42 --- /dev/null +++ b/core/scripts/file-service.sh @@ -0,0 +1 @@ +target/file-service-0.1.0/bin/file-service \ No newline at end of file diff --git a/core/scripts/terminate-daemon.sh b/core/scripts/terminate-daemon.sh index c4efeb8b754..306c27aa187 100755 --- a/core/scripts/terminate-daemon.sh +++ b/core/scripts/terminate-daemon.sh @@ -11,11 +11,16 @@ kill -9 $(pgrep -f WorkflowCompilingService) echo "${green}Terminated.${reset}" echo +echo "${red}Terminating FileService at $(pgrep -f FileService)...${reset}" +kill -9 $(pgrep -f FileService) +echo "${green}Terminated.${reset}" +echo + echo "${red}Terminating TexeraWebApplication at $(pgrep -f TexeraWebApplication)...${reset}" kill -9 $(pgrep -f TexeraWebApplication) echo "${green}Terminated.${reset}" echo -echo "${red}Terminating TexeraRunWorker at $(pgrep -f TexeraRunWorker)...${reset}" -kill -9 $(pgrep -f TexeraRunWorker) +echo "${red}Terminating ComputingUnitMaster at $(pgrep -f ComputingUnitMaster)...${reset}" +kill -9 $(pgrep -f ComputingUnitMaster) echo "${green}Terminated.${reset}" diff --git a/core/workflow-core/build.sbt b/core/workflow-core/build.sbt index e3a2ebb7eec..4cacebc207a 100644 --- a/core/workflow-core/build.sbt +++ b/core/workflow-core/build.sbt @@ -173,4 +173,5 @@ libraryDependencies ++= Seq( "org.eclipse.jgit" % "org.eclipse.jgit" % "5.13.0.202109080827-r", // jgit "org.yaml" % "snakeyaml" % "1.30", // yaml reader (downgrade to 1.30 due to dropwizard 1.3.23 required by amber) "org.apache.commons" % "commons-vfs2" % "2.9.0", // for FileResolver throw VFS-related exceptions + "io.lakefs" % "sdk" % "1.51.0", // for lakeFS api calls ) \ No newline at end of file diff --git a/core/workflow-core/src/main/resources/storage-config.yaml b/core/workflow-core/src/main/resources/storage-config.yaml index db01f33d900..3ca54a4969a 100644 --- a/core/workflow-core/src/main/resources/storage-config.yaml +++ b/core/workflow-core/src/main/resources/storage-config.yaml @@ -26,6 +26,25 @@ storage: num-retries: 10 min-wait-ms: 100 # 0.1s max-wait-ms: 10000 # 10s + # Configurations of the LakeFS & S3 for dataset storage; + # Default values are provided for each field, which you don't need to change them if you deployed LakeFS+S3 via docker-compose.yml in file-service/src/main/resources/docker-compose.yml + lakefs: + endpoint: "http://localhost:8000/api/v1" + auth: + api-secret: "random_string_for_lakefs" + username: "AKIAIOSFOLKFSSAMPLES" + password: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + block-storage: + type: "s3" + bucket-name: "texera-dataset" + + s3: + endpoint: "http://localhost:9000" + region: "us-west-2" + auth: + username: "texera_minio" + password: "password" + jdbc: url: "jdbc:postgresql://localhost:5432/texera_db?currentSchema=texera_db,public" username: "postgres" diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala index 18a5acd0c60..a5679ce0126 100644 --- 
a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala @@ -117,13 +117,13 @@ object FileResolver { } .toArray - // Prepend did and versionHash to the encoded path segments + // Prepend dataset name and versionHash to the encoded path segments val allPathSegments = Array( - dataset.getDid.intValue().toString, + datasetName, datasetVersion.getVersionHash ) ++ encodedFileRelativePath - // Build the the format /{did}/{versionHash}/{fileRelativePath}, both Linux and Windows use forward slash as the splitter + // Build the format /{datasetName}/{versionHash}/{fileRelativePath}, both Linux and Windows use forward slash as the splitter val uriSplitter = "/" val encodedPath = uriSplitter + allPathSegments.mkString(uriSplitter) diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/StorageConfig.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/StorageConfig.scala index c27a1d5f981..51526d00c5a 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/StorageConfig.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/StorageConfig.scala @@ -23,6 +23,12 @@ object StorageConfig { val icebergCommitMap = icebergTableMap("commit").asInstanceOf[JMap[String, Any]].asScala.toMap val icebergRetryMap = icebergCommitMap("retry").asInstanceOf[JMap[String, Any]].asScala.toMap val jdbcMap = storageMap("jdbc").asInstanceOf[JMap[String, Any]].asScala.toMap + val lakefsMap = storageMap("lakefs").asInstanceOf[JMap[String, Any]].asScala.toMap + val lakefsAuthMap = lakefsMap("auth").asInstanceOf[JMap[String, Any]].asScala.toMap + val lakefsBlockStorageMap = + lakefsMap("block-storage").asInstanceOf[JMap[String, Any]].asScala.toMap + val s3Map = storageMap("s3").asInstanceOf[JMap[String, Any]].asScala.toMap + val s3AuthMap = s3Map("auth").asInstanceOf[JMap[String, Any]].asScala.toMap javaConf.updated( "storage", @@ -44,6 +50,11 @@ object StorageConfig { ) ) .updated("jdbc", jdbcMap) + .updated( + "lakefs", + lakefsMap.updated("auth", lakefsAuthMap).updated("block-storage", lakefsBlockStorageMap) + ) + .updated("s3", s3Map.updated("auth", s3AuthMap)) ) } @@ -67,6 +78,7 @@ object StorageConfig { .asInstanceOf[Map[String, Any]]("commit-batch-size") .asInstanceOf[Int] + // Iceberg table configurations val icebergTableResultNamespace: String = conf("storage") .asInstanceOf[Map[String, Any]]("iceberg") .asInstanceOf[Map[String, Any]]("table") @@ -169,4 +181,57 @@ object StorageConfig { // File storage configurations val fileStorageDirectoryPath: Path = corePath.resolve("amber").resolve("user-resources").resolve("workflow-results") + + // LakeFS configurations + val lakefsEndpoint: String = conf("storage") + .asInstanceOf[Map[String, Any]]("lakefs") + .asInstanceOf[Map[String, Any]]("endpoint") + .asInstanceOf[String] + + val lakefsUsername: String = conf("storage") + .asInstanceOf[Map[String, Any]]("lakefs") + .asInstanceOf[Map[String, Any]]("auth") + .asInstanceOf[Map[String, Any]]("username") + .asInstanceOf[String] + + val lakefsPassword: String = conf("storage") + .asInstanceOf[Map[String, Any]]("lakefs") + .asInstanceOf[Map[String, Any]]("auth") + .asInstanceOf[Map[String, Any]]("password") + .asInstanceOf[String] + + // LakeFS Block Storage configurations + val lakefsBlockStorageType: String = conf("storage") + .asInstanceOf[Map[String, Any]]("lakefs") + .asInstanceOf[Map[String, Any]]("block-storage") + 
.asInstanceOf[Map[String, Any]]("type") + .asInstanceOf[String] + + val lakefsBlockStorageBucketName: String = conf("storage") + .asInstanceOf[Map[String, Any]]("lakefs") + .asInstanceOf[Map[String, Any]]("block-storage") + .asInstanceOf[Map[String, Any]]("bucket-name") + .asInstanceOf[String] + + val s3Endpoint: String = conf("storage") + .asInstanceOf[Map[String, Any]]("s3") + .asInstanceOf[Map[String, Any]]("endpoint") + .asInstanceOf[String] + + val s3Region: String = conf("storage") + .asInstanceOf[Map[String, Any]]("s3") + .asInstanceOf[Map[String, Any]]("region") + .asInstanceOf[String] + + val s3Username: String = conf("storage") + .asInstanceOf[Map[String, Any]]("s3") + .asInstanceOf[Map[String, Any]]("auth") + .asInstanceOf[Map[String, Any]]("username") + .asInstanceOf[String] + + val s3Password: String = conf("storage") + .asInstanceOf[Map[String, Any]]("s3") + .asInstanceOf[Map[String, Any]]("auth") + .asInstanceOf[Map[String, Any]]("password") + .asInstanceOf[String] } diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/DatasetFileDocument.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/DatasetFileDocument.scala index 0f65191e08b..59065b21845 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/DatasetFileDocument.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/DatasetFileDocument.scala @@ -1,45 +1,107 @@ package edu.uci.ics.amber.core.storage.model +import edu.uci.ics.amber.core.storage.model.DatasetFileDocument.{ + fileServiceGetPresignURLEndpoint, + userJwtToken +} +import edu.uci.ics.amber.core.storage.util.LakeFSStorageClient import edu.uci.ics.amber.core.storage.util.dataset.GitVersionControlLocalFileStorage import edu.uci.ics.amber.util.PathUtils import java.io.{File, FileOutputStream, InputStream} -import java.net.{URI, URLDecoder} +import java.net.{HttpURLConnection, URI, URL, URLDecoder, URLEncoder} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Path, Paths} import scala.jdk.CollectionConverters.IteratorHasAsScala -private[storage] class DatasetFileDocument(uri: URI) extends VirtualDocument[Nothing] { +object DatasetFileDocument { + // Since requests need to be sent to the FileService in order to read the file, we store USER_JWT_TOKEN in the environment vars + // This variable should be NON-EMPTY in the dynamic-computing-unit architecture, i.e. each user-created computing unit should store user's jwt token. + // In the local development or other architectures, this token can be empty. + lazy val userJwtToken: String = sys.env.getOrElse("USER_JWT_TOKEN", "").trim + + // The endpoint of getting presigned url from the file service, also stored in the environment vars. 
+ lazy val fileServiceGetPresignURLEndpoint: String = + sys.env + .getOrElse( + "FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT", + "http://localhost:9092/api/dataset/presign-download" + ) + .trim +} + +private[storage] class DatasetFileDocument(uri: URI) + extends VirtualDocument[Nothing] + with OnDataset { // Utility function to parse and decode URI segments into individual components - private def parseUri(uri: URI): (Int, String, Path) = { + private def parseUri(uri: URI): (String, String, Path) = { val segments = Paths.get(uri.getPath).iterator().asScala.map(_.toString).toArray if (segments.length < 3) throw new IllegalArgumentException("URI format is incorrect") - val did = segments(0).toInt + // TODO: consider whether use dataset name or did + val datasetName = segments(0) val datasetVersionHash = URLDecoder.decode(segments(1), StandardCharsets.UTF_8) val decodedRelativeSegments = segments.drop(2).map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) val fileRelativePath = Paths.get(decodedRelativeSegments.head, decodedRelativeSegments.tail: _*) - (did, datasetVersionHash, fileRelativePath) + (datasetName, datasetVersionHash, fileRelativePath) } // Extract components from URI using the utility function - private val (did, datasetVersionHash, fileRelativePath) = parseUri(uri) + private val (datasetName, datasetVersionHash, fileRelativePath) = parseUri(uri) private var tempFile: Option[File] = None override def getURI: URI = uri override def asInputStream(): InputStream = { - val datasetAbsolutePath = PathUtils.getDatasetPath(Integer.valueOf(did)) - GitVersionControlLocalFileStorage - .retrieveFileContentOfVersionAsInputStream( - datasetAbsolutePath, - datasetVersionHash, - datasetAbsolutePath.resolve(fileRelativePath) + if (userJwtToken.isEmpty) { + val presignUrl = LakeFSStorageClient.getFilePresignedUrl( + getDatasetName(), + getVersionHash(), + getFileRelativePath() ) + return new URL(presignUrl).openStream() + } + + // Step 1: Get the presigned URL from the file service + val presignRequestUrl = + s"$fileServiceGetPresignURLEndpoint?datasetName=${getDatasetName()}&commitHash=${getVersionHash()}&filePath=${URLEncoder + .encode(getFileRelativePath(), StandardCharsets.UTF_8.name())}" + + val connection = new URL(presignRequestUrl).openConnection().asInstanceOf[HttpURLConnection] + connection.setRequestMethod("GET") + connection.setRequestProperty("Authorization", s"Bearer $userJwtToken") + + try { + if (connection.getResponseCode != HttpURLConnection.HTTP_OK) { + throw new RuntimeException( + s"Failed to retrieve presigned URL: HTTP ${connection.getResponseCode}" + ) + } + + // Read response body as a string + val responseBody = + new String(connection.getInputStream.readAllBytes(), StandardCharsets.UTF_8) + + // Extract presigned URL from JSON response + val presignedUrl = responseBody + .split("\"presignedUrl\"\\s*:\\s*\"")(1) + .split("\"")(0) + + // Step 2: Fetch the file using the retrieved presigned URL + new URL(presignedUrl).openStream() + } catch { + case e: Exception => + throw new RuntimeException( + s"Failed to retrieve presigned URL from $fileServiceGetPresignURLEndpoint: ${e.getMessage}", + e + ) + } finally { + connection.disconnect() + } } override def asFile(): File = { @@ -75,8 +137,14 @@ private[storage] class DatasetFileDocument(uri: URI) extends VirtualDocument[Not } // then remove the dataset file GitVersionControlLocalFileStorage.removeFileFromRepo( - PathUtils.getDatasetPath(Integer.valueOf(did)), - 
PathUtils.getDatasetPath(Integer.valueOf(did)).resolve(fileRelativePath) + PathUtils.getDatasetPath(0), + PathUtils.getDatasetPath(0).resolve(fileRelativePath) ) } + + override def getVersionHash(): String = datasetVersionHash + + override def getDatasetName(): String = datasetName + + override def getFileRelativePath(): String = fileRelativePath.toString } diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/OnDataset.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/OnDataset.scala new file mode 100644 index 00000000000..e628f471347 --- /dev/null +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/model/OnDataset.scala @@ -0,0 +1,9 @@ +package edu.uci.ics.amber.core.storage.model + +trait OnDataset { + def getDatasetName(): String + + def getVersionHash(): String + + def getFileRelativePath(): String +} diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/util/LakeFSStorageClient.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/util/LakeFSStorageClient.scala new file mode 100644 index 00000000000..f282639483a --- /dev/null +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/util/LakeFSStorageClient.scala @@ -0,0 +1,293 @@ +package edu.uci.ics.amber.core.storage.util + +import io.lakefs.clients.sdk._ +import io.lakefs.clients.sdk.model._ + +import java.io.{File, FileOutputStream, InputStream} +import java.nio.file.Files +import scala.jdk.CollectionConverters._ +import edu.uci.ics.amber.core.storage.StorageConfig +import io.lakefs.clients.sdk.model.ResetCreation.TypeEnum + +/** + * LakeFSFileStorage provides high-level file storage operations using LakeFS, + * similar to Git operations for version control and file management. + */ +object LakeFSStorageClient { + + private lazy val apiClient: ApiClient = { + val client = new ApiClient() + client.setApiKey(StorageConfig.lakefsPassword) + client.setUsername(StorageConfig.lakefsUsername) + client.setPassword(StorageConfig.lakefsPassword) + client.setServers( + List( + new ServerConfiguration( + StorageConfig.lakefsEndpoint, + "LakeFS API server endpoint", + new java.util.HashMap[String, ServerVariable]() + ) + ).asJava + ) + client + } + private lazy val repoApi: RepositoriesApi = new RepositoriesApi(apiClient) + private lazy val objectsApi: ObjectsApi = new ObjectsApi(apiClient) + private lazy val branchesApi: BranchesApi = new BranchesApi(apiClient) + private lazy val commitsApi: CommitsApi = new CommitsApi(apiClient) + private lazy val refsApi: RefsApi = new RefsApi(apiClient) + private lazy val stagingApi: StagingApi = new StagingApi(apiClient) + private lazy val experimentalApi: ExperimentalApi = new ExperimentalApi(apiClient) + private lazy val healthCheckApi: HealthCheckApi = new HealthCheckApi(apiClient) + + private val storageNamespaceURI: String = + s"${StorageConfig.lakefsBlockStorageType}://${StorageConfig.lakefsBlockStorageBucketName}" + + private val branchName: String = "main" + + def healthCheck(): Unit = { + try { + this.healthCheckApi.healthCheck().execute() + } catch { + case e: Exception => + throw new RuntimeException(s"Failed to connect to lake fs server: ${e.getMessage}") + } + } + + /** + * Initializes a new repository in LakeFS. + * + * @param repoName Name of the repository. + * @param defaultBranch Default branch name, usually "main". 
+ */ + def initRepo( + repoName: String + ): Repository = { + val repoNamePattern = "^[a-z0-9][a-z0-9-]{2,62}$".r + + // Validate repoName + if (!repoNamePattern.matches(repoName)) { + throw new IllegalArgumentException( + s"Invalid dataset name: '$repoName'. " + + "Dataset names must be 3-63 characters long, " + + "contain only lowercase letters, numbers, and hyphens, " + + "and cannot start or end with a hyphen." + ) + } + val storageNamespace = s"$storageNamespaceURI/$repoName" + val repo = new RepositoryCreation() + .name(repoName) + .storageNamespace(storageNamespace) + .defaultBranch(branchName) + .sampleData(false) + + repoApi.createRepository(repo).execute() + } + + /** + * Writes a file to the repository (similar to Git add). + * Converts the InputStream to a temporary file for upload. + * + * @param repoName Repository name. + * @param branch Branch name. + * @param filePath Path in the repository. + * @param inputStream File content stream. + */ + def writeFileToRepo( + repoName: String, + filePath: String, + inputStream: InputStream + ): ObjectStats = { + val tempFilePath = Files.createTempFile("lakefs-upload-", ".tmp") + val tempFileStream = new FileOutputStream(tempFilePath.toFile) + val buffer = new Array[Byte](8192) + + // Create an iterator to repeatedly call inputStream.read, and direct buffered data to file + Iterator + .continually(inputStream.read(buffer)) + .takeWhile(_ != -1) + .foreach(tempFileStream.write(buffer, 0, _)) + + inputStream.close() + tempFileStream.close() + + // Upload the temporary file to LakeFS + objectsApi.uploadObject(repoName, branchName, filePath).content(tempFilePath.toFile).execute() + } + + /** + * Removes a file from the repository (similar to Git rm). + * + * @param repoName Repository name. + * @param branch Branch name. + * @param filePath Path in the repository to delete. + */ + def removeFileFromRepo(repoName: String, branch: String, filePath: String): Unit = { + objectsApi.deleteObject(repoName, branch, filePath).execute() + } + + /** + * Executes operations and creates a commit (similar to a transactional commit). + * + * @param repoName Repository name. + * @param branch Branch name. + * @param commitMessage Commit message. + * @param operations File operations to perform before committing. + */ + def withCreateVersion(repoName: String, commitMessage: String)( + operations: => Unit + ): Commit = { + operations + val commit = new CommitCreation() + .message(commitMessage) + + commitsApi.commit(repoName, branchName, commit).execute() + } + + /** + * Retrieves file content from a specific commit and path. + * + * @param repoName Repository name. + * @param commitHash Commit hash of the version. + * @param filePath Path to the file in the repository. + */ + def retrieveFileContent(repoName: String, commitHash: String, filePath: String): File = { + objectsApi.getObject(repoName, commitHash, filePath).execute() + } + + /** + * Retrieves file content from a specific commit and path. + * + * @param repoName Repository name. + * @param commitHash Commit hash of the version. + * @param filePath Path to the file in the repository. 
+ */ + def getFilePresignedUrl(repoName: String, commitHash: String, filePath: String): String = { + objectsApi.statObject(repoName, commitHash, filePath).presign(true).execute().getPhysicalAddress + } + + def getFilePresignedUploadUrl(repoName: String, filePath: String): String = { + stagingApi + .getPhysicalAddress(repoName, branchName, filePath) + .presign(true) + .execute() + .getPresignedUrl + } + + /** + */ + def initiatePresignedMultipartUploads( + repoName: String, + filePath: String, + numberOfParts: Int + ): PresignMultipartUpload = { + experimentalApi + .createPresignMultipartUpload(repoName, branchName, filePath) + .parts(numberOfParts) + .execute() + + } + + def completePresignedMultipartUploads( + repoName: String, + filePath: String, + uploadId: String, + partsList: List[(Int, String)], + physicalAddress: String + ): ObjectStats = { + val completePresignMultipartUpload: CompletePresignMultipartUpload = + new CompletePresignMultipartUpload() + + // Sort parts by part number in ascending order + val sortedParts = partsList.sortBy(_._1) + + completePresignMultipartUpload.setParts( + sortedParts + .map(part => { + val newUploadPart = new UploadPart + newUploadPart.setPartNumber(part._1) + newUploadPart.setEtag(part._2) + newUploadPart + }) + .asJava + ) + + completePresignMultipartUpload.setPhysicalAddress(physicalAddress) + + experimentalApi + .completePresignMultipartUpload(repoName, branchName, uploadId, filePath) + .completePresignMultipartUpload(completePresignMultipartUpload) + .execute() + } + + def abortPresignedMultipartUploads( + repoName: String, + filePath: String, + uploadId: String, + physicalAddress: String + ): Unit = { + val abortPresignMultipartUpload: AbortPresignMultipartUpload = new AbortPresignMultipartUpload + abortPresignMultipartUpload.setPhysicalAddress(physicalAddress) + + experimentalApi + .abortPresignMultipartUpload(repoName, branchName, uploadId, filePath) + .abortPresignMultipartUpload(abortPresignMultipartUpload) + .execute() + } + + /** + * Deletes an entire repository. + * + * @param repoName Name of the repository to delete. + */ + def deleteRepo(repoName: String): Unit = { + repoApi.deleteRepository(repoName).execute() + } + + def retrieveVersionsOfRepository(repoName: String): List[Commit] = { + refsApi + .logCommits(repoName, branchName) + .execute() + .getResults + .asScala + .toList + .sortBy(_.getCreationDate)(Ordering[java.lang.Long].reverse) // Sort in descending order + } + + def retrieveObjectsOfVersion(repoName: String, commitHash: String): List[ObjectStats] = { + objectsApi.listObjects(repoName, commitHash).execute().getResults.asScala.toList + } + + /** + * Retrieves a list of uncommitted (staged) objects in a repository branch. + * + * @param repoName Repository name. + * @return List of uncommitted object stats. 
+ */ + def retrieveUncommittedObjects(repoName: String): List[Diff] = { + branchesApi + .diffBranch(repoName, branchName) + .execute() + .getResults + .asScala + .toList + } + + def createCommit(repoName: String, branch: String, commitMessage: String): Commit = { + val commit = new CommitCreation() + .message(commitMessage) + commitsApi.commit(repoName, branch, commit).execute() + } + + def deleteObject(repoName: String, filePath: String): Unit = { + objectsApi.deleteObject(repoName, branchName, filePath).execute() + } + + def resetObjectUploadOrDeletion(repoName: String, filePath: String): Unit = { + val resetCreation: ResetCreation = new ResetCreation + resetCreation.setType(TypeEnum.OBJECT) + resetCreation.setPath(filePath) + + branchesApi.resetBranch(repoName, branchName, resetCreation).execute() + } +} diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/PathUtils.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/PathUtils.scala index f88f29d0b30..b7d34e134b9 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/PathUtils.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/PathUtils.scala @@ -37,6 +37,8 @@ object PathUtils { lazy val workflowCompilingServicePath: Path = corePath.resolve("workflow-compiling-service") + lazy val fileServicePath: Path = corePath.resolve("file-service") + private lazy val datasetsRootPath = corePath.resolve("amber").resolve("user-resources").resolve("datasets") diff --git a/core/workflow-core/src/test/scala/edu/uci/ics/amber/storage/FileResolverSpec.scala b/core/workflow-core/src/test/scala/edu/uci/ics/amber/storage/FileResolverSpec.scala index 54c833fa6af..8442a995f3e 100644 --- a/core/workflow-core/src/test/scala/edu/uci/ics/amber/storage/FileResolverSpec.scala +++ b/core/workflow-core/src/test/scala/edu/uci/ics/amber/storage/FileResolverSpec.scala @@ -91,10 +91,10 @@ class FileResolverSpec val dataset1TxtUri = FileResolver.resolve(dataset1TxtFilePath) assert( - datasetACsvUri.toString == f"${FileResolver.DATASET_FILE_URI_SCHEME}:///${testDataset.getDid}/${testDatasetVersion2.getVersionHash}/directory/a.csv" + datasetACsvUri.toString == f"${FileResolver.DATASET_FILE_URI_SCHEME}:///${testDataset.getName}/${testDatasetVersion2.getVersionHash}/directory/a.csv" ) assert( - dataset1TxtUri.toString == f"${FileResolver.DATASET_FILE_URI_SCHEME}:///${testDataset.getDid}/${testDatasetVersion1.getVersionHash}/1.txt" + dataset1TxtUri.toString == f"${FileResolver.DATASET_FILE_URI_SCHEME}:///${testDataset.getName}/${testDatasetVersion1.getVersionHash}/1.txt" ) }
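To tie the frontend pieces together, here is a minimal sketch of how a component could consume the new DatasetService.multipartUpload API added in this PR; the dataset name, file path, and notification wording are placeholders, the component is assumed to have datasetService and notificationService injected, and chunk size/concurrency are governed by the new environment flags (multipartUploadChunkSizeByte, maxNumberOfConcurrentUploadingFileChunks):

// Hypothetical consumer of the multipart upload API; "my-dataset" and "folder/data.csv" are placeholders.
this.datasetService
  .multipartUpload("my-dataset", "folder/data.csv", file)
  .pipe(untilDestroyed(this))
  .subscribe({
    next: progress => {
      // progress.status is one of "initializing" | "uploading" | "finished" | "aborted"
      console.log(`${progress.filePath}: ${progress.percentage}% (${progress.status})`);
    },
    error: (err: unknown) => this.notificationService.error("Upload failed"),
  });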