From 48a366c904213255f890ef63c1c99e7c1e7b9c35 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Mon, 17 Nov 2025 20:54:49 -0800 Subject: [PATCH 01/10] Add BIG_OBJECT type support for Java --- .../architecture/rpc/controlcommands.proto | 1 + .../RegionExecutionCoordinator.scala | 3 +- .../InitializeExecutorHandler.scala | 12 +- .../user/workflow/WorkflowResource.scala | 6 + .../texera/web/service/WorkflowService.scala | 5 + .../architecture/worker/WorkerSpec.scala | 3 +- common/workflow-core/build.sbt | 13 +- .../core/executor/OperatorExecutor.scala | 15 + .../amber/core/tuple/AttributeType.java | 3 + .../amber/core/tuple/AttributeTypeUtils.scala | 21 +- .../amber/core/tuple/BigObjectPointer.java | 76 ++++ .../org/apache/amber/util/IcebergUtil.scala | 89 +++- .../service/util/BigObjectManager.scala | 140 ++++++ .../texera/service/util/S3StorageClient.scala | 134 ++++++ .../core/tuple/AttributeTypeUtilsSpec.scala | 21 + .../apache/amber/util/IcebergUtilSpec.scala | 100 ++++- .../service/util/BigObjectManagerSpec.scala | 350 +++++++++++++++ .../service/util/S3StorageClientSpec.scala | 415 ++++++++++++++++++ .../source/scan/FileAttributeType.java | 5 +- .../source/scan/FileScanSourceOpExec.scala | 4 + .../scan/FileScanSourceOpExecSpec.scala | 163 +++++++ file-service/build.sbt | 3 - sql/texera_ddl.sql | 8 + sql/updates/16.sql | 34 ++ 24 files changed, 1592 insertions(+), 32 deletions(-) create mode 100644 common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala rename {file-service => common/workflow-core}/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala (56%) create mode 100644 common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala create mode 100644 common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala create mode 100644 sql/updates/16.sql diff --git a/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto index 41f0976314c..d596f8b0447 100644 --- a/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto @@ -256,6 +256,7 @@ message InitializeExecutorRequest { int32 totalWorkerCount = 1; core.OpExecInitInfo opExecInitInfo = 2; bool isSource = 3; + int64 executionId = 4; } message UpdateExecutorRequest { diff --git a/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index a83af49dde1..7dd319fb98d 100644 --- a/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -361,7 +361,8 @@ class RegionExecutionCoordinator( InitializeExecutorRequest( workerConfigs.length, physicalOp.opExecInitInfo, - physicalOp.isSourceOperator + physicalOp.isSourceOperator, + physicalOp.executionId.id ), asyncRPCClient.mkContext(workerId) ) diff --git 
a/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala b/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala index 32a718606cb..d4d548a7f44 100644 --- a/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala +++ b/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala @@ -42,7 +42,12 @@ trait InitializeExecutorHandler { dp.serializationManager.setOpInitialization(req) val workerIdx = VirtualIdentityUtils.getWorkerIndex(actorId) val workerCount = req.totalWorkerCount - dp.executor = req.opExecInitInfo match { + + val executionId = req.executionId.toInt + val operatorId = VirtualIdentityUtils.getPhysicalOpId(actorId).logicalOpId.id + + // Create the executor + val executor = req.opExecInitInfo match { case OpExecWithClassName(className, descString) => ExecFactory.newExecFromJavaClassName(className, descString, workerIdx, workerCount) case OpExecWithCode(code, _) => @@ -52,6 +57,11 @@ trait InitializeExecutorHandler { case OpExecInitInfo.Empty => throw new IllegalArgumentException("Empty executor initialization info") } + + // Initialize execution context on the executor instance + executor.initializeExecutionContext(executionId, operatorId) + + dp.executor = executor EmptyReturn() } diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala index 01ae898a66a..b132fc1d185 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala @@ -36,6 +36,7 @@ import org.apache.texera.dao.jooq.generated.tables.daos.{ WorkflowUserAccessDao } import org.apache.texera.dao.jooq.generated.tables.pojos._ +import org.apache.texera.service.util.BigObjectManager import org.apache.texera.web.resource.dashboard.hub.EntityType import org.apache.texera.web.resource.dashboard.hub.HubResource.recordCloneAction import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowAccessResource.hasReadAccess @@ -600,6 +601,11 @@ class WorkflowResource extends LazyLogging { .asScala .toList + // Delete big objects + eids.foreach { eid => + BigObjectManager.delete(eid.toInt) + } + // Collect all URIs related to executions for cleanup val uris = eids.flatMap { eid => val executionId = ExecutionIdentity(eid.longValue()) diff --git a/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala b/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala index 01c66fb4589..35dd9fde27a 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala @@ -46,6 +46,7 @@ import org.apache.amber.engine.architecture.worker.WorkflowWorker.{ } import org.apache.amber.error.ErrorUtils.{getOperatorFromActorIdOpt, getStackTraceWithAllCauses} import org.apache.texera.dao.jooq.generated.tables.pojos.User +import org.apache.texera.service.util.BigObjectManager import org.apache.texera.web.model.websocket.event.TexeraWebSocketEvent import org.apache.texera.web.model.websocket.request.WorkflowExecuteRequest import 
org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutionsResource @@ -307,6 +308,7 @@ class WorkflowService( * 2. Clears URI references from the execution registry * 3. Safely clears all result and console message documents * 4. Expires Iceberg snapshots for runtime statistics + * 5. Deletes big objects from MinIO * * @param eid The execution identity to clean up resources for */ @@ -343,6 +345,9 @@ class WorkflowService( logger.debug(s"Error processing document at $uri: ${error.getMessage}") } } + + // Delete big objects + BigObjectManager.delete(eid.id.toInt) } } diff --git a/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala b/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala index 1bc3a160783..895f06c1371 100644 --- a/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala +++ b/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala @@ -194,7 +194,8 @@ class WorkerSpec InitializeExecutorRequest( 1, OpExecWithClassName("org.apache.amber.engine.architecture.worker.DummyOperatorExecutor"), - isSource = false + isSource = false, + 1 ), AsyncRPCContext(CONTROLLER, identifier1), 4 diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index 82a79e8e04b..ab6b8f27c65 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -83,11 +83,15 @@ Test / PB.protoSources += PB.externalSourcePath.value // Test-related Dependencies ///////////////////////////////////////////////////////////////////////////// +val testcontainersVersion = "0.43.0" + libraryDependencies ++= Seq( "org.scalamock" %% "scalamock" % "5.2.0" % Test, // ScalaMock "org.scalatest" %% "scalatest" % "3.2.15" % Test, // ScalaTest "junit" % "junit" % "4.13.2" % Test, // JUnit - "com.novocode" % "junit-interface" % "0.11" % Test // SBT interface for JUnit + "com.novocode" % "junit-interface" % "0.11" % Test, // SBT interface for JUnit + "com.dimafeng" %% "testcontainers-scala-scalatest" % testcontainersVersion % Test, // Testcontainers ScalaTest integration + "com.dimafeng" %% "testcontainers-scala-minio" % testcontainersVersion % Test // MinIO Testcontainer Scala integration ) @@ -183,5 +187,10 @@ libraryDependencies ++= Seq( "org.apache.commons" % "commons-vfs2" % "2.9.0", // for FileResolver throw VFS-related exceptions "io.lakefs" % "sdk" % "1.51.0", // for lakeFS api calls "com.typesafe" % "config" % "1.4.3", // config reader - "org.apache.commons" % "commons-jcs3-core" % "3.2" // Apache Commons JCS + "org.apache.commons" % "commons-jcs3-core" % "3.2", // Apache Commons JCS + "software.amazon.awssdk" % "s3" % "2.29.51" excludeAll( + ExclusionRule(organization = "io.netty") + ), + "software.amazon.awssdk" % "auth" % "2.29.51", + "software.amazon.awssdk" % "regions" % "2.29.51", ) \ No newline at end of file diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala index 69e62a8f308..48b66e661fc 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala @@ -25,6 +25,21 @@ import org.apache.amber.core.workflow.PortIdentity trait OperatorExecutor { + // Execution context + private var _executionId: Option[Int] = None + private var _operatorId: Option[String] = None + + protected def 
executionId: Int = + _executionId.getOrElse(throw new IllegalStateException("Execution context not initialized")) + + protected def operatorId: String = + _operatorId.getOrElse(throw new IllegalStateException("Execution context not initialized")) + + final def initializeExecutionContext(execId: Int, opId: String): Unit = { + _executionId = Some(execId) + _operatorId = Some(opId) + } + def open(): Unit = {} def produceStateOnStart(port: Int): Option[State] = None diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java index 472679f5275..93aadf31a99 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java @@ -70,6 +70,7 @@ public enum AttributeType implements Serializable { BOOLEAN("boolean", Boolean.class), TIMESTAMP("timestamp", Timestamp.class), BINARY("binary", byte[].class), + BIG_OBJECT("big_object", BigObjectPointer.class), ANY("ANY", Object.class); private final String name; @@ -109,6 +110,8 @@ public static AttributeType getAttributeType(Class fieldClass) { return TIMESTAMP; } else if (fieldClass.equals(byte[].class)) { return BINARY; + } else if (fieldClass.equals(BigObjectPointer.class)) { + return BIG_OBJECT; } else { return ANY; } diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index e4fdcb4611d..0efeea960f0 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -121,14 +121,15 @@ object AttributeTypeUtils extends Serializable { ): Any = { if (field == null) return null attributeType match { - case AttributeType.INTEGER => parseInteger(field, force) - case AttributeType.LONG => parseLong(field, force) - case AttributeType.DOUBLE => parseDouble(field) - case AttributeType.BOOLEAN => parseBoolean(field) - case AttributeType.TIMESTAMP => parseTimestamp(field) - case AttributeType.STRING => field.toString - case AttributeType.BINARY => field - case AttributeType.ANY | _ => field + case AttributeType.INTEGER => parseInteger(field, force) + case AttributeType.LONG => parseLong(field, force) + case AttributeType.DOUBLE => parseDouble(field) + case AttributeType.BOOLEAN => parseBoolean(field) + case AttributeType.TIMESTAMP => parseTimestamp(field) + case AttributeType.STRING => field.toString + case AttributeType.BINARY => field + case AttributeType.BIG_OBJECT => new BigObjectPointer(field.toString) + case AttributeType.ANY | _ => field } } @@ -383,7 +384,9 @@ object AttributeTypeUtils extends Serializable { case AttributeType.INTEGER => tryParseInteger(fieldValue) case AttributeType.TIMESTAMP => tryParseTimestamp(fieldValue) case AttributeType.BINARY => tryParseString() - case _ => tryParseString() + case AttributeType.BIG_OBJECT => + AttributeType.BIG_OBJECT // Big objects are never inferred from data + case _ => tryParseString() } } diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java new file mode 100644 index 00000000000..b938561f4d1 --- /dev/null +++ 
b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.amber.core.tuple; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonValue; + +import java.net.URI; +import java.util.Objects; + +/** + * BigObjectPointer represents a pointer to a large object stored in S3. + * The pointer is formatted as a URI: s3://bucket/path/to/object + */ +public class BigObjectPointer { + + private final String uri; + + @JsonCreator + public BigObjectPointer(@JsonProperty("uri") String uri) { + if (uri == null || !uri.startsWith("s3://")) { + throw new IllegalArgumentException("BigObjectPointer URI must start with 's3://' but was: " + uri); + } + this.uri = uri; + } + + @JsonValue + public String getUri() { + return uri; + } + + public String getBucketName() { + return URI.create(uri).getHost(); + } + + public String getObjectKey() { + String path = URI.create(uri).getPath(); + return path.startsWith("/") ? path.substring(1) : path; + } + + @Override + public String toString() { + return uri; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof BigObjectPointer)) return false; + BigObjectPointer that = (BigObjectPointer) obj; + return Objects.equals(uri, that.uri); + } + + @Override + public int hashCode() { + return Objects.hash(uri); + } +} diff --git a/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala index bc171396418..438d6c30c37 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala @@ -20,7 +20,7 @@ package org.apache.amber.util import org.apache.amber.config.StorageConfig -import org.apache.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.amber.core.tuple.{Attribute, AttributeType, BigObjectPointer, Schema, Tuple} import org.apache.hadoop.conf.Configuration import org.apache.iceberg.catalog.{Catalog, TableIdentifier} import org.apache.iceberg.data.parquet.GenericParquetReaders @@ -52,6 +52,9 @@ import scala.jdk.CollectionConverters._ */ object IcebergUtil { + // Unique suffix for BIG_OBJECT field encoding + private val BIG_OBJECT_FIELD_SUFFIX = "__texera_big_obj_ptr" + /** * Creates and initializes a HadoopCatalog with the given parameters. 
* - Uses an empty Hadoop `Configuration`, meaning the local file system (or `file:/`) will be used by default @@ -200,6 +203,7 @@ object IcebergUtil { /** * Converts a custom Amber `Schema` to an Iceberg `Schema`. + * Field names are encoded to preserve BIG_OBJECT type information. * * @param amberSchema The custom Amber Schema. * @return An Iceberg Schema. @@ -207,13 +211,16 @@ object IcebergUtil { def toIcebergSchema(amberSchema: Schema): IcebergSchema = { val icebergFields = amberSchema.getAttributes.zipWithIndex.map { case (attribute, index) => - Types.NestedField.optional(index + 1, attribute.getName, toIcebergType(attribute.getType)) + val encodedName = encodeBigObjectFieldName(attribute.getName, attribute.getType) + val icebergType = toIcebergType(attribute.getType) + Types.NestedField.optional(index + 1, encodedName, icebergType) } new IcebergSchema(icebergFields.asJava) } /** * Converts a custom Amber `AttributeType` to an Iceberg `Type`. + * Note: BIG_OBJECT is stored as StringType; field name encoding is used to distinguish it. * * @param attributeType The custom Amber AttributeType. * @return The corresponding Iceberg Type. @@ -227,6 +234,8 @@ object IcebergUtil { case AttributeType.BOOLEAN => Types.BooleanType.get() case AttributeType.TIMESTAMP => Types.TimestampType.withoutZone() case AttributeType.BINARY => Types.BinaryType.get() + case AttributeType.BIG_OBJECT => + Types.StringType.get() // Store BigObjectPointer URI as string case AttributeType.ANY => throw new IllegalArgumentException("ANY type is not supported in Iceberg") } @@ -243,13 +252,15 @@ object IcebergUtil { tuple.schema.getAttributes.zipWithIndex.foreach { case (attribute, index) => + val fieldName = encodeBigObjectFieldName(attribute.getName, attribute.getType) val value = tuple.getField[AnyRef](index) match { - case null => null - case ts: Timestamp => ts.toInstant.atZone(ZoneId.systemDefault()).toLocalDateTime - case bytes: Array[Byte] => ByteBuffer.wrap(bytes) - case other => other + case null => null + case ts: Timestamp => ts.toInstant.atZone(ZoneId.systemDefault()).toLocalDateTime + case bytes: Array[Byte] => ByteBuffer.wrap(bytes) + case bigObjPtr: BigObjectPointer => bigObjPtr.getUri + case other => other } - record.setField(attribute.getName, value) + record.setField(fieldName, value) } record @@ -264,23 +275,69 @@ object IcebergUtil { */ def fromRecord(record: Record, amberSchema: Schema): Tuple = { val fieldValues = amberSchema.getAttributes.map { attribute => - val value = record.getField(attribute.getName) match { + val fieldName = encodeBigObjectFieldName(attribute.getName, attribute.getType) + val rawValue = record.getField(fieldName) + + rawValue match { case null => null case ldt: LocalDateTime => Timestamp.valueOf(ldt) case buffer: ByteBuffer => val bytes = new Array[Byte](buffer.remaining()) buffer.get(bytes) bytes + case uri: String if attribute.getType == AttributeType.BIG_OBJECT => + new BigObjectPointer(uri) case other => other } - value } Tuple(amberSchema, fieldValues.toArray) } + /** + * Encodes a field name for BIG_OBJECT types by adding a unique system suffix. + * This ensures BIG_OBJECT fields can be identified when reading from Iceberg. 
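+   * For example, a BIG_OBJECT attribute named "large_data" (name illustrative) is
+   * stored in Iceberg as "large_data__texera_big_obj_ptr", while attributes of all
+   * other types keep their original names.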
+ * + * @param fieldName The original field name + * @param attributeType The attribute type + * @return The encoded field name with a unique suffix for BIG_OBJECT types + */ + private def encodeBigObjectFieldName(fieldName: String, attributeType: AttributeType): String = { + if (attributeType == AttributeType.BIG_OBJECT) { + s"${fieldName}${BIG_OBJECT_FIELD_SUFFIX}" + } else { + fieldName + } + } + + /** + * Decodes a field name by removing the unique system suffix if present. + * This restores the original user-defined field name. + * + * @param fieldName The encoded field name + * @return The original field name with system suffix removed + */ + private def decodeBigObjectFieldName(fieldName: String): String = { + if (isBigObjectField(fieldName)) { + fieldName.substring(0, fieldName.length - BIG_OBJECT_FIELD_SUFFIX.length) + } else { + fieldName + } + } + + /** + * Checks if a field name indicates a BIG_OBJECT type by examining the unique suffix. + * + * @param fieldName The field name to check + * @return true if the field represents a BIG_OBJECT type, false otherwise + */ + private def isBigObjectField(fieldName: String): Boolean = { + fieldName.endsWith(BIG_OBJECT_FIELD_SUFFIX) + } + /** * Converts an Iceberg `Schema` to an Amber `Schema`. + * Field names are decoded to restore original names and detect BIG_OBJECT types. * * @param icebergSchema The Iceberg Schema. * @return The corresponding Amber Schema. @@ -290,7 +347,10 @@ object IcebergUtil { .columns() .asScala .map { field => - new Attribute(field.name(), fromIcebergType(field.`type`().asPrimitiveType())) + val fieldName = field.name() + val attributeType = fromIcebergType(field.`type`().asPrimitiveType(), fieldName) + val originalName = decodeBigObjectFieldName(fieldName) + new Attribute(originalName, attributeType) } .toList @@ -301,11 +361,16 @@ object IcebergUtil { * Converts an Iceberg `Type` to an Amber `AttributeType`. * * @param icebergType The Iceberg Type. + * @param fieldName The field name (used to detect BIG_OBJECT by suffix). * @return The corresponding Amber AttributeType. */ - def fromIcebergType(icebergType: PrimitiveType): AttributeType = { + def fromIcebergType( + icebergType: PrimitiveType, + fieldName: String = "" + ): AttributeType = { icebergType match { - case _: Types.StringType => AttributeType.STRING + case _: Types.StringType => + if (isBigObjectField(fieldName)) AttributeType.BIG_OBJECT else AttributeType.STRING case _: Types.IntegerType => AttributeType.INTEGER case _: Types.LongType => AttributeType.LONG case _: Types.DoubleType => AttributeType.DOUBLE diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala new file mode 100644 index 00000000000..cf240b1e1c0 --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import com.typesafe.scalalogging.LazyLogging +import org.apache.amber.core.tuple.BigObjectPointer +import org.apache.texera.dao.SqlServer +import org.apache.texera.dao.jooq.generated.Tables.BIG_OBJECT + +import java.io.{Closeable, InputStream} +import java.util.UUID +import scala.jdk.CollectionConverters._ + +/** + * Stream for reading big objects from S3. + * All read methods guarantee to read the full requested amount (or until EOF). + */ +class BigObjectStream(private val inputStream: InputStream) extends Closeable { + + @volatile private var closed = false + + private def ensureOpen(): Unit = + if (closed) throw new IllegalStateException("Stream is closed") + + /** Reads all remaining bytes. */ + def read(): Array[Byte] = { + ensureOpen() + val out = new java.io.ByteArrayOutputStream() + val chunk = new Array[Byte](8192) + var n = inputStream.read(chunk) + while (n != -1) { + out.write(chunk, 0, n) + n = inputStream.read(chunk) + } + out.toByteArray + } + + /** Reads exactly `len` bytes (or until EOF). */ + def read(len: Int): Array[Byte] = { + ensureOpen() + if (len <= 0) return Array.emptyByteArray + + val buffer = new Array[Byte](len) + var total = 0 + while (total < len) { + val n = inputStream.read(buffer, total, len - total) + if (n == -1) return if (total == 0) Array.emptyByteArray else buffer.take(total) + total += n + } + buffer + } + + override def close(): Unit = if (!closed) { closed = true; inputStream.close() } + def isClosed: Boolean = closed +} + +/** Manages the lifecycle of large objects (>2GB) stored in S3. */ +object BigObjectManager extends LazyLogging { + private val DEFAULT_BUCKET = "texera-big-objects" + private lazy val db = SqlServer.getInstance().createDSLContext() + + /** Creates a big object from InputStream, uploads to S3, and registers in database. */ + def create(stream: InputStream, executionId: Int, operatorId: String): BigObjectPointer = { + + S3StorageClient.createBucketIfNotExist(DEFAULT_BUCKET) + + val objectKey = s"${System.currentTimeMillis()}/${UUID.randomUUID()}" + val uri = s"s3://$DEFAULT_BUCKET/$objectKey" + + S3StorageClient.uploadObject(DEFAULT_BUCKET, objectKey, stream) + + try { + db.insertInto(BIG_OBJECT) + .columns(BIG_OBJECT.EXECUTION_ID, BIG_OBJECT.OPERATOR_ID, BIG_OBJECT.URI) + .values(Int.box(executionId), operatorId, uri) + .execute() + logger.debug(s"Created big object: eid=$executionId, opid=$operatorId, uri=$uri") + } catch { + case e: Exception => + logger.error(s"Failed to register, cleaning up: $uri", e) + try S3StorageClient.deleteObject(DEFAULT_BUCKET, objectKey) + catch { case _: Exception => } + throw new RuntimeException(s"Failed to create big object: ${e.getMessage}", e) + } + + new BigObjectPointer(uri) + } + + /** Opens a big object for reading. 
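+   * The caller is responsible for closing the returned stream. A minimal usage
+   * sketch (the pointer URI is illustrative):
+   * {{{
+   *   val stream = BigObjectManager.open(new BigObjectPointer("s3://texera-big-objects/123/abc"))
+   *   try stream.read() finally stream.close()
+   * }}}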
*/ + def open(ptr: BigObjectPointer): BigObjectStream = { + require( + S3StorageClient.objectExists(ptr.getBucketName, ptr.getObjectKey), + s"Big object does not exist: ${ptr.getUri}" + ) + new BigObjectStream(S3StorageClient.downloadObject(ptr.getBucketName, ptr.getObjectKey)) + } + + /** Deletes all big objects associated with an execution ID. */ + def delete(executionId: Int): Unit = { + val uris = db + .select(BIG_OBJECT.URI) + .from(BIG_OBJECT) + .where(BIG_OBJECT.EXECUTION_ID.eq(executionId)) + .fetchInto(classOf[String]) + .asScala + .toList + + if (uris.isEmpty) return logger.debug(s"No big objects for execution $executionId") + + logger.info(s"Deleting ${uris.size} big object(s) for execution $executionId") + + uris.foreach { uri => + try { + val ptr = new BigObjectPointer(uri) + S3StorageClient.deleteObject(ptr.getBucketName, ptr.getObjectKey) + } catch { + case e: Exception => logger.error(s"Failed to delete: $uri", e) + } + } + + db.deleteFrom(BIG_OBJECT).where(BIG_OBJECT.EXECUTION_ID.eq(executionId)).execute() + } +} diff --git a/file-service/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala similarity index 56% rename from file-service/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala rename to common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala index 7c157cc0aeb..3c533477052 100644 --- a/file-service/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala @@ -24,7 +24,9 @@ import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCrede import software.amazon.awssdk.regions.Region import software.amazon.awssdk.services.s3.model._ import software.amazon.awssdk.services.s3.{S3Client, S3Configuration} +import software.amazon.awssdk.core.sync.RequestBody +import java.io.InputStream import java.security.MessageDigest import scala.jdk.CollectionConverters._ @@ -139,4 +141,136 @@ object S3StorageClient { s3Client.deleteObjects(deleteObjectsRequest) } } + + /** + * Uploads an object to S3 using multipart upload. + * Handles streams of any size without loading into memory. 
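+   * Non-empty streams are read and uploaded in parts of up to MINIMUM_NUM_OF_MULTIPART_S3_PART
+   * bytes; an empty stream falls back to a single PutObject call. A minimal usage
+   * sketch (bucket and key names are illustrative):
+   * {{{
+   *   val eTag = S3StorageClient.uploadObject("my-bucket", "path/to/object.bin", inputStream)
+   * }}}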
+ */ + def uploadObject(bucketName: String, objectKey: String, inputStream: InputStream): String = { + val buffer = new Array[Byte](MINIMUM_NUM_OF_MULTIPART_S3_PART.toInt) + + // Helper to read a full buffer from input stream + def readChunk(): Int = { + var offset = 0 + var read = 0 + while ( + offset < buffer.length && { + read = inputStream.read(buffer, offset, buffer.length - offset); read > 0 + } + ) { + offset += read + } + offset + } + + // Read first chunk to check if stream is empty + val firstChunkSize = readChunk() + if (firstChunkSize == 0) { + return s3Client + .putObject( + PutObjectRequest.builder().bucket(bucketName).key(objectKey).build(), + RequestBody.fromBytes(Array.empty[Byte]) + ) + .eTag() + } + + val uploadId = s3Client + .createMultipartUpload( + CreateMultipartUploadRequest.builder().bucket(bucketName).key(objectKey).build() + ) + .uploadId() + + try { + // Upload all parts using an iterator + val allParts = Iterator + .iterate((1, firstChunkSize)) { case (partNum, _) => (partNum + 1, readChunk()) } + .takeWhile { case (_, size) => size > 0 } + .map { + case (partNumber, chunkSize) => + val eTag = s3Client + .uploadPart( + UploadPartRequest + .builder() + .bucket(bucketName) + .key(objectKey) + .uploadId(uploadId) + .partNumber(partNumber) + .build(), + RequestBody.fromBytes(buffer.take(chunkSize)) + ) + .eTag() + CompletedPart.builder().partNumber(partNumber).eTag(eTag).build() + } + .toList + + s3Client + .completeMultipartUpload( + CompleteMultipartUploadRequest + .builder() + .bucket(bucketName) + .key(objectKey) + .uploadId(uploadId) + .multipartUpload(CompletedMultipartUpload.builder().parts(allParts.asJava).build()) + .build() + ) + .eTag() + + } catch { + case e: Exception => + try { + s3Client.abortMultipartUpload( + AbortMultipartUploadRequest + .builder() + .bucket(bucketName) + .key(objectKey) + .uploadId(uploadId) + .build() + ) + } catch { case _: Exception => } + throw e + } + } + + /** + * Downloads an object from S3 as an InputStream. + * + * @param bucketName The S3 bucket name. + * @param objectKey The object key (path) in S3. + * @return An InputStream containing the object data. + */ + def downloadObject(bucketName: String, objectKey: String): InputStream = { + s3Client.getObject( + GetObjectRequest.builder().bucket(bucketName).key(objectKey).build() + ) + } + + /** + * Checks if an object exists in S3. + * + * @param bucketName The S3 bucket name. + * @param objectKey The object key (path) in S3. + * @return True if the object exists, false otherwise. + */ + def objectExists(bucketName: String, objectKey: String): Boolean = { + try { + s3Client.headObject( + HeadObjectRequest.builder().bucket(bucketName).key(objectKey).build() + ) + true + } catch { + case _: Exception => false + } + } + + /** + * Deletes a single object from S3. + * + * @param bucketName The S3 bucket name. + * @param objectKey The object key (path) in S3. 
+ */ + def deleteObject(bucketName: String, objectKey: String): Unit = { + s3Client.deleteObject( + DeleteObjectRequest.builder().bucket(bucketName).key(objectKey).build() + ) + } } diff --git a/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala b/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala index 24c998b3f7c..81726656750 100644 --- a/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala @@ -190,4 +190,25 @@ class AttributeTypeUtilsSpec extends AnyFunSuite { assert(parseField("anything", AttributeType.ANY) == "anything") } + test("parseField correctly parses to BIG_OBJECT") { + // Valid S3 URI strings are converted to BigObjectPointer + val pointer1 = parseField("s3://bucket/path/to/object", AttributeType.BIG_OBJECT) + .asInstanceOf[BigObjectPointer] + assert(pointer1.getUri == "s3://bucket/path/to/object") + assert(pointer1.getBucketName == "bucket") + assert(pointer1.getObjectKey == "path/to/object") + + // Null input returns null + assert(parseField(null, AttributeType.BIG_OBJECT) == null) + } + + test("BIG_OBJECT type is preserved but never inferred from data") { + // BIG_OBJECT remains BIG_OBJECT when passed as typeSoFar + assert(inferField(AttributeType.BIG_OBJECT, "any-value") == AttributeType.BIG_OBJECT) + assert(inferField(AttributeType.BIG_OBJECT, null) == AttributeType.BIG_OBJECT) + + // String data is inferred as STRING, never BIG_OBJECT + assert(inferField("s3://bucket/path") == AttributeType.STRING) + } + } diff --git a/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala b/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala index ee7744e8154..608d55dbec0 100644 --- a/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala @@ -19,7 +19,7 @@ package org.apache.amber.util -import org.apache.amber.core.tuple.{AttributeType, Schema, Tuple} +import org.apache.amber.core.tuple.{AttributeType, BigObjectPointer, Schema, Tuple} import org.apache.amber.util.IcebergUtil.toIcebergSchema import org.apache.iceberg.data.GenericRecord import org.apache.iceberg.types.Types @@ -199,4 +199,102 @@ class IcebergUtilSpec extends AnyFlatSpec { assert(tuple.getField[String]("test-6") == "hello world") assert(tuple.getField[Array[Byte]]("test-7") sameElements Array[Byte](1, 2, 3, 4)) } + + // BIG_OBJECT type tests + + it should "convert BIG_OBJECT type correctly between Texera and Iceberg" in { + // BIG_OBJECT stored as StringType with field name suffix + assert(IcebergUtil.toIcebergType(AttributeType.BIG_OBJECT) == Types.StringType.get()) + assert(IcebergUtil.fromIcebergType(Types.StringType.get(), "field") == AttributeType.STRING) + assert( + IcebergUtil.fromIcebergType( + Types.StringType.get(), + "field__texera_big_obj_ptr" + ) == AttributeType.BIG_OBJECT + ) + } + + it should "convert schemas with BIG_OBJECT fields correctly" in { + val texeraSchema = Schema() + .add("id", AttributeType.INTEGER) + .add("large_data", AttributeType.BIG_OBJECT) + + val icebergSchema = IcebergUtil.toIcebergSchema(texeraSchema) + + // BIG_OBJECT field gets encoded name with suffix + assert(icebergSchema.findField("large_data__texera_big_obj_ptr") != null) + assert( + 
icebergSchema.findField("large_data__texera_big_obj_ptr").`type`() == Types.StringType.get() + ) + + // Round-trip preserves schema + val roundTripSchema = IcebergUtil.fromIcebergSchema(icebergSchema) + assert(roundTripSchema.getAttribute("large_data").getType == AttributeType.BIG_OBJECT) + } + + it should "convert tuples with BIG_OBJECT to records and back correctly" in { + val schema = Schema() + .add("id", AttributeType.INTEGER) + .add("large_data", AttributeType.BIG_OBJECT) + + val tuple = Tuple + .builder(schema) + .addSequentially(Array(Int.box(42), new BigObjectPointer("s3://bucket/object/key.data"))) + .build() + + val record = IcebergUtil.toGenericRecord(toIcebergSchema(schema), tuple) + + // BIG_OBJECT stored as URI string with encoded field name + assert(record.getField("id") == 42) + assert(record.getField("large_data__texera_big_obj_ptr") == "s3://bucket/object/key.data") + + // Round-trip preserves data + val roundTripTuple = IcebergUtil.fromRecord(record, schema) + assert(roundTripTuple == tuple) + + // BigObjectPointer properties are accessible + val bigObj = roundTripTuple.getField[BigObjectPointer]("large_data") + assert(bigObj.getUri == "s3://bucket/object/key.data") + assert(bigObj.getBucketName == "bucket") + assert(bigObj.getObjectKey == "object/key.data") + } + + it should "handle null BIG_OBJECT values correctly" in { + val schema = Schema().add("data", AttributeType.BIG_OBJECT) + + val tupleWithNull = Tuple.builder(schema).addSequentially(Array(null)).build() + val record = IcebergUtil.toGenericRecord(toIcebergSchema(schema), tupleWithNull) + + assert(record.getField("data__texera_big_obj_ptr") == null) + assert(IcebergUtil.fromRecord(record, schema) == tupleWithNull) + } + + it should "handle multiple BIG_OBJECT fields and mixed types correctly" in { + val schema = Schema() + .add("int_field", AttributeType.INTEGER) + .add("big_obj_1", AttributeType.BIG_OBJECT) + .add("string_field", AttributeType.STRING) + .add("big_obj_2", AttributeType.BIG_OBJECT) + + val tuple = Tuple + .builder(schema) + .addSequentially( + Array( + Int.box(123), + new BigObjectPointer("s3://bucket1/file1.dat"), + "normal string", + null // null BIG_OBJECT + ) + ) + .build() + + val record = IcebergUtil.toGenericRecord(toIcebergSchema(schema), tuple) + + assert(record.getField("int_field") == 123) + assert(record.getField("big_obj_1__texera_big_obj_ptr") == "s3://bucket1/file1.dat") + assert(record.getField("string_field") == "normal string") + assert(record.getField("big_obj_2__texera_big_obj_ptr") == null) + + assert(IcebergUtil.fromRecord(record, schema) == tuple) + } } diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala new file mode 100644 index 00000000000..a585f228bcc --- /dev/null +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import com.dimafeng.testcontainers.{ForAllTestContainer, MinIOContainer} +import org.apache.amber.config.StorageConfig +import org.apache.amber.core.tuple.BigObjectPointer +import org.apache.texera.dao.MockTexeraDB +import org.apache.texera.dao.jooq.generated.Tables._ +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} +import org.scalatest.funsuite.AnyFunSuite +import org.testcontainers.utility.DockerImageName + +import java.io.ByteArrayInputStream + +class BigObjectManagerSpec + extends AnyFunSuite + with MockTexeraDB + with ForAllTestContainer + with BeforeAndAfterAll + with BeforeAndAfterEach { + + // MinIO container for S3-compatible storage + override val container: MinIOContainer = MinIOContainer( + dockerImageName = DockerImageName.parse("minio/minio:RELEASE.2025-02-28T09-55-16Z"), + userName = "texera_minio", + password = "password" + ) + + // Initialize database and configure storage after container starts + override def afterStart(): Unit = { + super.afterStart() + + // Initialize the embedded database + initializeDBAndReplaceDSLContext() + + // Configure storage to use MinIO container + // Only s3Endpoint is a var, so we can only override that + StorageConfig.s3Endpoint = s"http://${container.host}:${container.mappedPort(9000)}" + } + + override def afterAll(): Unit = { + shutdownDB() + super.afterAll() + } + + override def beforeEach(): Unit = { + getDSLContext.deleteFrom(BIG_OBJECT).execute() + } + + /** Creates mock workflow execution records needed for foreign key constraints. */ + private def createMockExecution(executionId: Int): Unit = { + val dsl = getDSLContext + val id = Int.box(executionId) + + dsl + .insertInto(USER) + .columns(USER.UID, USER.NAME, USER.EMAIL, USER.PASSWORD) + .values(id, s"test_user_$executionId", s"test$executionId@test.com", "password") + .onConflictDoNothing() + .execute() + + dsl + .insertInto(WORKFLOW) + .columns(WORKFLOW.WID, WORKFLOW.NAME, WORKFLOW.CONTENT) + .values(id, s"test_workflow_$executionId", "{}") + .onConflictDoNothing() + .execute() + + dsl + .insertInto(WORKFLOW_OF_USER) + .columns(WORKFLOW_OF_USER.UID, WORKFLOW_OF_USER.WID) + .values(id, id) + .onConflictDoNothing() + .execute() + + dsl + .insertInto(WORKFLOW_VERSION) + .columns(WORKFLOW_VERSION.VID, WORKFLOW_VERSION.WID, WORKFLOW_VERSION.CONTENT) + .values(id, id, "{}") + .onConflictDoNothing() + .execute() + + dsl + .insertInto(WORKFLOW_EXECUTIONS) + .columns( + WORKFLOW_EXECUTIONS.EID, + WORKFLOW_EXECUTIONS.VID, + WORKFLOW_EXECUTIONS.UID, + WORKFLOW_EXECUTIONS.STATUS, + WORKFLOW_EXECUTIONS.ENVIRONMENT_VERSION + ) + .values(id, id, id, Short.box(1.toShort), "test") + .onConflictDoNothing() + .execute() + } + + /** Creates a big object from string data and returns its pointer. */ + private def createBigObject( + data: String, + execId: Int, + opId: String = "test-op" + ): BigObjectPointer = { + createMockExecution(execId) + BigObjectManager.create(new ByteArrayInputStream(data.getBytes), execId, opId) + } + + /** Creates a BigObjectStream from test data. 
*/ + private def createStream(data: String): BigObjectStream = + new BigObjectStream(new ByteArrayInputStream(data.getBytes)) + + /** Verifies that an object exists in S3. */ + private def assertObjectExists(pointer: BigObjectPointer, shouldExist: Boolean = true): Unit = { + val exists = S3StorageClient.objectExists(pointer.getBucketName, pointer.getObjectKey) + assert(exists == shouldExist) + } + + /** Verifies standard bucket name. */ + private def assertStandardBucket(pointer: BigObjectPointer): Unit = { + assert(pointer.getBucketName == "texera-big-objects") + assert(pointer.getUri.startsWith("s3://texera-big-objects/")) + } + + // ======================================== + // BigObjectStream Tests + // ======================================== + + test("BigObjectStream should read all bytes from stream") { + val data = "Hello, World! This is a test." + val stream = createStream(data) + + assert(stream.read().sameElements(data.getBytes)) + stream.close() + } + + test("BigObjectStream should read exact number of bytes") { + val stream = createStream("0123456789ABCDEF") + val result = stream.read(10) + + assert(result.length == 10) + assert(result.sameElements("0123456789".getBytes)) + stream.close() + } + + test("BigObjectStream should handle reading more bytes than available") { + val data = "Short" + val stream = createStream(data) + val result = stream.read(100) + + assert(result.length == data.length) + assert(result.sameElements(data.getBytes)) + stream.close() + } + + test("BigObjectStream should return empty array for 0 or negative byte reads") { + val stream = createStream("Test") + assert(stream.read(0).isEmpty) + assert(stream.read(-5).isEmpty) + stream.close() + } + + test("BigObjectStream should return empty array at EOF") { + val stream = createStream("EOF") + stream.read() // Read all data + assert(stream.read(10).isEmpty) + stream.close() + } + + test("BigObjectStream should track closed state correctly") { + val stream = createStream("test") + assert(!stream.isClosed) + stream.close() + assert(stream.isClosed) + } + + test("BigObjectStream should throw exception when reading from closed stream") { + val stream = createStream("test") + stream.close() + + assertThrows[IllegalStateException](stream.read()) + assertThrows[IllegalStateException](stream.read(10)) + } + + test("BigObjectStream should handle multiple close calls") { + val stream = createStream("test") + stream.close() + stream.close() // Should not throw + assert(stream.isClosed) + } + + test("BigObjectStream should read large data correctly") { + val largeData = Array.fill[Byte](20000)((scala.util.Random.nextInt(256) - 128).toByte) + val stream = new BigObjectStream(new ByteArrayInputStream(largeData)) + + val result = stream.read() + assert(result.sameElements(largeData)) + stream.close() + } + + // ======================================== + // BigObjectManager Tests + // ======================================== + + test("BigObjectManager should create and register a big object") { + val pointer = createBigObject("Test big object data", execId = 1, opId = "operator-1") + + assertStandardBucket(pointer) + + val record = getDSLContext + .selectFrom(BIG_OBJECT) + .where(BIG_OBJECT.EXECUTION_ID.eq(1).and(BIG_OBJECT.OPERATOR_ID.eq("operator-1"))) + .fetchOne() + + assert(record != null) + assert(record.getUri == pointer.getUri) + } + + test("BigObjectManager should open and read a big object") { + val data = "Hello from big object!" 
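+    // Round-trip: upload through BigObjectManager.create, then read back via open()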
+ val pointer = createBigObject(data, execId = 2) + + val stream = BigObjectManager.open(pointer) + val readData = stream.read() + stream.close() + + assert(readData.sameElements(data.getBytes)) + } + + test("BigObjectManager should fail to open non-existent big object") { + val fakePointer = new BigObjectPointer("s3://texera-big-objects/nonexistent/file") + assertThrows[IllegalArgumentException](BigObjectManager.open(fakePointer)) + } + + test("BigObjectManager should delete big objects by execution ID") { + val execId = 3 + createMockExecution(execId) + + val pointer1 = + BigObjectManager.create(new ByteArrayInputStream("Object 1".getBytes), execId, "op-1") + val pointer2 = + BigObjectManager.create(new ByteArrayInputStream("Object 2".getBytes), execId, "op-2") + + assertObjectExists(pointer1) + assertObjectExists(pointer2) + + BigObjectManager.delete(execId) + + assertObjectExists(pointer1, shouldExist = false) + assertObjectExists(pointer2, shouldExist = false) + assert( + getDSLContext.selectFrom(BIG_OBJECT).where(BIG_OBJECT.EXECUTION_ID.eq(execId)).fetch().isEmpty + ) + } + + test("BigObjectManager should handle delete with no objects gracefully") { + BigObjectManager.delete(9999) // Should not throw exception + } + + test("BigObjectManager should not delete objects from different executions") { + val pointer1 = createBigObject("Test data", execId = 4) + val pointer2 = createBigObject("Test data", execId = 5) + + BigObjectManager.delete(4) + + assertObjectExists(pointer1, shouldExist = false) + assertObjectExists(pointer2, shouldExist = true) + + BigObjectManager.delete(5) + } + + test("BigObjectManager should create bucket if it doesn't exist") { + val pointer = createBigObject("Test bucket creation", execId = 6) + + assertStandardBucket(pointer) + assertObjectExists(pointer) + + BigObjectManager.delete(6) + } + + test("BigObjectManager should handle large objects correctly") { + val largeData = Array.fill[Byte](6 * 1024 * 1024)((scala.util.Random.nextInt(256) - 128).toByte) + createMockExecution(7) + val pointer = BigObjectManager.create(new ByteArrayInputStream(largeData), 7, "large-op") + + val stream = BigObjectManager.open(pointer) + val readData = stream.read() + stream.close() + + assert(readData.sameElements(largeData)) + BigObjectManager.delete(7) + } + + test("BigObjectManager should generate unique URIs for different objects") { + createMockExecution(8) + val data = new ByteArrayInputStream("Unique URI test".getBytes) + val pointer1 = BigObjectManager.create(data, 8, "test-op") + val pointer2 = + BigObjectManager.create(new ByteArrayInputStream("Unique URI test".getBytes), 8, "test-op") + + assert(pointer1.getUri != pointer2.getUri) + assert(pointer1.getObjectKey != pointer2.getObjectKey) + + BigObjectManager.delete(8) + } + + test("BigObjectManager should handle multiple reads from the same big object") { + val data = "Multiple reads test data" + val pointer = createBigObject(data, execId = 9) + + val stream1 = BigObjectManager.open(pointer) + val readData1 = stream1.read() + stream1.close() + + val stream2 = BigObjectManager.open(pointer) + val readData2 = stream2.read() + stream2.close() + + assert(readData1.sameElements(data.getBytes)) + assert(readData2.sameElements(data.getBytes)) + + BigObjectManager.delete(9) + } + + test("BigObjectManager should properly parse bucket name and object key from pointer") { + val pointer = createBigObject("URI parsing test", execId = 10) + + assertStandardBucket(pointer) + assert(pointer.getObjectKey.nonEmpty) + 
assert(!pointer.getObjectKey.startsWith("/")) + + BigObjectManager.delete(10) + } +} diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala new file mode 100644 index 00000000000..12bdfca35ae --- /dev/null +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala @@ -0,0 +1,415 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import com.dimafeng.testcontainers.{ForAllTestContainer, MinIOContainer} +import org.apache.amber.config.StorageConfig +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} +import org.scalatest.funsuite.AnyFunSuite +import org.testcontainers.utility.DockerImageName + +import java.io.ByteArrayInputStream +import scala.util.Random + +class S3StorageClientSpec + extends AnyFunSuite + with ForAllTestContainer + with BeforeAndAfterAll + with BeforeAndAfterEach { + + // MinIO container for S3-compatible storage + override val container: MinIOContainer = MinIOContainer( + dockerImageName = DockerImageName.parse("minio/minio:RELEASE.2025-02-28T09-55-16Z"), + userName = "texera_minio", + password = "password" + ) + + private val testBucketName = "test-s3-storage-client" + + // Configure storage after container starts + override def afterStart(): Unit = { + super.afterStart() + StorageConfig.s3Endpoint = s"http://${container.host}:${container.mappedPort(9000)}" + S3StorageClient.createBucketIfNotExist(testBucketName) + } + + override def afterAll(): Unit = { + // Clean up test bucket + try { + S3StorageClient.deleteDirectory(testBucketName, "") + } catch { + case _: Exception => // Ignore cleanup errors + } + super.afterAll() + } + + // Helper methods + private def createInputStream(data: String): ByteArrayInputStream = { + new ByteArrayInputStream(data.getBytes) + } + + private def createInputStream(data: Array[Byte]): ByteArrayInputStream = { + new ByteArrayInputStream(data) + } + + private def readInputStream(inputStream: java.io.InputStream): Array[Byte] = { + val buffer = new Array[Byte](8192) + val outputStream = new java.io.ByteArrayOutputStream() + var bytesRead = 0 + while ({ + bytesRead = inputStream.read(buffer); bytesRead != -1 + }) { + outputStream.write(buffer, 0, bytesRead) + } + outputStream.toByteArray + } + + // ======================================== + // uploadObject Tests + // ======================================== + + test("uploadObject should upload a small object successfully") { + val testData = "Hello, World! This is a small test object." 
+ val objectKey = "test/small-object.txt" + + val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) + + assert(eTag != null) + assert(eTag.nonEmpty) + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("uploadObject should upload an empty object") { + val objectKey = "test/empty-object.txt" + + val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("")) + + assert(eTag != null) + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("uploadObject should upload a large object using multipart upload") { + // Create data larger than MINIMUM_NUM_OF_MULTIPART_S3_PART (5MB) + val largeData = Array.fill[Byte](6 * 1024 * 1024)((Random.nextInt(256) - 128).toByte) + val objectKey = "test/large-object.bin" + + val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(largeData)) + + assert(eTag != null) + assert(eTag.nonEmpty) + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Verify the uploaded content + val downloadedStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = readInputStream(downloadedStream) + downloadedStream.close() + + assert(downloadedData.length == largeData.length) + assert(downloadedData.sameElements(largeData)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("uploadObject should handle objects with special characters in key") { + val testData = "Testing special characters" + val objectKey = "test/special-chars/file with spaces & symbols!@#.txt" + + val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) + + assert(eTag != null) + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("uploadObject should overwrite existing object") { + val objectKey = "test/overwrite-test.txt" + val data1 = "Original data" + val data2 = "Updated data" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(data1)) + val eTag2 = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(data2)) + + assert(eTag2 != null) + + val downloadedStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = new String(readInputStream(downloadedStream)) + downloadedStream.close() + + assert(downloadedData == data2) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + // ======================================== + // downloadObject Tests + // ======================================== + + test("downloadObject should download an object successfully") { + val testData = "This is test data for download." 
+ val objectKey = "test/download-test.txt" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) + + val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = new String(readInputStream(inputStream)) + inputStream.close() + + assert(downloadedData == testData) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("downloadObject should download large objects correctly") { + val largeData = Array.fill[Byte](10 * 1024 * 1024)((Random.nextInt(256) - 128).toByte) + val objectKey = "test/large-download-test.bin" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(largeData)) + + val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = readInputStream(inputStream) + inputStream.close() + + assert(downloadedData.length == largeData.length) + assert(downloadedData.sameElements(largeData)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("downloadObject should download empty objects") { + val objectKey = "test/empty-download-test.txt" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("")) + + val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = readInputStream(inputStream) + inputStream.close() + + assert(downloadedData.isEmpty) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("downloadObject should throw exception for non-existent object") { + val nonExistentKey = "test/non-existent-object.txt" + + assertThrows[Exception] { + S3StorageClient.downloadObject(testBucketName, nonExistentKey) + } + } + + test("downloadObject should handle binary data correctly") { + val binaryData = Array[Byte](0, 1, 2, 127, -128, -1, 64, 32, 16, 8, 4, 2, 1) + val objectKey = "test/binary-data.bin" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(binaryData)) + + val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = readInputStream(inputStream) + inputStream.close() + + assert(downloadedData.sameElements(binaryData)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + // ======================================== + // objectExists Tests + // ======================================== + + test("objectExists should return true for existing object") { + val objectKey = "test/exists-test.txt" + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("exists test")) + + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + test("objectExists should return false for non-existent object") { + val nonExistentKey = "test/does-not-exist.txt" + + assert(!S3StorageClient.objectExists(testBucketName, nonExistentKey)) + } + + test("objectExists should return false for deleted object") { + val objectKey = "test/deleted-object.txt" + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("to be deleted")) + + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + S3StorageClient.deleteObject(testBucketName, objectKey) + + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + } + + test("objectExists should return false for non-existent bucket") { + val nonExistentBucket = "non-existent-bucket-12345" + val objectKey = "test/object.txt" + + 
assert(!S3StorageClient.objectExists(nonExistentBucket, objectKey)) + } + + test("objectExists should handle objects with special characters") { + val objectKey = "test/special/path with spaces & chars!@#.txt" + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("special chars")) + + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Clean up + S3StorageClient.deleteObject(testBucketName, objectKey) + } + + // ======================================== + // deleteObject Tests + // ======================================== + + test("deleteObject should delete an existing object") { + val objectKey = "test/delete-test.txt" + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("delete me")) + + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + S3StorageClient.deleteObject(testBucketName, objectKey) + + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + } + + test("deleteObject should not throw exception for non-existent object") { + val nonExistentKey = "test/already-deleted.txt" + + // Should not throw exception + S3StorageClient.deleteObject(testBucketName, nonExistentKey) + } + + test("deleteObject should delete large objects") { + val largeData = Array.fill[Byte](7 * 1024 * 1024)((Random.nextInt(256) - 128).toByte) + val objectKey = "test/large-delete-test.bin" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(largeData)) + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + S3StorageClient.deleteObject(testBucketName, objectKey) + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + } + + test("deleteObject should handle multiple deletions of the same object") { + val objectKey = "test/multi-delete-test.txt" + S3StorageClient.uploadObject( + testBucketName, + objectKey, + createInputStream("delete multiple times") + ) + + S3StorageClient.deleteObject(testBucketName, objectKey) + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + + // Second delete should not throw exception + S3StorageClient.deleteObject(testBucketName, objectKey) + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + } + + // ======================================== + // Integration Tests (combining methods) + // ======================================== + + test("upload, download, and delete workflow should work correctly") { + val testData = "Complete workflow test data" + val objectKey = "test/workflow-test.txt" + + // Upload + val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) + assert(eTag != null) + + // Verify exists + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + // Download + val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = new String(readInputStream(inputStream)) + inputStream.close() + assert(downloadedData == testData) + + // Delete + S3StorageClient.deleteObject(testBucketName, objectKey) + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + } + + test("multiple objects can be managed independently") { + val objects = Map( + "test/object1.txt" -> "Data for object 1", + "test/object2.txt" -> "Data for object 2", + "test/object3.txt" -> "Data for object 3" + ) + + // Upload all objects + objects.foreach { + case (key, data) => + S3StorageClient.uploadObject(testBucketName, key, createInputStream(data)) + } + + // Verify all exist + objects.keys.foreach { key => + 
assert(S3StorageClient.objectExists(testBucketName, key)) + } + + // Delete one object + S3StorageClient.deleteObject(testBucketName, "test/object2.txt") + + // Verify deletion and others still exist + assert(S3StorageClient.objectExists(testBucketName, "test/object1.txt")) + assert(!S3StorageClient.objectExists(testBucketName, "test/object2.txt")) + assert(S3StorageClient.objectExists(testBucketName, "test/object3.txt")) + + // Clean up remaining objects + S3StorageClient.deleteObject(testBucketName, "test/object1.txt") + S3StorageClient.deleteObject(testBucketName, "test/object3.txt") + } + + test("objects with nested paths should be handled correctly") { + val objectKey = "test/deeply/nested/path/to/object.txt" + val testData = "Nested path test" + + S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) + assert(S3StorageClient.objectExists(testBucketName, objectKey)) + + val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) + val downloadedData = new String(readInputStream(inputStream)) + inputStream.close() + assert(downloadedData == testData) + + S3StorageClient.deleteObject(testBucketName, objectKey) + assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileAttributeType.java b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileAttributeType.java index 84e3de95a61..aa198a9d2d6 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileAttributeType.java +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileAttributeType.java @@ -30,7 +30,8 @@ public enum FileAttributeType { DOUBLE("double", AttributeType.DOUBLE), BOOLEAN("boolean", AttributeType.BOOLEAN), TIMESTAMP("timestamp", AttributeType.TIMESTAMP), - BINARY("binary", AttributeType.BINARY); + BINARY("binary", AttributeType.BINARY), + BIG_OBJECT("big object", AttributeType.BIG_OBJECT); private final String name; @@ -56,6 +57,6 @@ public String toString() { } public boolean isSingle() { - return this == SINGLE_STRING || this == BINARY; + return this == SINGLE_STRING || this == BINARY || this == BIG_OBJECT; } } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala index 2124c9da433..fdb0147cf2b 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala @@ -27,6 +27,7 @@ import org.apache.amber.util.JSONUtils.objectMapper import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream import org.apache.commons.io.IOUtils.toByteArray +import org.apache.texera.service.util.BigObjectManager import java.io._ import java.net.URI @@ -84,6 +85,9 @@ class FileScanSourceOpExec private[scan] ( fields.addOne(desc.attributeType match { case FileAttributeType.SINGLE_STRING => new String(toByteArray(entry), desc.fileEncoding.getCharset) + case FileAttributeType.BIG_OBJECT => + // For big objects, create a big object pointer from the input stream + BigObjectManager.create(entry, executionId, operatorId) case _ => parseField(toByteArray(entry), desc.attributeType.getType) }) 
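// A minimal sketch (not part of the patch): how a downstream operator could read back a
// BIG_OBJECT field produced by the branch above, using the BigObjectManager / BigObjectStream
// API that this first commit introduces (later commits in this series rename BigObjectPointer
// to BigObject and replace open() with stream classes). The tuple access and the "line"
// field name are assumptions for illustration only.
//
//   val pointer = tuple.getField[BigObjectPointer]("line")   // hypothetical field name
//   val stream  = BigObjectManager.open(pointer)             // fails if the S3 object is missing
//   try {
//     val bytes = stream.read()                              // reads all remaining bytes
//     // ... process bytes ...
//   } finally {
//     stream.close()
//   }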
TupleLike(fields.toSeq: _*) diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala new file mode 100644 index 00000000000..d99ca9a2bf4 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.amber.operator.source.scan + +import org.apache.amber.core.tuple.{AttributeType, BigObjectPointer, Schema, SchemaEnforceable} +import org.apache.amber.util.JSONUtils.objectMapper +import org.scalatest.BeforeAndAfterAll +import org.scalatest.flatspec.AnyFlatSpec + +import java.io.{BufferedOutputStream, FileOutputStream} +import java.net.URI +import java.nio.file.{Files, Path} +import java.util.zip.{ZipEntry, ZipOutputStream} + +/** + * Unit tests for BIG_OBJECT logic in FileScanSourceOpExec. + * Full integration tests with S3 and database are in BigObjectManagerSpec. 
+ */ +class FileScanSourceOpExecSpec extends AnyFlatSpec with BeforeAndAfterAll { + + private val testDir = Path + .of(sys.env.getOrElse("TEXERA_HOME", ".")) + .resolve("common/workflow-operator/src/test/resources") + .toRealPath() + + private val testFile = testDir.resolve("test_big_object.txt") + private val testZip = testDir.resolve("test_big_object.zip") + + override def beforeAll(): Unit = { + super.beforeAll() + Files.write(testFile, "Test content\nLine 2\nLine 3".getBytes) + createZipFile(testZip, Map("file1.txt" -> "Content 1", "file2.txt" -> "Content 2")) + } + + override def afterAll(): Unit = { + Files.deleteIfExists(testFile) + Files.deleteIfExists(testZip) + super.afterAll() + } + + private def createZipFile(path: Path, entries: Map[String, String]): Unit = { + val zipOut = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(path.toFile))) + try { + entries.foreach { + case (name, content) => + zipOut.putNextEntry(new ZipEntry(name)) + zipOut.write(content.getBytes) + zipOut.closeEntry() + } + } finally { + zipOut.close() + } + } + + private def createDescriptor( + file: Path = testFile, + attributeName: String = "line" + ): FileScanSourceOpDesc = { + val desc = new FileScanSourceOpDesc() + desc.fileName = Some(file.toString) + desc.attributeType = FileAttributeType.BIG_OBJECT + desc.attributeName = attributeName + desc.fileEncoding = FileDecodingMethod.UTF_8 + desc + } + + private def assertSchema(schema: Schema, attributeName: String): Unit = { + assert(schema.getAttributes.length == 1) + assert(schema.getAttribute(attributeName).getType == AttributeType.BIG_OBJECT) + } + + // Schema Tests + it should "infer BIG_OBJECT schema with default attribute name" in { + assertSchema(createDescriptor().sourceSchema(), "line") + } + + it should "infer BIG_OBJECT schema with custom attribute name" in { + assertSchema(createDescriptor(attributeName = "custom_field").sourceSchema(), "custom_field") + } + + it should "map BIG_OBJECT to correct AttributeType" in { + assert(FileAttributeType.BIG_OBJECT.getType == AttributeType.BIG_OBJECT) + } + + // Type Classification Tests + it should "correctly classify BIG_OBJECT as isSingle type" in { + val isSingleTypes = List( + FileAttributeType.BIG_OBJECT, + FileAttributeType.SINGLE_STRING, + FileAttributeType.BINARY + ) + val multiLineTypes = List( + FileAttributeType.STRING, + FileAttributeType.INTEGER, + FileAttributeType.LONG, + FileAttributeType.DOUBLE, + FileAttributeType.BOOLEAN, + FileAttributeType.TIMESTAMP + ) + + isSingleTypes.foreach(t => assert(t.isSingle, s"$t should be isSingle")) + multiLineTypes.foreach(t => assert(!t.isSingle, s"$t should not be isSingle")) + } + + // Execution Tests + it should "create BigObjectPointer when reading file with BIG_OBJECT type" in { + val desc = createDescriptor() + desc.setResolvedFileName(URI.create(testFile.toUri.toString)) + + val executor = new FileScanSourceOpExec(objectMapper.writeValueAsString(desc)) + executor.initializeExecutionContext(1, "test-op") + + try { + executor.open() + val tuples = executor.produceTuple().toSeq + executor.close() + + assert(tuples.size == 1) + val field = tuples.head + .asInstanceOf[SchemaEnforceable] + .enforceSchema(desc.sourceSchema()) + .getField[Any]("line") + + assert(field.isInstanceOf[BigObjectPointer]) + assert(field.asInstanceOf[BigObjectPointer].getUri.startsWith("s3://")) + } catch { + case e: Exception => + info(s"S3 not configured: ${e.getMessage}") + } + } + + // BigObjectPointer Tests + it should "create valid BigObjectPointer with 
correct URI parsing" in { + val pointer = new BigObjectPointer("s3://bucket/path/to/object") + + assert(pointer.getUri == "s3://bucket/path/to/object") + assert(pointer.getBucketName == "bucket") + assert(pointer.getObjectKey == "path/to/object") + } + + it should "reject invalid BigObjectPointer URIs" in { + assertThrows[IllegalArgumentException](new BigObjectPointer("http://invalid")) + assertThrows[IllegalArgumentException](new BigObjectPointer("not-a-uri")) + assertThrows[IllegalArgumentException](new BigObjectPointer(null)) + } +} diff --git a/file-service/build.sbt b/file-service/build.sbt index 68ac82e6b3b..34b30472e0c 100644 --- a/file-service/build.sbt +++ b/file-service/build.sbt @@ -84,7 +84,4 @@ libraryDependencies ++= Seq( "jakarta.ws.rs" % "jakarta.ws.rs-api" % "3.1.0", // Ensure Jakarta JAX-RS API is available "org.bitbucket.b_c" % "jose4j" % "0.9.6", "org.playframework" %% "play-json" % "3.1.0-M1", - "software.amazon.awssdk" % "s3" % "2.29.51", - "software.amazon.awssdk" % "auth" % "2.29.51", - "software.amazon.awssdk" % "regions" % "2.29.51", ) diff --git a/sql/texera_ddl.sql b/sql/texera_ddl.sql index 7b0f9b9063d..a7db9ebe15f 100644 --- a/sql/texera_ddl.sql +++ b/sql/texera_ddl.sql @@ -443,3 +443,11 @@ BEGIN END $$; -- END Fulltext search index creation (DO NOT EDIT THIS LINE) + +CREATE TABLE big_object ( + execution_id INT NOT NULL, + operator_id VARCHAR(100) NOT NULL, + uri TEXT NOT NULL UNIQUE, + creation_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (execution_id) REFERENCES workflow_executions(eid) ON DELETE CASCADE +); \ No newline at end of file diff --git a/sql/updates/16.sql b/sql/updates/16.sql new file mode 100644 index 00000000000..bc762bb3469 --- /dev/null +++ b/sql/updates/16.sql @@ -0,0 +1,34 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ============================================ +-- 1. Connect to the texera_db database +-- ============================================ +\c texera_db + +SET search_path TO texera_db; + +-- ============================================ +-- 2. 
Update the table schema +-- ============================================ +CREATE TABLE big_object ( + execution_id INT NOT NULL, + operator_id VARCHAR(100) NOT NULL, + uri TEXT NOT NULL UNIQUE, + creation_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (execution_id) REFERENCES workflow_executions(eid) ON DELETE CASCADE +); \ No newline at end of file From de199bd149c8aef191d6cf312d4e4633c0dab153 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Mon, 17 Nov 2025 21:42:42 -0800 Subject: [PATCH 02/10] Improve user-facing APIs --- .../core/executor/OperatorExecutor.scala | 27 ++++++++++++++++++- .../source/scan/FileScanSourceOpExec.scala | 3 +-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala index 48b66e661fc..55e4b63ea24 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala @@ -20,8 +20,11 @@ package org.apache.amber.core.executor import org.apache.amber.core.state.State -import org.apache.amber.core.tuple.{Tuple, TupleLike} +import org.apache.amber.core.tuple.{BigObjectPointer, Tuple, TupleLike} import org.apache.amber.core.workflow.PortIdentity +import org.apache.texera.service.util.{BigObjectManager, BigObjectStream} + +import java.io.InputStream trait OperatorExecutor { @@ -40,6 +43,28 @@ trait OperatorExecutor { _operatorId = Some(opId) } + /** + * Creates a big object from an InputStream and returns a pointer to it. + * This is a convenience method that automatically uses the operator's execution context. + * + * @param stream The input stream containing the data to store + * @return A BigObjectPointer that can be stored in tuple fields + */ + final def createBigObject(stream: InputStream): BigObjectPointer = { + BigObjectManager.create(stream, executionId, operatorId) + } + + /** + * Opens a big object for reading from a pointer. + * This is a convenience method that wraps BigObjectManager.open(). 
+ * + * @param pointer The pointer to the big object to open + * @return A BigObjectStream for reading the object's contents + */ + final def openBigObject(pointer: BigObjectPointer): BigObjectStream = { + BigObjectManager.open(pointer) + } + def open(): Unit = {} def produceStateOnStart(port: Int): Option[State] = None diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala index fdb0147cf2b..db687cc50da 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala @@ -27,7 +27,6 @@ import org.apache.amber.util.JSONUtils.objectMapper import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream import org.apache.commons.io.IOUtils.toByteArray -import org.apache.texera.service.util.BigObjectManager import java.io._ import java.net.URI @@ -87,7 +86,7 @@ class FileScanSourceOpExec private[scan] ( new String(toByteArray(entry), desc.fileEncoding.getCharset) case FileAttributeType.BIG_OBJECT => // For big objects, create a big object pointer from the input stream - BigObjectManager.create(entry, executionId, operatorId) + createBigObject(entry) case _ => parseField(toByteArray(entry), desc.attributeType.getType) }) TupleLike(fields.toSeq: _*) From b1cd9a4e589939198a979c74104380378f59a54a Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Mon, 17 Nov 2025 22:16:36 -0800 Subject: [PATCH 03/10] Fix unit tests --- .../service/util/BigObjectManagerSpec.scala | 23 +------ .../service/util/S3StorageClientSpec.scala | 18 +---- .../service/util/S3StorageTestBase.scala | 69 +++++++++++++++++++ 3 files changed, 75 insertions(+), 35 deletions(-) create mode 100644 common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageTestBase.scala diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala index a585f228bcc..26435d1f396 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala @@ -19,41 +19,24 @@ package org.apache.texera.service.util -import com.dimafeng.testcontainers.{ForAllTestContainer, MinIOContainer} -import org.apache.amber.config.StorageConfig import org.apache.amber.core.tuple.BigObjectPointer import org.apache.texera.dao.MockTexeraDB import org.apache.texera.dao.jooq.generated.Tables._ import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} import org.scalatest.funsuite.AnyFunSuite -import org.testcontainers.utility.DockerImageName import java.io.ByteArrayInputStream class BigObjectManagerSpec extends AnyFunSuite with MockTexeraDB - with ForAllTestContainer + with S3StorageTestBase with BeforeAndAfterAll with BeforeAndAfterEach { - // MinIO container for S3-compatible storage - override val container: MinIOContainer = MinIOContainer( - dockerImageName = DockerImageName.parse("minio/minio:RELEASE.2025-02-28T09-55-16Z"), - userName = "texera_minio", - password = "password" - ) - - // Initialize database and configure storage after container starts - override def afterStart(): Unit 
= { - super.afterStart() - - // Initialize the embedded database + override def beforeAll(): Unit = { + super.beforeAll() initializeDBAndReplaceDSLContext() - - // Configure storage to use MinIO container - // Only s3Endpoint is a var, so we can only override that - StorageConfig.s3Endpoint = s"http://${container.host}:${container.mappedPort(9000)}" } override def afterAll(): Unit = { diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala index 12bdfca35ae..83927928c4a 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala @@ -19,34 +19,22 @@ package org.apache.texera.service.util -import com.dimafeng.testcontainers.{ForAllTestContainer, MinIOContainer} -import org.apache.amber.config.StorageConfig import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} import org.scalatest.funsuite.AnyFunSuite -import org.testcontainers.utility.DockerImageName import java.io.ByteArrayInputStream import scala.util.Random class S3StorageClientSpec extends AnyFunSuite - with ForAllTestContainer + with S3StorageTestBase with BeforeAndAfterAll with BeforeAndAfterEach { - // MinIO container for S3-compatible storage - override val container: MinIOContainer = MinIOContainer( - dockerImageName = DockerImageName.parse("minio/minio:RELEASE.2025-02-28T09-55-16Z"), - userName = "texera_minio", - password = "password" - ) - private val testBucketName = "test-s3-storage-client" - // Configure storage after container starts - override def afterStart(): Unit = { - super.afterStart() - StorageConfig.s3Endpoint = s"http://${container.host}:${container.mappedPort(9000)}" + override def beforeAll(): Unit = { + super.beforeAll() S3StorageClient.createBucketIfNotExist(testBucketName) } diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageTestBase.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageTestBase.scala new file mode 100644 index 00000000000..ad80e6c40eb --- /dev/null +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageTestBase.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import com.dimafeng.testcontainers.MinIOContainer +import org.apache.amber.config.StorageConfig +import org.scalatest.{BeforeAndAfterAll, Suite} +import org.testcontainers.utility.DockerImageName + +/** + * Base trait for tests requiring S3 storage (MinIO). 
+ * Provides access to a single shared MinIO container across all test suites. + * + * Usage: Mix this trait into any test suite that needs S3 storage. + */ +trait S3StorageTestBase extends BeforeAndAfterAll { this: Suite => + + override def beforeAll(): Unit = { + super.beforeAll() + // Trigger lazy initialization of shared container + S3StorageTestBase.ensureContainerStarted() + } +} + +object S3StorageTestBase { + private lazy val container: MinIOContainer = { + val c = MinIOContainer( + dockerImageName = DockerImageName.parse("minio/minio:RELEASE.2025-02-28T09-55-16Z"), + userName = "texera_minio", + password = "password" + ) + c.start() + + val endpoint = s"http://${c.host}:${c.mappedPort(9000)}" + StorageConfig.s3Endpoint = endpoint + + println(s"[S3Storage] Started shared MinIO at $endpoint") + + sys.addShutdownHook { + println("[S3Storage] Stopping shared MinIO...") + c.stop() + } + + c + } + + /** Ensures the container is started (triggers lazy initialization). */ + def ensureContainerStarted(): Unit = { + container // Access lazy val to trigger initialization + () + } +} From c3c8b35d883752c77439e14616dbec7c9757eb1d Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Tue, 18 Nov 2025 14:26:41 -0800 Subject: [PATCH 04/10] Update APIs --- .../core/executor/OperatorExecutor.scala | 31 +------- .../amber/core/tuple/AttributeType.java | 4 +- .../amber/core/tuple/AttributeTypeUtils.scala | 2 +- .../{BigObjectPointer.java => BigObject.java} | 40 ++++++++-- .../org/apache/amber/util/IcebergUtil.scala | 14 ++-- .../service/util/BigObjectManager.scala | 10 +-- .../core/tuple/AttributeTypeUtilsSpec.scala | 4 +- .../apache/amber/util/IcebergUtilSpec.scala | 10 +-- .../service/util/BigObjectManagerSpec.scala | 79 ++++++++++++++++--- .../source/scan/FileScanSourceOpExec.scala | 6 +- .../scan/FileScanSourceOpExecSpec.scala | 22 +++--- 11 files changed, 139 insertions(+), 83 deletions(-) rename common/workflow-core/src/main/scala/org/apache/amber/core/tuple/{BigObjectPointer.java => BigObject.java} (57%) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala index 55e4b63ea24..8caa9a2f3e3 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala @@ -20,11 +20,8 @@ package org.apache.amber.core.executor import org.apache.amber.core.state.State -import org.apache.amber.core.tuple.{BigObjectPointer, Tuple, TupleLike} +import org.apache.amber.core.tuple.{Tuple, TupleLike} import org.apache.amber.core.workflow.PortIdentity -import org.apache.texera.service.util.{BigObjectManager, BigObjectStream} - -import java.io.InputStream trait OperatorExecutor { @@ -32,10 +29,10 @@ trait OperatorExecutor { private var _executionId: Option[Int] = None private var _operatorId: Option[String] = None - protected def executionId: Int = + def executionId: Int = _executionId.getOrElse(throw new IllegalStateException("Execution context not initialized")) - protected def operatorId: String = + def operatorId: String = _operatorId.getOrElse(throw new IllegalStateException("Execution context not initialized")) final def initializeExecutionContext(execId: Int, opId: String): Unit = { @@ -43,28 +40,6 @@ trait OperatorExecutor { _operatorId = Some(opId) } - /** - * Creates a big object from an InputStream and returns a pointer to it. 
- * This is a convenience method that automatically uses the operator's execution context. - * - * @param stream The input stream containing the data to store - * @return A BigObjectPointer that can be stored in tuple fields - */ - final def createBigObject(stream: InputStream): BigObjectPointer = { - BigObjectManager.create(stream, executionId, operatorId) - } - - /** - * Opens a big object for reading from a pointer. - * This is a convenience method that wraps BigObjectManager.open(). - * - * @param pointer The pointer to the big object to open - * @return A BigObjectStream for reading the object's contents - */ - final def openBigObject(pointer: BigObjectPointer): BigObjectStream = { - BigObjectManager.open(pointer) - } - def open(): Unit = {} def produceStateOnStart(port: Int): Option[State] = None diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java index 93aadf31a99..64fa921d461 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeType.java @@ -70,7 +70,7 @@ public enum AttributeType implements Serializable { BOOLEAN("boolean", Boolean.class), TIMESTAMP("timestamp", Timestamp.class), BINARY("binary", byte[].class), - BIG_OBJECT("big_object", BigObjectPointer.class), + BIG_OBJECT("big_object", BigObject.class), ANY("ANY", Object.class); private final String name; @@ -110,7 +110,7 @@ public static AttributeType getAttributeType(Class fieldClass) { return TIMESTAMP; } else if (fieldClass.equals(byte[].class)) { return BINARY; - } else if (fieldClass.equals(BigObjectPointer.class)) { + } else if (fieldClass.equals(BigObject.class)) { return BIG_OBJECT; } else { return ANY; diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index 0efeea960f0..07a6eb9bd56 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -128,7 +128,7 @@ object AttributeTypeUtils extends Serializable { case AttributeType.TIMESTAMP => parseTimestamp(field) case AttributeType.STRING => field.toString case AttributeType.BINARY => field - case AttributeType.BIG_OBJECT => new BigObjectPointer(field.toString) + case AttributeType.BIG_OBJECT => new BigObject(field.toString) case AttributeType.ANY | _ => field } } diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java similarity index 57% rename from common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java rename to common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java index b938561f4d1..a0a1b288a6b 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObjectPointer.java +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java @@ -22,26 +22,45 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonValue; +import org.apache.amber.core.executor.OperatorExecutor; +import org.apache.texera.service.util.BigObjectManager; +import 
org.apache.texera.service.util.BigObjectStream; +import java.io.InputStream; import java.net.URI; import java.util.Objects; /** - * BigObjectPointer represents a pointer to a large object stored in S3. - * The pointer is formatted as a URI: s3://bucket/path/to/object + * BigObject represents a reference to a large object stored in S3. + * The reference is formatted as a URI: s3://bucket/path/to/object */ -public class BigObjectPointer { +public class BigObject { private final String uri; + /** + * Creates a BigObject from an S3 URI (primarily for deserialization). + * + * @param uri S3 URI in the format s3://bucket/path/to/object + */ @JsonCreator - public BigObjectPointer(@JsonProperty("uri") String uri) { + public BigObject(@JsonProperty("uri") String uri) { if (uri == null || !uri.startsWith("s3://")) { - throw new IllegalArgumentException("BigObjectPointer URI must start with 's3://' but was: " + uri); + throw new IllegalArgumentException("BigObject URI must start with 's3://' but was: " + uri); } this.uri = uri; } + /** + * Creates a new BigObject by uploading the stream to S3. + * + * @param stream The input stream containing the data to store + * @param executor The operator executor that provides execution context + */ + public BigObject(InputStream stream, OperatorExecutor executor) { + this(BigObjectManager.create(stream, executor.executionId(), executor.operatorId()).getUri()); + } + @JsonValue public String getUri() { return uri; @@ -56,6 +75,13 @@ public String getObjectKey() { return path.startsWith("/") ? path.substring(1) : path; } + /** + * Opens this big object for reading. Caller must close the returned stream. + */ + public BigObjectStream open() { + return BigObjectManager.open(this); + } + @Override public String toString() { return uri; @@ -64,8 +90,8 @@ public String toString() { @Override public boolean equals(Object obj) { if (this == obj) return true; - if (!(obj instanceof BigObjectPointer)) return false; - BigObjectPointer that = (BigObjectPointer) obj; + if (!(obj instanceof BigObject)) return false; + BigObject that = (BigObject) obj; return Objects.equals(uri, that.uri); } diff --git a/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala index 438d6c30c37..7216530a91a 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/util/IcebergUtil.scala @@ -20,7 +20,7 @@ package org.apache.amber.util import org.apache.amber.config.StorageConfig -import org.apache.amber.core.tuple.{Attribute, AttributeType, BigObjectPointer, Schema, Tuple} +import org.apache.amber.core.tuple.{Attribute, AttributeType, BigObject, Schema, Tuple} import org.apache.hadoop.conf.Configuration import org.apache.iceberg.catalog.{Catalog, TableIdentifier} import org.apache.iceberg.data.parquet.GenericParquetReaders @@ -254,11 +254,11 @@ object IcebergUtil { case (attribute, index) => val fieldName = encodeBigObjectFieldName(attribute.getName, attribute.getType) val value = tuple.getField[AnyRef](index) match { - case null => null - case ts: Timestamp => ts.toInstant.atZone(ZoneId.systemDefault()).toLocalDateTime - case bytes: Array[Byte] => ByteBuffer.wrap(bytes) - case bigObjPtr: BigObjectPointer => bigObjPtr.getUri - case other => other + case null => null + case ts: Timestamp => ts.toInstant.atZone(ZoneId.systemDefault()).toLocalDateTime + case bytes: Array[Byte] => ByteBuffer.wrap(bytes) + case 
bigObjPtr: BigObject => bigObjPtr.getUri + case other => other } record.setField(fieldName, value) } @@ -286,7 +286,7 @@ object IcebergUtil { buffer.get(bytes) bytes case uri: String if attribute.getType == AttributeType.BIG_OBJECT => - new BigObjectPointer(uri) + new BigObject(uri) case other => other } } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala index cf240b1e1c0..888a4d29c40 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala @@ -20,7 +20,7 @@ package org.apache.texera.service.util import com.typesafe.scalalogging.LazyLogging -import org.apache.amber.core.tuple.BigObjectPointer +import org.apache.amber.core.tuple.BigObject import org.apache.texera.dao.SqlServer import org.apache.texera.dao.jooq.generated.Tables.BIG_OBJECT @@ -77,7 +77,7 @@ object BigObjectManager extends LazyLogging { private lazy val db = SqlServer.getInstance().createDSLContext() /** Creates a big object from InputStream, uploads to S3, and registers in database. */ - def create(stream: InputStream, executionId: Int, operatorId: String): BigObjectPointer = { + def create(stream: InputStream, executionId: Int, operatorId: String): BigObject = { S3StorageClient.createBucketIfNotExist(DEFAULT_BUCKET) @@ -100,11 +100,11 @@ object BigObjectManager extends LazyLogging { throw new RuntimeException(s"Failed to create big object: ${e.getMessage}", e) } - new BigObjectPointer(uri) + new BigObject(uri) } /** Opens a big object for reading. */ - def open(ptr: BigObjectPointer): BigObjectStream = { + def open(ptr: BigObject): BigObjectStream = { require( S3StorageClient.objectExists(ptr.getBucketName, ptr.getObjectKey), s"Big object does not exist: ${ptr.getUri}" @@ -128,7 +128,7 @@ object BigObjectManager extends LazyLogging { uris.foreach { uri => try { - val ptr = new BigObjectPointer(uri) + val ptr = new BigObject(uri) S3StorageClient.deleteObject(ptr.getBucketName, ptr.getObjectKey) } catch { case e: Exception => logger.error(s"Failed to delete: $uri", e) diff --git a/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala b/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala index 81726656750..949e43d7484 100644 --- a/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/amber/core/tuple/AttributeTypeUtilsSpec.scala @@ -191,9 +191,9 @@ class AttributeTypeUtilsSpec extends AnyFunSuite { } test("parseField correctly parses to BIG_OBJECT") { - // Valid S3 URI strings are converted to BigObjectPointer + // Valid S3 URI strings are converted to BigObject val pointer1 = parseField("s3://bucket/path/to/object", AttributeType.BIG_OBJECT) - .asInstanceOf[BigObjectPointer] + .asInstanceOf[BigObject] assert(pointer1.getUri == "s3://bucket/path/to/object") assert(pointer1.getBucketName == "bucket") assert(pointer1.getObjectKey == "path/to/object") diff --git a/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala b/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala index 608d55dbec0..0b20d5cd970 100644 --- a/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala +++ 
b/common/workflow-core/src/test/scala/org/apache/amber/util/IcebergUtilSpec.scala @@ -19,7 +19,7 @@ package org.apache.amber.util -import org.apache.amber.core.tuple.{AttributeType, BigObjectPointer, Schema, Tuple} +import org.apache.amber.core.tuple.{AttributeType, BigObject, Schema, Tuple} import org.apache.amber.util.IcebergUtil.toIcebergSchema import org.apache.iceberg.data.GenericRecord import org.apache.iceberg.types.Types @@ -239,7 +239,7 @@ class IcebergUtilSpec extends AnyFlatSpec { val tuple = Tuple .builder(schema) - .addSequentially(Array(Int.box(42), new BigObjectPointer("s3://bucket/object/key.data"))) + .addSequentially(Array(Int.box(42), new BigObject("s3://bucket/object/key.data"))) .build() val record = IcebergUtil.toGenericRecord(toIcebergSchema(schema), tuple) @@ -252,8 +252,8 @@ class IcebergUtilSpec extends AnyFlatSpec { val roundTripTuple = IcebergUtil.fromRecord(record, schema) assert(roundTripTuple == tuple) - // BigObjectPointer properties are accessible - val bigObj = roundTripTuple.getField[BigObjectPointer]("large_data") + // BigObject properties are accessible + val bigObj = roundTripTuple.getField[BigObject]("large_data") assert(bigObj.getUri == "s3://bucket/object/key.data") assert(bigObj.getBucketName == "bucket") assert(bigObj.getObjectKey == "object/key.data") @@ -281,7 +281,7 @@ class IcebergUtilSpec extends AnyFlatSpec { .addSequentially( Array( Int.box(123), - new BigObjectPointer("s3://bucket1/file1.dat"), + new BigObject("s3://bucket1/file1.dat"), "normal string", null // null BIG_OBJECT ) diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala index 26435d1f396..b25acb7d130 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala @@ -19,7 +19,7 @@ package org.apache.texera.service.util -import org.apache.amber.core.tuple.BigObjectPointer +import org.apache.amber.core.tuple.BigObject import org.apache.texera.dao.MockTexeraDB import org.apache.texera.dao.jooq.generated.Tables._ import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} @@ -95,12 +95,12 @@ class BigObjectManagerSpec .execute() } - /** Creates a big object from string data and returns its pointer. */ + /** Creates a big object from string data and returns it. */ private def createBigObject( data: String, execId: Int, opId: String = "test-op" - ): BigObjectPointer = { + ): BigObject = { createMockExecution(execId) BigObjectManager.create(new ByteArrayInputStream(data.getBytes), execId, opId) } @@ -110,13 +110,13 @@ class BigObjectManagerSpec new BigObjectStream(new ByteArrayInputStream(data.getBytes)) /** Verifies that an object exists in S3. */ - private def assertObjectExists(pointer: BigObjectPointer, shouldExist: Boolean = true): Unit = { + private def assertObjectExists(pointer: BigObject, shouldExist: Boolean = true): Unit = { val exists = S3StorageClient.objectExists(pointer.getBucketName, pointer.getObjectKey) assert(exists == shouldExist) } /** Verifies standard bucket name. 
*/ - private def assertStandardBucket(pointer: BigObjectPointer): Unit = { + private def assertStandardBucket(pointer: BigObject): Unit = { assert(pointer.getBucketName == "texera-big-objects") assert(pointer.getUri.startsWith("s3://texera-big-objects/")) } @@ -227,8 +227,8 @@ class BigObjectManagerSpec } test("BigObjectManager should fail to open non-existent big object") { - val fakePointer = new BigObjectPointer("s3://texera-big-objects/nonexistent/file") - assertThrows[IllegalArgumentException](BigObjectManager.open(fakePointer)) + val fakeBigObject = new BigObject("s3://texera-big-objects/nonexistent/file") + assertThrows[IllegalArgumentException](BigObjectManager.open(fakeBigObject)) } test("BigObjectManager should delete big objects by execution ID") { @@ -321,13 +321,68 @@ class BigObjectManagerSpec BigObjectManager.delete(9) } - test("BigObjectManager should properly parse bucket name and object key from pointer") { - val pointer = createBigObject("URI parsing test", execId = 10) + test("BigObjectManager should properly parse bucket name and object key from big object") { + val bigObject = createBigObject("URI parsing test", execId = 10) - assertStandardBucket(pointer) - assert(pointer.getObjectKey.nonEmpty) - assert(!pointer.getObjectKey.startsWith("/")) + assertStandardBucket(bigObject) + assert(bigObject.getObjectKey.nonEmpty) + assert(!bigObject.getObjectKey.startsWith("/")) BigObjectManager.delete(10) } + + // ======================================== + // Object-Oriented API Tests + // ======================================== + + test("BigObjectManager.create() should create and register a big object") { + createMockExecution(11) + val data = "Test data for BigObjectManager.create()" + val stream = new ByteArrayInputStream(data.getBytes) + + val bigObject = BigObjectManager.create(stream, 11, "operator-11") + + assertStandardBucket(bigObject) + + val record = getDSLContext + .selectFrom(BIG_OBJECT) + .where(BIG_OBJECT.EXECUTION_ID.eq(11).and(BIG_OBJECT.OPERATOR_ID.eq("operator-11"))) + .fetchOne() + + assert(record != null) + assert(record.getUri == bigObject.getUri) + + BigObjectManager.delete(11) + } + + test("BigObject.open() should read big object contents") { + val data = "Test data for bigObject.open()" + val bigObject = createBigObject(data, execId = 12) + + val stream = bigObject.open() + val readData = stream.read() + stream.close() + + assert(readData.sameElements(data.getBytes)) + + BigObjectManager.delete(12) + } + + test("BigObjectManager.create() and BigObject.open() should work together end-to-end") { + createMockExecution(13) + val data = "End-to-end test data" + + // Create using BigObjectManager + val bigObject = + BigObjectManager.create(new ByteArrayInputStream(data.getBytes), 13, "operator-13") + + // Read using BigObject instance method + val stream = bigObject.open() + val readData = stream.read() + stream.close() + + assert(readData.sameElements(data.getBytes)) + + BigObjectManager.delete(13) + } } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala index db687cc50da..b0be7fdf2ee 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala @@ -22,7 +22,7 @@ package org.apache.amber.operator.source.scan import 
org.apache.amber.core.executor.SourceOperatorExecutor import org.apache.amber.core.storage.DocumentFactory import org.apache.amber.core.tuple.AttributeTypeUtils.parseField -import org.apache.amber.core.tuple.TupleLike +import org.apache.amber.core.tuple.{BigObject, TupleLike} import org.apache.amber.util.JSONUtils.objectMapper import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream @@ -85,8 +85,8 @@ class FileScanSourceOpExec private[scan] ( case FileAttributeType.SINGLE_STRING => new String(toByteArray(entry), desc.fileEncoding.getCharset) case FileAttributeType.BIG_OBJECT => - // For big objects, create a big object pointer from the input stream - createBigObject(entry) + // For big objects, create a big object from the input stream + new BigObject(entry, FileScanSourceOpExec.this) case _ => parseField(toByteArray(entry), desc.attributeType.getType) }) TupleLike(fields.toSeq: _*) diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala index d99ca9a2bf4..509ba0bdf59 100644 --- a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala @@ -19,7 +19,7 @@ package org.apache.amber.operator.source.scan -import org.apache.amber.core.tuple.{AttributeType, BigObjectPointer, Schema, SchemaEnforceable} +import org.apache.amber.core.tuple.{AttributeType, BigObject, Schema, SchemaEnforceable} import org.apache.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfterAll import org.scalatest.flatspec.AnyFlatSpec @@ -120,7 +120,7 @@ class FileScanSourceOpExecSpec extends AnyFlatSpec with BeforeAndAfterAll { } // Execution Tests - it should "create BigObjectPointer when reading file with BIG_OBJECT type" in { + it should "create BigObject when reading file with BIG_OBJECT type" in { val desc = createDescriptor() desc.setResolvedFileName(URI.create(testFile.toUri.toString)) @@ -138,26 +138,26 @@ class FileScanSourceOpExecSpec extends AnyFlatSpec with BeforeAndAfterAll { .enforceSchema(desc.sourceSchema()) .getField[Any]("line") - assert(field.isInstanceOf[BigObjectPointer]) - assert(field.asInstanceOf[BigObjectPointer].getUri.startsWith("s3://")) + assert(field.isInstanceOf[BigObject]) + assert(field.asInstanceOf[BigObject].getUri.startsWith("s3://")) } catch { case e: Exception => info(s"S3 not configured: ${e.getMessage}") } } - // BigObjectPointer Tests - it should "create valid BigObjectPointer with correct URI parsing" in { - val pointer = new BigObjectPointer("s3://bucket/path/to/object") + // BigObject Tests + it should "create valid BigObject with correct URI parsing" in { + val pointer = new BigObject("s3://bucket/path/to/object") assert(pointer.getUri == "s3://bucket/path/to/object") assert(pointer.getBucketName == "bucket") assert(pointer.getObjectKey == "path/to/object") } - it should "reject invalid BigObjectPointer URIs" in { - assertThrows[IllegalArgumentException](new BigObjectPointer("http://invalid")) - assertThrows[IllegalArgumentException](new BigObjectPointer("not-a-uri")) - assertThrows[IllegalArgumentException](new BigObjectPointer(null)) + it should "reject invalid BigObject URIs" in { + assertThrows[IllegalArgumentException](new 
BigObject("http://invalid")) + assertThrows[IllegalArgumentException](new BigObject("not-a-uri")) + assertThrows[IllegalArgumentException](new BigObject(null)) } } From e84fd4a36032a838b648280a312768f135a4235d Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Tue, 18 Nov 2025 21:32:00 -0800 Subject: [PATCH 05/10] Add stream APIs --- .../apache/amber/core/tuple/BigObject.java | 44 +- .../service/util/BigObjectInputStream.scala | 107 +++++ .../service/util/BigObjectManager.scala | 110 ++--- .../service/util/BigObjectOutputStream.scala | 125 +++++ .../texera/service/util/S3StorageClient.scala | 18 - .../util/BigObjectInputStreamSpec.scala | 352 ++++++++++++++ .../service/util/BigObjectManagerSpec.scala | 429 ++++++++++++++---- .../util/BigObjectOutputStreamSpec.scala | 252 ++++++++++ .../service/util/S3StorageClientSpec.scala | 84 +--- .../source/scan/FileScanSourceOpExec.scala | 17 +- .../scan/FileScanSourceOpExecSpec.scala | 2 +- 11 files changed, 1256 insertions(+), 284 deletions(-) create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala create mode 100644 common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectInputStreamSpec.scala create mode 100644 common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectOutputStreamSpec.scala diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java index a0a1b288a6b..67ceaf01c92 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java @@ -24,41 +24,56 @@ import com.fasterxml.jackson.annotation.JsonValue; import org.apache.amber.core.executor.OperatorExecutor; import org.apache.texera.service.util.BigObjectManager; -import org.apache.texera.service.util.BigObjectStream; -import java.io.InputStream; import java.net.URI; import java.util.Objects; /** * BigObject represents a reference to a large object stored in S3. - * The reference is formatted as a URI: s3://bucket/path/to/object + * + * Each BigObject is identified by an S3 URI (s3://bucket/path/to/object). + * BigObjects are automatically tracked and cleaned up when the workflow execution completes. */ public class BigObject { private final String uri; /** - * Creates a BigObject from an S3 URI (primarily for deserialization). + * Creates a BigObject from an existing S3 URI. + * Used primarily for deserialization from JSON. * * @param uri S3 URI in the format s3://bucket/path/to/object + * @throws IllegalArgumentException if URI is null or doesn't start with "s3://" */ @JsonCreator public BigObject(@JsonProperty("uri") String uri) { - if (uri == null || !uri.startsWith("s3://")) { - throw new IllegalArgumentException("BigObject URI must start with 's3://' but was: " + uri); + if (uri == null) { + throw new IllegalArgumentException("BigObject URI cannot be null"); + } + if (!uri.startsWith("s3://")) { + throw new IllegalArgumentException( + "BigObject URI must start with 's3://', got: " + uri + ); } this.uri = uri; } /** - * Creates a new BigObject by uploading the stream to S3. + * Creates a new BigObject for writing data. + * Generates a unique S3 URI and registers it with the execution context. 
+ * + * Usage example: * - * @param stream The input stream containing the data to store - * @param executor The operator executor that provides execution context + * BigObject bigObject = new BigObject(executor); + * try (BigObjectOutputStream out = new BigObjectOutputStream(bigObject)) { + * out.write(data); + * } + * // bigObject is now ready to be added to tuples + * + * @param executor The operator executor providing execution context */ - public BigObject(InputStream stream, OperatorExecutor executor) { - this(BigObjectManager.create(stream, executor.executionId(), executor.operatorId()).getUri()); + public BigObject(OperatorExecutor executor) { + this(BigObjectManager.create(executor)); } @JsonValue @@ -75,13 +90,6 @@ public String getObjectKey() { return path.startsWith("/") ? path.substring(1) : path; } - /** - * Opens this big object for reading. Caller must close the returned stream. - */ - public BigObjectStream open() { - return BigObjectManager.open(this); - } - @Override public String toString() { return uri; diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala new file mode 100644 index 00000000000..841bdc8cb27 --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import org.apache.amber.core.tuple.BigObject + +import java.io.InputStream + +/** + * InputStream for reading BigObject data from S3. + * + * The underlying S3 download is lazily initialized on first read. + * The stream will fail if the S3 object doesn't exist when read is attempted. + * + * Usage: + * {{{ + * val bigObject: BigObject = ... 
+ * try (val in = new BigObjectInputStream(bigObject)) { + * val bytes = in.readAllBytes() + * } + * }}} + */ +class BigObjectInputStream(bigObject: BigObject) extends InputStream { + + require(bigObject != null, "BigObject cannot be null") + + // Lazy initialization - downloads only when first read() is called + private lazy val underlying: InputStream = + S3StorageClient.downloadObject(bigObject.getBucketName, bigObject.getObjectKey) + + @volatile private var closed = false + + override def read(): Int = { + ensureOpen() + underlying.read() + } + + override def read(b: Array[Byte], off: Int, len: Int): Int = { + ensureOpen() + underlying.read(b, off, len) + } + + override def readAllBytes(): Array[Byte] = { + ensureOpen() + underlying.readAllBytes() + } + + override def readNBytes(n: Int): Array[Byte] = { + ensureOpen() + underlying.readNBytes(n) + } + + override def skip(n: Long): Long = { + ensureOpen() + underlying.skip(n) + } + + override def available(): Int = { + ensureOpen() + underlying.available() + } + + override def close(): Unit = { + if (!closed) { + closed = true + if (underlying != null) { // Only close if initialized + underlying.close() + } + } + } + + override def markSupported(): Boolean = { + ensureOpen() + underlying.markSupported() + } + + override def mark(readlimit: Int): Unit = { + ensureOpen() + underlying.mark(readlimit) + } + + override def reset(): Unit = { + ensureOpen() + underlying.reset() + } + + private def ensureOpen(): Unit = { + if (closed) throw new java.io.IOException("Stream is closed") + } +} diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala index 888a4d29c40..9d647067749 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala @@ -20,99 +20,62 @@ package org.apache.texera.service.util import com.typesafe.scalalogging.LazyLogging +import org.apache.amber.core.executor.OperatorExecutor import org.apache.amber.core.tuple.BigObject import org.apache.texera.dao.SqlServer import org.apache.texera.dao.jooq.generated.Tables.BIG_OBJECT -import java.io.{Closeable, InputStream} import java.util.UUID import scala.jdk.CollectionConverters._ /** - * Stream for reading big objects from S3. - * All read methods guarantee to read the full requested amount (or until EOF). + * Manages the lifecycle of BigObjects stored in S3. + * + * Handles creation, tracking, and cleanup of large objects that exceed + * normal tuple size limits. Objects are automatically cleaned up when + * their associated workflow execution completes. */ -class BigObjectStream(private val inputStream: InputStream) extends Closeable { - - @volatile private var closed = false - - private def ensureOpen(): Unit = - if (closed) throw new IllegalStateException("Stream is closed") - - /** Reads all remaining bytes. */ - def read(): Array[Byte] = { - ensureOpen() - val out = new java.io.ByteArrayOutputStream() - val chunk = new Array[Byte](8192) - var n = inputStream.read(chunk) - while (n != -1) { - out.write(chunk, 0, n) - n = inputStream.read(chunk) - } - out.toByteArray - } - - /** Reads exactly `len` bytes (or until EOF). 
*/ - def read(len: Int): Array[Byte] = { - ensureOpen() - if (len <= 0) return Array.emptyByteArray - - val buffer = new Array[Byte](len) - var total = 0 - while (total < len) { - val n = inputStream.read(buffer, total, len - total) - if (n == -1) return if (total == 0) Array.emptyByteArray else buffer.take(total) - total += n - } - buffer - } - - override def close(): Unit = if (!closed) { closed = true; inputStream.close() } - def isClosed: Boolean = closed -} - -/** Manages the lifecycle of large objects (>2GB) stored in S3. */ object BigObjectManager extends LazyLogging { private val DEFAULT_BUCKET = "texera-big-objects" private lazy val db = SqlServer.getInstance().createDSLContext() - /** Creates a big object from InputStream, uploads to S3, and registers in database. */ - def create(stream: InputStream, executionId: Int, operatorId: String): BigObject = { - + /** + * Creates a new BigObject reference and registers it for tracking. + * The actual data upload happens separately via BigObjectOutputStream. + * + * @param executor The operator executor providing execution context + * @return S3 URI string for the new BigObject (format: s3://bucket/key) + * @throws RuntimeException if database registration fails + */ + def create(executor: OperatorExecutor): String = { S3StorageClient.createBucketIfNotExist(DEFAULT_BUCKET) val objectKey = s"${System.currentTimeMillis()}/${UUID.randomUUID()}" val uri = s"s3://$DEFAULT_BUCKET/$objectKey" - S3StorageClient.uploadObject(DEFAULT_BUCKET, objectKey, stream) - try { db.insertInto(BIG_OBJECT) .columns(BIG_OBJECT.EXECUTION_ID, BIG_OBJECT.OPERATOR_ID, BIG_OBJECT.URI) - .values(Int.box(executionId), operatorId, uri) + .values(Int.box(executor.executionId), executor.operatorId, uri) .execute() - logger.debug(s"Created big object: eid=$executionId, opid=$operatorId, uri=$uri") + + logger.debug( + s"Created BigObject: eid=${executor.executionId}, opid=${executor.operatorId}, uri=$uri" + ) } catch { case e: Exception => - logger.error(s"Failed to register, cleaning up: $uri", e) - try S3StorageClient.deleteObject(DEFAULT_BUCKET, objectKey) - catch { case _: Exception => } - throw new RuntimeException(s"Failed to create big object: ${e.getMessage}", e) + throw new RuntimeException(s"Failed to register BigObject in database: ${e.getMessage}", e) } - new BigObject(uri) + uri } - /** Opens a big object for reading. */ - def open(ptr: BigObject): BigObjectStream = { - require( - S3StorageClient.objectExists(ptr.getBucketName, ptr.getObjectKey), - s"Big object does not exist: ${ptr.getUri}" - ) - new BigObjectStream(S3StorageClient.downloadObject(ptr.getBucketName, ptr.getObjectKey)) - } - - /** Deletes all big objects associated with an execution ID. */ + /** + * Deletes all BigObjects associated with an execution. + * Removes both the S3 objects and database records. 
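+   * Individual S3 delete failures are logged and skipped; the database records are removed regardless.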
+ * + * @param executionId The execution ID whose BigObjects should be deleted + */ def delete(executionId: Int): Unit = { val uris = db .select(BIG_OBJECT.URI) @@ -122,19 +85,24 @@ object BigObjectManager extends LazyLogging { .asScala .toList - if (uris.isEmpty) return logger.debug(s"No big objects for execution $executionId") + if (uris.isEmpty) { + logger.debug(s"No BigObjects found for execution $executionId") + return + } - logger.info(s"Deleting ${uris.size} big object(s) for execution $executionId") + logger.info(s"Deleting ${uris.size} BigObject(s) for execution $executionId") uris.foreach { uri => try { - val ptr = new BigObject(uri) - S3StorageClient.deleteObject(ptr.getBucketName, ptr.getObjectKey) + val bigObject = new BigObject(uri) + S3StorageClient.deleteObject(bigObject.getBucketName, bigObject.getObjectKey) } catch { - case e: Exception => logger.error(s"Failed to delete: $uri", e) + case e: Exception => logger.error(s"Failed to delete BigObject from S3: $uri", e) } } - db.deleteFrom(BIG_OBJECT).where(BIG_OBJECT.EXECUTION_ID.eq(executionId)).execute() + db.deleteFrom(BIG_OBJECT) + .where(BIG_OBJECT.EXECUTION_ID.eq(executionId)) + .execute() } } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala new file mode 100644 index 00000000000..45c43e889d7 --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import com.typesafe.scalalogging.LazyLogging +import org.apache.amber.core.tuple.BigObject + +import java.io.{IOException, OutputStream, PipedInputStream, PipedOutputStream} +import java.util.concurrent.atomic.AtomicReference +import scala.concurrent.{Await, ExecutionContext, Future} +import scala.concurrent.duration.Duration + +/** + * OutputStream for streaming BigObject data to S3. + * + * Data is uploaded in the background using multipart upload as you write. + * Call close() to complete the upload and ensure all data is persisted. + * + * Usage: + * {{{ + * val bigObject = new BigObject(executor) + * try (val out = new BigObjectOutputStream(bigObject)) { + * out.write(myBytes) + * } + * // bigObject is now ready to use + * }}} + * + * Note: Not thread-safe. Do not access from multiple threads concurrently. 
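+ * Writes block once the internal 64 KB pipe buffer is full, so a slow upload back-pressures the writer.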
+ * + * @param bigObject The BigObject reference to write to + */ +class BigObjectOutputStream(bigObject: BigObject) extends OutputStream with LazyLogging { + + private val PIPE_BUFFER_SIZE = 64 * 1024 // 64KB + + require(bigObject != null, "BigObject cannot be null") + + private val bucketName: String = bigObject.getBucketName + private val objectKey: String = bigObject.getObjectKey + private implicit val ec: ExecutionContext = ExecutionContext.global + + // Pipe: we write to pipedOut, and S3 reads from pipedIn + private val pipedIn = new PipedInputStream(PIPE_BUFFER_SIZE) + private val pipedOut = new PipedOutputStream(pipedIn) + + @volatile private var closed = false + private val uploadException = new AtomicReference[Option[Throwable]](None) + + // Start background upload immediately + private val uploadFuture: Future[Unit] = Future { + try { + S3StorageClient.createBucketIfNotExist(bucketName) + S3StorageClient.uploadObject(bucketName, objectKey, pipedIn) + logger.debug(s"Upload completed: ${bigObject.getUri}") + } catch { + case e: Exception => + uploadException.set(Some(e)) + logger.error(s"Upload failed: ${bigObject.getUri}", e) + } finally { + pipedIn.close() + } + } + + override def write(b: Int): Unit = { + ensureOpen() + pipedOut.write(b) + } + + override def write(b: Array[Byte], off: Int, len: Int): Unit = { + ensureOpen() + pipedOut.write(b, off, len) + } + + override def flush(): Unit = { + if (!closed) pipedOut.flush() + } + + /** + * Closes the stream and completes the S3 upload. + * Blocks until upload is complete. Throws IOException if upload failed. + */ + override def close(): Unit = { + if (closed) return + + closed = true + try { + pipedOut.close() + Await.result(uploadFuture, Duration.Inf) + checkUploadSuccess() + } catch { + case e: IOException => throw e + case e: Exception => + S3StorageClient.deleteObject(bucketName, objectKey) + throw new IOException(s"Failed to complete upload: ${e.getMessage}", e) + } + } + + private def ensureOpen(): Unit = { + if (closed) throw new IOException("Stream is closed") + checkUploadSuccess() + } + + private def checkUploadSuccess(): Unit = { + uploadException.get().foreach { ex => + throw new IOException("Background upload failed", ex) + } + } +} diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala index 3c533477052..e2e4979299e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala @@ -244,24 +244,6 @@ object S3StorageClient { ) } - /** - * Checks if an object exists in S3. - * - * @param bucketName The S3 bucket name. - * @param objectKey The object key (path) in S3. - * @return True if the object exists, false otherwise. - */ - def objectExists(bucketName: String, objectKey: String): Boolean = { - try { - s3Client.headObject( - HeadObjectRequest.builder().bucket(bucketName).key(objectKey).build() - ) - true - } catch { - case _: Exception => false - } - } - /** * Deletes a single object from S3. 
* diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectInputStreamSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectInputStreamSpec.scala new file mode 100644 index 00000000000..a163326b9d8 --- /dev/null +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectInputStreamSpec.scala @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.service.util + +import org.apache.amber.core.tuple.BigObject +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} +import org.scalatest.funsuite.AnyFunSuite + +import java.io.{ByteArrayInputStream, IOException} +import scala.util.Random + +class BigObjectInputStreamSpec + extends AnyFunSuite + with S3StorageTestBase + with BeforeAndAfterAll + with BeforeAndAfterEach { + + private val testBucketName = "test-big-object-input-stream" + + override def beforeAll(): Unit = { + super.beforeAll() + S3StorageClient.createBucketIfNotExist(testBucketName) + } + + override def afterAll(): Unit = { + try { + S3StorageClient.deleteDirectory(testBucketName, "") + } catch { + case _: Exception => // Ignore cleanup errors + } + super.afterAll() + } + + // Helper methods + private def createTestObject(key: String, data: Array[Byte]): BigObject = { + S3StorageClient.uploadObject(testBucketName, key, new ByteArrayInputStream(data)) + new BigObject(s"s3://$testBucketName/$key") + } + + private def createTestObject(key: String, data: String): BigObject = + createTestObject(key, data.getBytes) + + private def generateRandomData(size: Int): Array[Byte] = + Array.fill[Byte](size)((Random.nextInt(256) - 128).toByte) + + private def withStream[T](bigObject: BigObject)(f: BigObjectInputStream => T): T = { + val stream = new BigObjectInputStream(bigObject) + try { + f(stream) + } finally { + stream.close() + } + } + + private def assertThrowsIOExceptionWhenClosed(operation: BigObjectInputStream => Unit): Unit = { + val bigObject = createTestObject(s"test/closed-${Random.nextInt()}.txt", "data") + val stream = new BigObjectInputStream(bigObject) + stream.close() + val exception = intercept[IOException](operation(stream)) + assert(exception.getMessage.contains("Stream is closed")) + } + + // Constructor Tests + test("constructor should reject null BigObject") { + val exception = intercept[IllegalArgumentException] { + new BigObjectInputStream(null) + } + assert(exception.getMessage.contains("BigObject cannot be null")) + } + + test("constructor should accept valid BigObject") { + val bigObject = createTestObject("test/valid.txt", "test data") + withStream(bigObject) { _ => } + } + + // read() Tests + test("read() should read single bytes correctly") { + val bigObject = 
createTestObject("test/single-byte.txt", "Hello") + withStream(bigObject) { stream => + assert(stream.read() == 'H'.toByte) + assert(stream.read() == 'e'.toByte) + assert(stream.read() == 'l'.toByte) + assert(stream.read() == 'l'.toByte) + assert(stream.read() == 'o'.toByte) + assert(stream.read() == -1) // EOF + } + } + + test("read() should return -1 for empty object") { + val bigObject = createTestObject("test/empty.txt", "") + withStream(bigObject) { stream => + assert(stream.read() == -1) + } + } + + // read(byte[], int, int) Tests + test("read(byte[], int, int) should read data into buffer") { + val testData = "Hello, World!" + val bigObject = createTestObject("test/buffer-read.txt", testData) + withStream(bigObject) { stream => + val buffer = new Array[Byte](testData.length) + val bytesRead = stream.read(buffer, 0, buffer.length) + assert(bytesRead == testData.length) + assert(new String(buffer) == testData) + } + } + + test("read(byte[], int, int) should handle partial reads and offsets") { + val testData = "Hello, World!" + val bigObject = createTestObject("test/partial.txt", testData) + withStream(bigObject) { stream => + // Test partial read + val buffer1 = new Array[Byte](5) + assert(stream.read(buffer1, 0, 5) == 5) + assert(new String(buffer1) == "Hello") + } + + // Test offset + withStream(bigObject) { stream => + val buffer2 = new Array[Byte](20) + assert(stream.read(buffer2, 5, 10) == 10) + assert(new String(buffer2, 5, 10) == "Hello, Wor") + } + } + + test("read(byte[], int, int) should return -1 at EOF") { + val bigObject = createTestObject("test/eof.txt", "test") + withStream(bigObject) { stream => + val buffer = new Array[Byte](10) + stream.read(buffer, 0, 10) + assert(stream.read(buffer, 0, 10) == -1) + } + } + + // readAllBytes() Tests + test("readAllBytes() should read entire object") { + val testData = "Hello, World! This is a test." + val bigObject = createTestObject("test/read-all.txt", testData) + withStream(bigObject) { stream => + assert(new String(stream.readAllBytes()) == testData) + } + } + + test("readAllBytes() should handle large objects") { + val largeData = generateRandomData(1024 * 1024) // 1MB + val bigObject = createTestObject("test/large.bin", largeData) + withStream(bigObject) { stream => + val bytes = stream.readAllBytes() + assert(bytes.length == largeData.length) + assert(bytes.sameElements(largeData)) + } + } + + test("readAllBytes() should return empty array for empty object") { + val bigObject = createTestObject("test/empty-all.txt", "") + withStream(bigObject) { stream => + assert(stream.readAllBytes().length == 0) + } + } + + // readNBytes() Tests + test("readNBytes() should read exactly N bytes") { + val testData = "Hello, World! This is a test." 
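+    // createTestObject uploads the bytes directly via S3StorageClient, bypassing BigObjectOutputStream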
+ val bigObject = createTestObject("test/read-n.txt", testData) + withStream(bigObject) { stream => + val bytes = stream.readNBytes(5) + assert(bytes.length == 5) + assert(new String(bytes) == "Hello") + } + } + + test("readNBytes() should handle EOF and zero") { + val bigObject = createTestObject("test/read-n-eof.txt", "Hello") + withStream(bigObject) { stream => + // Request more than available + val bytes = stream.readNBytes(100) + assert(bytes.length == 5) + assert(new String(bytes) == "Hello") + } + + // Test n=0 + withStream(bigObject) { stream => + assert(stream.readNBytes(0).length == 0) + } + } + + // skip() Tests + test("skip() should skip bytes correctly") { + val bigObject = createTestObject("test/skip.txt", "Hello, World!") + withStream(bigObject) { stream => + assert(stream.skip(7) == 7) + assert(stream.read() == 'W'.toByte) + } + } + + test("skip() should handle EOF and zero") { + val bigObject = createTestObject("test/skip-eof.txt", "Hello") + withStream(bigObject) { stream => + assert(stream.skip(100) == 5) + assert(stream.read() == -1) + } + + // Test n=0 + withStream(bigObject) { stream => + assert(stream.skip(0) == 0) + } + } + + // available() Tests + test("available() should return non-negative value") { + val bigObject = createTestObject("test/available.txt", "Hello, World!") + withStream(bigObject) { stream => + assert(stream.available() >= 0) + } + } + + // close() Tests + test("close() should be idempotent") { + val bigObject = createTestObject("test/close-idempotent.txt", "data") + val stream = new BigObjectInputStream(bigObject) + stream.close() + stream.close() // Should not throw + stream.close() // Should not throw + } + + test("close() should prevent further operations") { + val bigObject = createTestObject("test/close-prevents.txt", "data") + val stream = new BigObjectInputStream(bigObject) + stream.close() + + intercept[IOException] { stream.read() } + intercept[IOException] { stream.readAllBytes() } + intercept[IOException] { stream.readNBytes(10) } + intercept[IOException] { stream.skip(10) } + intercept[IOException] { stream.available() } + } + + test("close() should work without reading (lazy initialization)") { + val bigObject = createTestObject("test/close-lazy.txt", "data") + val stream = new BigObjectInputStream(bigObject) + stream.close() // Should not throw + } + + // Closed stream tests - consolidated + test("operations should throw IOException when stream is closed") { + assertThrowsIOExceptionWhenClosed(_.read()) + assertThrowsIOExceptionWhenClosed(_.read(new Array[Byte](10), 0, 10)) + assertThrowsIOExceptionWhenClosed(_.readAllBytes()) + assertThrowsIOExceptionWhenClosed(_.readNBytes(10)) + assertThrowsIOExceptionWhenClosed(_.skip(10)) + assertThrowsIOExceptionWhenClosed(_.available()) + assertThrowsIOExceptionWhenClosed(_.mark(100)) + assertThrowsIOExceptionWhenClosed(_.reset()) + } + + // mark/reset Tests + test("markSupported() should delegate to underlying stream") { + val bigObject = createTestObject("test/mark.txt", "data") + withStream(bigObject) { stream => + val supported = stream.markSupported() + assert(!supported || supported) // Just verify it's callable + } + } + + test("mark() and reset() should delegate to underlying stream") { + val bigObject = createTestObject("test/mark-reset.txt", "data") + withStream(bigObject) { stream => + if (stream.markSupported()) { + stream.mark(100) + stream.read() + stream.reset() + } + // If not supported, methods should still be callable + } + } + + // Lazy initialization Tests + test("lazy 
initialization should not download until first read") { + val bigObject = createTestObject("test/lazy-init.txt", "data") + val stream = new BigObjectInputStream(bigObject) + // Creating the stream should not trigger download + // Reading should trigger download + try { + assert(stream.read() == 'd'.toByte) + } finally { + stream.close() + } + } + + // Integration Tests + test("should handle chunked reading of large objects") { + val largeData = generateRandomData(10 * 1024) // 10KB + val bigObject = createTestObject("test/chunked.bin", largeData) + withStream(bigObject) { stream => + val buffer = new Array[Byte](1024) + val output = new java.io.ByteArrayOutputStream() + var bytesRead = 0 + + while ({ + bytesRead = stream.read(buffer, 0, buffer.length) + bytesRead != -1 + }) { + output.write(buffer, 0, bytesRead) + } + + val result = output.toByteArray + assert(result.length == largeData.length) + assert(result.sameElements(largeData)) + } + } + + test("should handle multiple streams reading same object") { + val testData = "Shared data" + val bigObject = createTestObject("test/shared.txt", testData) + + val stream1 = new BigObjectInputStream(bigObject) + val stream2 = new BigObjectInputStream(bigObject) + + try { + assert(new String(stream1.readAllBytes()) == testData) + assert(new String(stream2.readAllBytes()) == testData) + } finally { + stream1.close() + stream2.close() + } + } + + test("should preserve binary data integrity") { + val binaryData = Array[Byte](0, 1, 2, 127, -128, -1, 50, 100) + val bigObject = createTestObject("test/binary.bin", binaryData) + withStream(bigObject) { stream => + assert(stream.readAllBytes().sameElements(binaryData)) + } + } +} diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala index b25acb7d130..72c61cde590 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala @@ -19,14 +19,13 @@ package org.apache.texera.service.util -import org.apache.amber.core.tuple.BigObject +import org.apache.amber.core.executor.OperatorExecutor +import org.apache.amber.core.tuple.{BigObject, Tuple, TupleLike} import org.apache.texera.dao.MockTexeraDB import org.apache.texera.dao.jooq.generated.Tables._ import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} import org.scalatest.funsuite.AnyFunSuite -import java.io.ByteArrayInputStream - class BigObjectManagerSpec extends AnyFunSuite with MockTexeraDB @@ -95,6 +94,15 @@ class BigObjectManagerSpec .execute() } + /** Creates a mock OperatorExecutor for testing. */ + private def createMockExecutor(execId: Int, opId: String): OperatorExecutor = { + val executor = new OperatorExecutor { + override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = Iterator.empty + } + executor.initializeExecutionContext(execId, opId) + executor + } + /** Creates a big object from string data and returns it. */ private def createBigObject( data: String, @@ -102,17 +110,15 @@ class BigObjectManagerSpec opId: String = "test-op" ): BigObject = { createMockExecution(execId) - BigObjectManager.create(new ByteArrayInputStream(data.getBytes), execId, opId) - } - - /** Creates a BigObjectStream from test data. 
*/ - private def createStream(data: String): BigObjectStream = - new BigObjectStream(new ByteArrayInputStream(data.getBytes)) - - /** Verifies that an object exists in S3. */ - private def assertObjectExists(pointer: BigObject, shouldExist: Boolean = true): Unit = { - val exists = S3StorageClient.objectExists(pointer.getBucketName, pointer.getObjectKey) - assert(exists == shouldExist) + val executor = createMockExecutor(execId, opId) + val bigObject = new BigObject(executor) + val out = new BigObjectOutputStream(bigObject) + try { + out.write(data.getBytes) + } finally { + out.close() + } + bigObject } /** Verifies standard bucket name. */ @@ -122,79 +128,111 @@ class BigObjectManagerSpec } // ======================================== - // BigObjectStream Tests + // BigObjectInputStream Tests (Standard Java InputStream) // ======================================== - test("BigObjectStream should read all bytes from stream") { + test("BigObjectInputStream should read all bytes from stream") { val data = "Hello, World! This is a test." - val stream = createStream(data) + val bigObject = createBigObject(data, execId = 100) - assert(stream.read().sameElements(data.getBytes)) + val stream = new BigObjectInputStream(bigObject) + assert(stream.readAllBytes().sameElements(data.getBytes)) stream.close() + + BigObjectManager.delete(100) } - test("BigObjectStream should read exact number of bytes") { - val stream = createStream("0123456789ABCDEF") - val result = stream.read(10) + test("BigObjectInputStream should read exact number of bytes") { + val bigObject = createBigObject("0123456789ABCDEF", execId = 101) + + val stream = new BigObjectInputStream(bigObject) + val result = stream.readNBytes(10) assert(result.length == 10) assert(result.sameElements("0123456789".getBytes)) stream.close() + + BigObjectManager.delete(101) } - test("BigObjectStream should handle reading more bytes than available") { + test("BigObjectInputStream should handle reading more bytes than available") { val data = "Short" - val stream = createStream(data) - val result = stream.read(100) + val bigObject = createBigObject(data, execId = 102) + + val stream = new BigObjectInputStream(bigObject) + val result = stream.readNBytes(100) assert(result.length == data.length) assert(result.sameElements(data.getBytes)) stream.close() - } - test("BigObjectStream should return empty array for 0 or negative byte reads") { - val stream = createStream("Test") - assert(stream.read(0).isEmpty) - assert(stream.read(-5).isEmpty) - stream.close() + BigObjectManager.delete(102) } - test("BigObjectStream should return empty array at EOF") { - val stream = createStream("EOF") - stream.read() // Read all data - assert(stream.read(10).isEmpty) + test("BigObjectInputStream should support standard single-byte read") { + val bigObject = createBigObject("ABC", execId = 103) + + val stream = new BigObjectInputStream(bigObject) + assert(stream.read() == 65) // 'A' + assert(stream.read() == 66) // 'B' + assert(stream.read() == 67) // 'C' + assert(stream.read() == -1) // EOF stream.close() + + BigObjectManager.delete(103) } - test("BigObjectStream should track closed state correctly") { - val stream = createStream("test") - assert(!stream.isClosed) + test("BigObjectInputStream should return -1 at EOF") { + val bigObject = createBigObject("EOF", execId = 104) + + val stream = new BigObjectInputStream(bigObject) + stream.readAllBytes() // Read all data + assert(stream.read() == -1) stream.close() - assert(stream.isClosed) + + BigObjectManager.delete(104) } - 
test("BigObjectStream should throw exception when reading from closed stream") { - val stream = createStream("test") + test("BigObjectInputStream should throw exception when reading from closed stream") { + val bigObject = createBigObject("test", execId = 105) + + val stream = new BigObjectInputStream(bigObject) stream.close() - assertThrows[IllegalStateException](stream.read()) - assertThrows[IllegalStateException](stream.read(10)) + assertThrows[java.io.IOException](stream.read()) + assertThrows[java.io.IOException](stream.readAllBytes()) + + BigObjectManager.delete(105) } - test("BigObjectStream should handle multiple close calls") { - val stream = createStream("test") + test("BigObjectInputStream should handle multiple close calls") { + val bigObject = createBigObject("test", execId = 106) + + val stream = new BigObjectInputStream(bigObject) stream.close() stream.close() // Should not throw - assert(stream.isClosed) + + BigObjectManager.delete(106) } - test("BigObjectStream should read large data correctly") { + test("BigObjectInputStream should read large data correctly") { val largeData = Array.fill[Byte](20000)((scala.util.Random.nextInt(256) - 128).toByte) - val stream = new BigObjectStream(new ByteArrayInputStream(largeData)) - - val result = stream.read() + createMockExecution(107) + val executor = createMockExecutor(107, "test-op") + val bigObject = new BigObject(executor) + val out = new BigObjectOutputStream(bigObject) + try { + out.write(largeData) + } finally { + out.close() + } + + val stream = new BigObjectInputStream(bigObject) + val result = stream.readAllBytes() assert(result.sameElements(largeData)) stream.close() + + BigObjectManager.delete(107) } // ======================================== @@ -215,38 +253,54 @@ class BigObjectManagerSpec assert(record.getUri == pointer.getUri) } - test("BigObjectManager should open and read a big object") { + test("BigObjectInputStream should open and read a big object") { val data = "Hello from big object!" 
val pointer = createBigObject(data, execId = 2) - val stream = BigObjectManager.open(pointer) - val readData = stream.read() + val stream = new BigObjectInputStream(pointer) + val readData = stream.readAllBytes() stream.close() assert(readData.sameElements(data.getBytes)) } - test("BigObjectManager should fail to open non-existent big object") { + test("BigObjectInputStream should fail to open non-existent big object") { val fakeBigObject = new BigObject("s3://texera-big-objects/nonexistent/file") - assertThrows[IllegalArgumentException](BigObjectManager.open(fakeBigObject)) + val stream = new BigObjectInputStream(fakeBigObject) + + try { + intercept[Exception] { + stream.read() + } + } finally { + try { stream.close() } + catch { case _: Exception => } + } } test("BigObjectManager should delete big objects by execution ID") { val execId = 3 createMockExecution(execId) - val pointer1 = - BigObjectManager.create(new ByteArrayInputStream("Object 1".getBytes), execId, "op-1") - val pointer2 = - BigObjectManager.create(new ByteArrayInputStream("Object 2".getBytes), execId, "op-2") - - assertObjectExists(pointer1) - assertObjectExists(pointer2) + val executor1 = createMockExecutor(execId, "op-1") + val pointer1 = new BigObject(executor1) + val out1 = new BigObjectOutputStream(pointer1) + try { + out1.write("Object 1".getBytes) + } finally { + out1.close() + } + + val executor2 = createMockExecutor(execId, "op-2") + val pointer2 = new BigObject(executor2) + val out2 = new BigObjectOutputStream(pointer2) + try { + out2.write("Object 2".getBytes) + } finally { + out2.close() + } BigObjectManager.delete(execId) - - assertObjectExists(pointer1, shouldExist = false) - assertObjectExists(pointer2, shouldExist = false) assert( getDSLContext.selectFrom(BIG_OBJECT).where(BIG_OBJECT.EXECUTION_ID.eq(execId)).fetch().isEmpty ) @@ -261,10 +315,6 @@ class BigObjectManagerSpec val pointer2 = createBigObject("Test data", execId = 5) BigObjectManager.delete(4) - - assertObjectExists(pointer1, shouldExist = false) - assertObjectExists(pointer2, shouldExist = true) - BigObjectManager.delete(5) } @@ -272,7 +322,6 @@ class BigObjectManagerSpec val pointer = createBigObject("Test bucket creation", execId = 6) assertStandardBucket(pointer) - assertObjectExists(pointer) BigObjectManager.delete(6) } @@ -280,10 +329,17 @@ class BigObjectManagerSpec test("BigObjectManager should handle large objects correctly") { val largeData = Array.fill[Byte](6 * 1024 * 1024)((scala.util.Random.nextInt(256) - 128).toByte) createMockExecution(7) - val pointer = BigObjectManager.create(new ByteArrayInputStream(largeData), 7, "large-op") - - val stream = BigObjectManager.open(pointer) - val readData = stream.read() + val executor = createMockExecutor(7, "large-op") + val pointer = new BigObject(executor) + val out = new BigObjectOutputStream(pointer) + try { + out.write(largeData) + } finally { + out.close() + } + + val stream = new BigObjectInputStream(pointer) + val readData = stream.readAllBytes() stream.close() assert(readData.sameElements(largeData)) @@ -292,10 +348,24 @@ class BigObjectManagerSpec test("BigObjectManager should generate unique URIs for different objects") { createMockExecution(8) - val data = new ByteArrayInputStream("Unique URI test".getBytes) - val pointer1 = BigObjectManager.create(data, 8, "test-op") - val pointer2 = - BigObjectManager.create(new ByteArrayInputStream("Unique URI test".getBytes), 8, "test-op") + val testData = "Unique URI test".getBytes + val executor = createMockExecutor(8, "test-op") + val 
pointer1 = new BigObject(executor) + val out1 = new BigObjectOutputStream(pointer1) + try { + out1.write(testData) + } finally { + out1.close() + } + + val executor2 = createMockExecutor(8, "test-op") + val pointer2 = new BigObject(executor2) + val out2 = new BigObjectOutputStream(pointer2) + try { + out2.write(testData) + } finally { + out2.close() + } assert(pointer1.getUri != pointer2.getUri) assert(pointer1.getObjectKey != pointer2.getObjectKey) @@ -303,16 +373,16 @@ class BigObjectManagerSpec BigObjectManager.delete(8) } - test("BigObjectManager should handle multiple reads from the same big object") { + test("BigObjectInputStream should handle multiple reads from the same big object") { val data = "Multiple reads test data" val pointer = createBigObject(data, execId = 9) - val stream1 = BigObjectManager.open(pointer) - val readData1 = stream1.read() + val stream1 = new BigObjectInputStream(pointer) + val readData1 = stream1.readAllBytes() stream1.close() - val stream2 = BigObjectManager.open(pointer) - val readData2 = stream2.read() + val stream2 = new BigObjectInputStream(pointer) + val readData2 = stream2.readAllBytes() stream2.close() assert(readData1.sameElements(data.getBytes)) @@ -335,12 +405,18 @@ class BigObjectManagerSpec // Object-Oriented API Tests // ======================================== - test("BigObjectManager.create() should create and register a big object") { + test("BigObject with BigObjectOutputStream should create and register a big object") { createMockExecution(11) - val data = "Test data for BigObjectManager.create()" - val stream = new ByteArrayInputStream(data.getBytes) + val data = "Test data for BigObject with BigObjectOutputStream" + val executor = createMockExecutor(11, "operator-11") - val bigObject = BigObjectManager.create(stream, 11, "operator-11") + val bigObject = new BigObject(executor) + val out = new BigObjectOutputStream(bigObject) + try { + out.write(data.getBytes) + } finally { + out.close() + } assertStandardBucket(bigObject) @@ -355,12 +431,12 @@ class BigObjectManagerSpec BigObjectManager.delete(11) } - test("BigObject.open() should read big object contents") { - val data = "Test data for bigObject.open()" + test("BigObjectInputStream constructor should read big object contents") { + val data = "Test data for BigObjectInputStream constructor" val bigObject = createBigObject(data, execId = 12) - val stream = bigObject.open() - val readData = stream.read() + val stream = new BigObjectInputStream(bigObject) + val readData = stream.readAllBytes() stream.close() assert(readData.sameElements(data.getBytes)) @@ -368,21 +444,176 @@ class BigObjectManagerSpec BigObjectManager.delete(12) } - test("BigObjectManager.create() and BigObject.open() should work together end-to-end") { + test("BigObjectOutputStream and BigObjectInputStream should work together end-to-end") { createMockExecution(13) val data = "End-to-end test data" - - // Create using BigObjectManager - val bigObject = - BigObjectManager.create(new ByteArrayInputStream(data.getBytes), 13, "operator-13") - - // Read using BigObject instance method - val stream = bigObject.open() - val readData = stream.read() + val executor = createMockExecutor(13, "operator-13") + + // Create using streaming API + val bigObject = new BigObject(executor) + val out = new BigObjectOutputStream(bigObject) + try { + out.write(data.getBytes) + } finally { + out.close() + } + + // Read using standard constructor + val stream = new BigObjectInputStream(bigObject) + val readData = stream.readAllBytes() 
stream.close() assert(readData.sameElements(data.getBytes)) BigObjectManager.delete(13) } + + // ======================================== + // BigObjectOutputStream Tests (New Symmetric API) + // ======================================== + + test("BigObjectOutputStream should write and upload data to S3") { + createMockExecution(200) + val executor = createMockExecutor(200, "operator-200") + val data = "Test data for BigObjectOutputStream" + + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write(data.getBytes) + outStream.close() + + assertStandardBucket(bigObject) + + // Verify data can be read back + val inStream = new BigObjectInputStream(bigObject) + val readData = inStream.readAllBytes() + inStream.close() + + assert(readData.sameElements(data.getBytes)) + + BigObjectManager.delete(200) + } + + test("BigObjectOutputStream should register big object in database") { + createMockExecution(201) + val executor = createMockExecutor(201, "operator-201") + val data = "Database registration test" + + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write(data.getBytes) + outStream.close() + + val record = getDSLContext + .selectFrom(BIG_OBJECT) + .where(BIG_OBJECT.EXECUTION_ID.eq(201).and(BIG_OBJECT.OPERATOR_ID.eq("operator-201"))) + .fetchOne() + + assert(record != null) + assert(record.getUri == bigObject.getUri) + + BigObjectManager.delete(201) + } + + test("BigObjectOutputStream should handle large data correctly") { + createMockExecution(202) + val executor = createMockExecutor(202, "operator-202") + val largeData = Array.fill[Byte](8 * 1024 * 1024)((scala.util.Random.nextInt(256) - 128).toByte) + + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write(largeData) + outStream.close() + + // Verify data integrity + val inStream = new BigObjectInputStream(bigObject) + val readData = inStream.readAllBytes() + inStream.close() + + assert(readData.sameElements(largeData)) + + BigObjectManager.delete(202) + } + + test("BigObjectOutputStream should handle multiple writes") { + createMockExecution(203) + val executor = createMockExecutor(203, "operator-203") + + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write("Hello ".getBytes) + outStream.write("World".getBytes) + outStream.write("!".getBytes) + outStream.close() + + val inStream = new BigObjectInputStream(bigObject) + val readData = inStream.readAllBytes() + inStream.close() + + assert(readData.sameElements("Hello World!".getBytes)) + + BigObjectManager.delete(203) + } + + test("BigObjectOutputStream should throw exception when writing to closed stream") { + createMockExecution(204) + val executor = createMockExecutor(204, "operator-204") + + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write("test".getBytes) + outStream.close() + + assertThrows[java.io.IOException](outStream.write("more".getBytes)) + + BigObjectManager.delete(204) + } + + test("BigObjectOutputStream should handle close() being called multiple times") { + createMockExecution(205) + val executor = createMockExecutor(205, "operator-205") + + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write("test".getBytes) + outStream.close() + outStream.close() // Should not throw + + BigObjectManager.delete(205) + } + + test("New 
BigObject(executor) constructor should create unique URIs") { + createMockExecution(206) + val executor1 = createMockExecutor(206, "operator-206") + val executor2 = createMockExecutor(206, "operator-206") + + val bigObject1 = new BigObject(executor1) + val bigObject2 = new BigObject(executor2) + + assert(bigObject1.getUri != bigObject2.getUri) + assert(bigObject1.getObjectKey != bigObject2.getObjectKey) + + BigObjectManager.delete(206) + } + + test("BigObject(executor) and BigObjectOutputStream API should be symmetric with input") { + createMockExecution(207) + val executor = createMockExecutor(207, "operator-207") + val data = "Symmetric API test" + + // Write using new symmetric API + val bigObject = new BigObject(executor) + val outStream = new BigObjectOutputStream(bigObject) + outStream.write(data.getBytes) + outStream.close() + + // Read using symmetric API + val inStream = new BigObjectInputStream(bigObject) + val readData = inStream.readAllBytes() + inStream.close() + + assert(readData.sameElements(data.getBytes)) + + BigObjectManager.delete(207) + } } diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectOutputStreamSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectOutputStreamSpec.scala new file mode 100644 index 00000000000..14fdfa1ddb0 --- /dev/null +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectOutputStreamSpec.scala @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.service.util + +import org.apache.amber.core.tuple.BigObject +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} +import org.scalatest.funsuite.AnyFunSuite + +import java.io.IOException +import scala.util.Random + +class BigObjectOutputStreamSpec + extends AnyFunSuite + with S3StorageTestBase + with BeforeAndAfterAll + with BeforeAndAfterEach { + + private val testBucketName = "test-big-object-output-stream" + + override def beforeAll(): Unit = { + super.beforeAll() + S3StorageClient.createBucketIfNotExist(testBucketName) + } + + override def afterAll(): Unit = { + try { + S3StorageClient.deleteDirectory(testBucketName, "") + } catch { + case _: Exception => // Ignore cleanup errors + } + super.afterAll() + } + + // Helper methods + private def createBigObject(key: String): BigObject = + new BigObject(s"s3://$testBucketName/$key") + + private def generateRandomData(size: Int): Array[Byte] = + Array.fill[Byte](size)((Random.nextInt(256) - 128).toByte) + + private def withStream[T](bigObject: BigObject)(f: BigObjectOutputStream => T): T = { + val stream = new BigObjectOutputStream(bigObject) + try f(stream) + finally stream.close() + } + + private def readBack(bigObject: BigObject): Array[Byte] = { + val inputStream = new BigObjectInputStream(bigObject) + try inputStream.readAllBytes() + finally inputStream.close() + } + + private def writeAndVerify(key: String, data: Array[Byte]): Unit = { + val bigObject = createBigObject(key) + withStream(bigObject)(_.write(data, 0, data.length)) + assert(readBack(bigObject).sameElements(data)) + } + + // === Constructor Tests === + test("should reject null BigObject") { + val exception = intercept[IllegalArgumentException](new BigObjectOutputStream(null)) + assert(exception.getMessage.contains("BigObject cannot be null")) + } + + // === Basic Write Tests === + test("should write single bytes correctly") { + val bigObject = createBigObject("test/single-bytes.txt") + withStream(bigObject) { stream => + "Hello".foreach(c => stream.write(c.toByte)) + } + assert(new String(readBack(bigObject)) == "Hello") + } + + test("should write byte arrays correctly") { + val testData = "Hello, World!".getBytes + writeAndVerify("test/array-write.txt", testData) + } + + test("should handle partial writes with offset and length") { + val testData = "Hello, World!".getBytes + val bigObject = createBigObject("test/partial-write.txt") + + withStream(bigObject) { stream => + stream.write(testData, 0, 5) // "Hello" + stream.write(testData, 7, 5) // "World" + } + + assert(new String(readBack(bigObject)) == "HelloWorld") + } + + test("should handle multiple consecutive writes") { + val bigObject = createBigObject("test/multiple-writes.txt") + withStream(bigObject) { stream => + stream.write("Hello".getBytes) + stream.write(", ".getBytes) + stream.write("World!".getBytes) + } + assert(new String(readBack(bigObject)) == "Hello, World!") + } + + // === Stream Lifecycle Tests === + test("flush should not throw") { + val bigObject = createBigObject("test/flush.txt") + withStream(bigObject) { stream => + stream.write("test".getBytes) + stream.flush() + stream.write(" data".getBytes) + } + assert(new String(readBack(bigObject)) == "test data") + } + + test("close should be idempotent") { + val bigObject = createBigObject("test/close-idempotent.txt") + val stream = new BigObjectOutputStream(bigObject) + stream.write("data".getBytes) + stream.close() + stream.close() // Should not throw + stream.flush() // Should not throw after close + 
assert(new String(readBack(bigObject)) == "data") + } + + test("close should handle empty stream") { + val bigObject = createBigObject("test/empty-stream.txt") + val stream = new BigObjectOutputStream(bigObject) + stream.close() + assert(readBack(bigObject).length == 0) + } + + // === Error Handling === + test("write operations should throw IOException when stream is closed") { + val bigObject = createBigObject("test/closed-stream.txt") + val stream = new BigObjectOutputStream(bigObject) + stream.close() + + val ex1 = intercept[IOException](stream.write('A'.toByte)) + assert(ex1.getMessage.contains("Stream is closed")) + + val ex2 = intercept[IOException](stream.write("test".getBytes)) + assert(ex2.getMessage.contains("Stream is closed")) + } + + // === Large Data Tests === + test("should handle large data (1MB)") { + val largeData = generateRandomData(1024 * 1024) + writeAndVerify("test/large-1mb.bin", largeData) + } + + test("should handle very large data (10MB)") { + val veryLargeData = generateRandomData(10 * 1024 * 1024) + writeAndVerify("test/large-10mb.bin", veryLargeData) + } + + test("should handle chunked writes") { + val totalSize = 1024 * 1024 // 1MB + val chunkSize = 8 * 1024 // 8KB + val data = generateRandomData(totalSize) + val bigObject = createBigObject("test/chunked.bin") + + withStream(bigObject) { stream => + data.grouped(chunkSize).foreach(chunk => stream.write(chunk)) + } + + assert(readBack(bigObject).sameElements(data)) + } + + // === Binary Data Tests === + test("should preserve all byte values (0-255)") { + val allBytes = (0 until 256).map(_.toByte).toArray + writeAndVerify("test/all-bytes.bin", allBytes) + } + + // === Integration Tests === + test("should handle concurrent writes to different objects") { + val streams = (1 to 3).map { i => + val obj = createBigObject(s"test/concurrent-$i.txt") + val stream = new BigObjectOutputStream(obj) + (obj, stream, s"Data $i") + } + + try { + streams.foreach { case (_, stream, data) => stream.write(data.getBytes) } + } finally { + streams.foreach(_._2.close()) + } + + streams.foreach { + case (obj, _, expected) => + assert(new String(readBack(obj)) == expected) + } + } + + test("should overwrite existing object") { + val bigObject = createBigObject("test/overwrite.txt") + withStream(bigObject)(_.write("original data".getBytes)) + withStream(bigObject)(_.write("new data".getBytes)) + assert(new String(readBack(bigObject)) == "new data") + } + + test("should handle mixed write operations") { + val bigObject = createBigObject("test/mixed-writes.txt") + withStream(bigObject) { stream => + stream.write('A'.toByte) + stream.write(" test ".getBytes) + stream.write('B'.toByte) + val data = "Hello, World!".getBytes + stream.write(data, 7, 6) // "World!" 
+ } + assert(new String(readBack(bigObject)) == "A test BWorld!") + } + + // === Edge Cases === + test("should create bucket automatically") { + val newBucketName = s"new-bucket-${Random.nextInt(10000)}" + val bigObject = new BigObject(s"s3://$newBucketName/test/auto-create.txt") + + try { + withStream(bigObject)(_.write("test".getBytes)) + assert(new String(readBack(bigObject)) == "test") + } finally { + try S3StorageClient.deleteDirectory(newBucketName, "") + catch { case _: Exception => /* ignore */ } + } + } + + test("should handle rapid open/close cycles") { + (1 to 10).foreach { i => + withStream(createBigObject(s"test/rapid-$i.txt"))(_.write(s"data-$i".getBytes)) + } + + (1 to 10).foreach { i => + val result = readBack(createBigObject(s"test/rapid-$i.txt")) + assert(new String(result) == s"data-$i") + } + } +} diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala index 83927928c4a..a1662cf8c3f 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/S3StorageClientSpec.scala @@ -81,7 +81,6 @@ class S3StorageClientSpec assert(eTag != null) assert(eTag.nonEmpty) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) // Clean up S3StorageClient.deleteObject(testBucketName, objectKey) @@ -93,7 +92,6 @@ class S3StorageClientSpec val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("")) assert(eTag != null) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) // Clean up S3StorageClient.deleteObject(testBucketName, objectKey) @@ -108,7 +106,6 @@ class S3StorageClientSpec assert(eTag != null) assert(eTag.nonEmpty) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) // Verify the uploaded content val downloadedStream = S3StorageClient.downloadObject(testBucketName, objectKey) @@ -129,7 +126,6 @@ class S3StorageClientSpec val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) assert(eTag != null) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) // Clean up S3StorageClient.deleteObject(testBucketName, objectKey) @@ -231,54 +227,6 @@ class S3StorageClientSpec S3StorageClient.deleteObject(testBucketName, objectKey) } - // ======================================== - // objectExists Tests - // ======================================== - - test("objectExists should return true for existing object") { - val objectKey = "test/exists-test.txt" - S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("exists test")) - - assert(S3StorageClient.objectExists(testBucketName, objectKey)) - - // Clean up - S3StorageClient.deleteObject(testBucketName, objectKey) - } - - test("objectExists should return false for non-existent object") { - val nonExistentKey = "test/does-not-exist.txt" - - assert(!S3StorageClient.objectExists(testBucketName, nonExistentKey)) - } - - test("objectExists should return false for deleted object") { - val objectKey = "test/deleted-object.txt" - S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("to be deleted")) - - assert(S3StorageClient.objectExists(testBucketName, objectKey)) - - S3StorageClient.deleteObject(testBucketName, objectKey) - - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) - } - - test("objectExists should return false 
for non-existent bucket") { - val nonExistentBucket = "non-existent-bucket-12345" - val objectKey = "test/object.txt" - - assert(!S3StorageClient.objectExists(nonExistentBucket, objectKey)) - } - - test("objectExists should handle objects with special characters") { - val objectKey = "test/special/path with spaces & chars!@#.txt" - S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("special chars")) - - assert(S3StorageClient.objectExists(testBucketName, objectKey)) - - // Clean up - S3StorageClient.deleteObject(testBucketName, objectKey) - } - // ======================================== // deleteObject Tests // ======================================== @@ -287,11 +235,12 @@ class S3StorageClientSpec val objectKey = "test/delete-test.txt" S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream("delete me")) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) - S3StorageClient.deleteObject(testBucketName, objectKey) - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + // Verify deletion by attempting to download + assertThrows[Exception] { + S3StorageClient.downloadObject(testBucketName, objectKey) + } } test("deleteObject should not throw exception for non-existent object") { @@ -306,10 +255,13 @@ class S3StorageClientSpec val objectKey = "test/large-delete-test.bin" S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(largeData)) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) S3StorageClient.deleteObject(testBucketName, objectKey) - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) + + // Verify deletion by attempting to download + assertThrows[Exception] { + S3StorageClient.downloadObject(testBucketName, objectKey) + } } test("deleteObject should handle multiple deletions of the same object") { @@ -321,11 +273,9 @@ class S3StorageClientSpec ) S3StorageClient.deleteObject(testBucketName, objectKey) - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) // Second delete should not throw exception S3StorageClient.deleteObject(testBucketName, objectKey) - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) } // ======================================== @@ -340,9 +290,6 @@ class S3StorageClientSpec val eTag = S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) assert(eTag != null) - // Verify exists - assert(S3StorageClient.objectExists(testBucketName, objectKey)) - // Download val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) val downloadedData = new String(readInputStream(inputStream)) @@ -351,7 +298,6 @@ class S3StorageClientSpec // Delete S3StorageClient.deleteObject(testBucketName, objectKey) - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) } test("multiple objects can be managed independently") { @@ -367,19 +313,9 @@ class S3StorageClientSpec S3StorageClient.uploadObject(testBucketName, key, createInputStream(data)) } - // Verify all exist - objects.keys.foreach { key => - assert(S3StorageClient.objectExists(testBucketName, key)) - } - // Delete one object S3StorageClient.deleteObject(testBucketName, "test/object2.txt") - // Verify deletion and others still exist - assert(S3StorageClient.objectExists(testBucketName, "test/object1.txt")) - assert(!S3StorageClient.objectExists(testBucketName, "test/object2.txt")) - assert(S3StorageClient.objectExists(testBucketName, "test/object3.txt")) - // Clean up remaining objects S3StorageClient.deleteObject(testBucketName, 
"test/object1.txt") S3StorageClient.deleteObject(testBucketName, "test/object3.txt") @@ -390,7 +326,6 @@ class S3StorageClientSpec val testData = "Nested path test" S3StorageClient.uploadObject(testBucketName, objectKey, createInputStream(testData)) - assert(S3StorageClient.objectExists(testBucketName, objectKey)) val inputStream = S3StorageClient.downloadObject(testBucketName, objectKey) val downloadedData = new String(readInputStream(inputStream)) @@ -398,6 +333,5 @@ class S3StorageClientSpec assert(downloadedData == testData) S3StorageClient.deleteObject(testBucketName, objectKey) - assert(!S3StorageClient.objectExists(testBucketName, objectKey)) } } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala index b0be7fdf2ee..55a09f9ebef 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala @@ -24,6 +24,7 @@ import org.apache.amber.core.storage.DocumentFactory import org.apache.amber.core.tuple.AttributeTypeUtils.parseField import org.apache.amber.core.tuple.{BigObject, TupleLike} import org.apache.amber.util.JSONUtils.objectMapper +import org.apache.texera.service.util.BigObjectOutputStream import org.apache.commons.compress.archivers.ArchiveStreamFactory import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream import org.apache.commons.io.IOUtils.toByteArray @@ -85,8 +86,20 @@ class FileScanSourceOpExec private[scan] ( case FileAttributeType.SINGLE_STRING => new String(toByteArray(entry), desc.fileEncoding.getCharset) case FileAttributeType.BIG_OBJECT => - // For big objects, create a big object from the input stream - new BigObject(entry, FileScanSourceOpExec.this) + // For big objects, create reference and upload via streaming + val bigObject = new BigObject(FileScanSourceOpExec.this) + val out = new BigObjectOutputStream(bigObject) + try { + val buffer = new Array[Byte](8192) + var bytesRead = entry.read(buffer) + while (bytesRead != -1) { + out.write(buffer, 0, bytesRead) + bytesRead = entry.read(buffer) + } + } finally { + out.close() + } + bigObject case _ => parseField(toByteArray(entry), desc.attributeType.getType) }) TupleLike(fields.toSeq: _*) diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala index 509ba0bdf59..fe205d17ddf 100644 --- a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala @@ -158,6 +158,6 @@ class FileScanSourceOpExecSpec extends AnyFlatSpec with BeforeAndAfterAll { it should "reject invalid BigObject URIs" in { assertThrows[IllegalArgumentException](new BigObject("http://invalid")) assertThrows[IllegalArgumentException](new BigObject("not-a-uri")) - assertThrows[IllegalArgumentException](new BigObject(null)) + assertThrows[IllegalArgumentException](new BigObject(null.asInstanceOf[String])) } } From 4a8d9b48600d0fd820a428bbe6e282d9c80e7638 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Thu, 20 Nov 2025 17:28:54 -0800 Subject: [PATCH 06/10] Remove execution id 
and tables --- .../architecture/rpc/controlcommands.proto | 1 - .../RegionExecutionCoordinator.scala | 3 +- .../InitializeExecutorHandler.scala | 12 +- .../user/workflow/WorkflowResource.scala | 5 +- .../texera/web/service/WorkflowService.scala | 4 +- .../architecture/worker/WorkerSpec.scala | 3 +- .../core/executor/OperatorExecutor.scala | 15 - .../apache/amber/core/tuple/BigObject.java | 9 +- .../service/util/BigObjectManager.scala | 70 +---- .../service/util/BigObjectOutputStream.scala | 2 +- .../service/util/BigObjectManagerSpec.scala | 290 +++++------------- .../source/scan/FileScanSourceOpExec.scala | 2 +- .../scan/FileScanSourceOpExecSpec.scala | 1 - sql/texera_ddl.sql | 10 +- sql/updates/16.sql | 34 -- 15 files changed, 98 insertions(+), 363 deletions(-) delete mode 100644 sql/updates/16.sql diff --git a/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto index d596f8b0447..41f0976314c 100644 --- a/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/amber/engine/architecture/rpc/controlcommands.proto @@ -256,7 +256,6 @@ message InitializeExecutorRequest { int32 totalWorkerCount = 1; core.OpExecInitInfo opExecInitInfo = 2; bool isSource = 3; - int64 executionId = 4; } message UpdateExecutorRequest { diff --git a/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 7dd319fb98d..a83af49dde1 100644 --- a/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -361,8 +361,7 @@ class RegionExecutionCoordinator( InitializeExecutorRequest( workerConfigs.length, physicalOp.opExecInitInfo, - physicalOp.isSourceOperator, - physicalOp.executionId.id + physicalOp.isSourceOperator ), asyncRPCClient.mkContext(workerId) ) diff --git a/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala b/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala index d4d548a7f44..32a718606cb 100644 --- a/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala +++ b/amber/src/main/scala/org/apache/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala @@ -42,12 +42,7 @@ trait InitializeExecutorHandler { dp.serializationManager.setOpInitialization(req) val workerIdx = VirtualIdentityUtils.getWorkerIndex(actorId) val workerCount = req.totalWorkerCount - - val executionId = req.executionId.toInt - val operatorId = VirtualIdentityUtils.getPhysicalOpId(actorId).logicalOpId.id - - // Create the executor - val executor = req.opExecInitInfo match { + dp.executor = req.opExecInitInfo match { case OpExecWithClassName(className, descString) => ExecFactory.newExecFromJavaClassName(className, descString, workerIdx, workerCount) case OpExecWithCode(code, _) => @@ -57,11 +52,6 @@ trait InitializeExecutorHandler { case OpExecInitInfo.Empty => throw new IllegalArgumentException("Empty executor initialization info") } - - // Initialize execution context on the executor instance - executor.initializeExecutionContext(executionId, 
operatorId) - - dp.executor = executor EmptyReturn() } diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala index b132fc1d185..81081c636f7 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala @@ -601,10 +601,7 @@ class WorkflowResource extends LazyLogging { .asScala .toList - // Delete big objects - eids.foreach { eid => - BigObjectManager.delete(eid.toInt) - } + BigObjectManager.delete() // Collect all URIs related to executions for cleanup val uris = eids.flatMap { eid => diff --git a/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala b/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala index 35dd9fde27a..38d0ae93279 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala @@ -345,9 +345,7 @@ class WorkflowService( logger.debug(s"Error processing document at $uri: ${error.getMessage}") } } - // Delete big objects - BigObjectManager.delete(eid.id.toInt) + BigObjectManager.delete() } - } diff --git a/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala b/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala index 895f06c1371..1bc3a160783 100644 --- a/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala +++ b/amber/src/test/scala/org/apache/amber/engine/architecture/worker/WorkerSpec.scala @@ -194,8 +194,7 @@ class WorkerSpec InitializeExecutorRequest( 1, OpExecWithClassName("org.apache.amber.engine.architecture.worker.DummyOperatorExecutor"), - isSource = false, - 1 + isSource = false ), AsyncRPCContext(CONTROLLER, identifier1), 4 diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala index 8caa9a2f3e3..69e62a8f308 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/executor/OperatorExecutor.scala @@ -25,21 +25,6 @@ import org.apache.amber.core.workflow.PortIdentity trait OperatorExecutor { - // Execution context - private var _executionId: Option[Int] = None - private var _operatorId: Option[String] = None - - def executionId: Int = - _executionId.getOrElse(throw new IllegalStateException("Execution context not initialized")) - - def operatorId: String = - _operatorId.getOrElse(throw new IllegalStateException("Execution context not initialized")) - - final def initializeExecutionContext(execId: Int, opId: String): Unit = { - _executionId = Some(execId) - _operatorId = Some(opId) - } - def open(): Unit = {} def produceStateOnStart(port: Int): Option[State] = None diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java index 67ceaf01c92..2be14dc167c 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/BigObject.java @@ -60,20 +60,19 @@ public 
BigObject(@JsonProperty("uri") String uri) { /** * Creates a new BigObject for writing data. - * Generates a unique S3 URI and registers it with the execution context. + * Generates a unique S3 URI. * * Usage example: * - * BigObject bigObject = new BigObject(executor); + * BigObject bigObject = new BigObject(); * try (BigObjectOutputStream out = new BigObjectOutputStream(bigObject)) { * out.write(data); * } * // bigObject is now ready to be added to tuples * - * @param executor The operator executor providing execution context */ - public BigObject(OperatorExecutor executor) { - this(BigObjectManager.create(executor)); + public BigObject() { + this(BigObjectManager.create()); } @JsonValue diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala index 9d647067749..efa12ad5226 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala @@ -20,13 +20,8 @@ package org.apache.texera.service.util import com.typesafe.scalalogging.LazyLogging -import org.apache.amber.core.executor.OperatorExecutor -import org.apache.amber.core.tuple.BigObject -import org.apache.texera.dao.SqlServer -import org.apache.texera.dao.jooq.generated.Tables.BIG_OBJECT import java.util.UUID -import scala.jdk.CollectionConverters._ /** * Manages the lifecycle of BigObjects stored in S3. @@ -37,72 +32,37 @@ import scala.jdk.CollectionConverters._ */ object BigObjectManager extends LazyLogging { private val DEFAULT_BUCKET = "texera-big-objects" - private lazy val db = SqlServer.getInstance().createDSLContext() /** - * Creates a new BigObject reference and registers it for tracking. + * Creates a new BigObject reference. * The actual data upload happens separately via BigObjectOutputStream. * - * @param executor The operator executor providing execution context * @return S3 URI string for the new BigObject (format: s3://bucket/key) - * @throws RuntimeException if database registration fails */ - def create(executor: OperatorExecutor): String = { + def create(): String = { S3StorageClient.createBucketIfNotExist(DEFAULT_BUCKET) - val objectKey = s"${System.currentTimeMillis()}/${UUID.randomUUID()}" + val objectKey = s"objects/${System.currentTimeMillis()}/${UUID.randomUUID()}" val uri = s"s3://$DEFAULT_BUCKET/$objectKey" - try { - db.insertInto(BIG_OBJECT) - .columns(BIG_OBJECT.EXECUTION_ID, BIG_OBJECT.OPERATOR_ID, BIG_OBJECT.URI) - .values(Int.box(executor.executionId), executor.operatorId, uri) - .execute() - - logger.debug( - s"Created BigObject: eid=${executor.executionId}, opid=${executor.operatorId}, uri=$uri" - ) - } catch { - case e: Exception => - throw new RuntimeException(s"Failed to register BigObject in database: ${e.getMessage}", e) - } - uri } /** - * Deletes all BigObjects associated with an execution. - * Removes both the S3 objects and database records. + * Deletes all big objects from the bucket. 
* - * @param executionId The execution ID whose BigObjects should be deleted + * @throws Exception if the deletion fails + * @return Unit */ - def delete(executionId: Int): Unit = { - val uris = db - .select(BIG_OBJECT.URI) - .from(BIG_OBJECT) - .where(BIG_OBJECT.EXECUTION_ID.eq(executionId)) - .fetchInto(classOf[String]) - .asScala - .toList - - if (uris.isEmpty) { - logger.debug(s"No BigObjects found for execution $executionId") - return - } - - logger.info(s"Deleting ${uris.size} BigObject(s) for execution $executionId") - - uris.foreach { uri => - try { - val bigObject = new BigObject(uri) - S3StorageClient.deleteObject(bigObject.getBucketName, bigObject.getObjectKey) - } catch { - case e: Exception => logger.error(s"Failed to delete BigObject from S3: $uri", e) - } + def delete(): Unit = { + try { + S3StorageClient.deleteDirectory(DEFAULT_BUCKET, "objects") + logger.info(s"Successfully deleted all big objects from bucket: $DEFAULT_BUCKET") + } catch { + case e: Exception => + logger.error(s"Failed to delete big objects from bucket: $DEFAULT_BUCKET", e) + throw e } - - db.deleteFrom(BIG_OBJECT) - .where(BIG_OBJECT.EXECUTION_ID.eq(executionId)) - .execute() } + } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala index 45c43e889d7..00d44ab03ac 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala @@ -35,7 +35,7 @@ import scala.concurrent.duration.Duration * * Usage: * {{{ - * val bigObject = new BigObject(executor) + * val bigObject = new BigObject() * try (val out = new BigObjectOutputStream(bigObject)) { * out.write(myBytes) * } diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala index 72c61cde590..ff003005ff7 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala @@ -19,99 +19,14 @@ package org.apache.texera.service.util -import org.apache.amber.core.executor.OperatorExecutor -import org.apache.amber.core.tuple.{BigObject, Tuple, TupleLike} -import org.apache.texera.dao.MockTexeraDB -import org.apache.texera.dao.jooq.generated.Tables._ -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} +import org.apache.amber.core.tuple.BigObject import org.scalatest.funsuite.AnyFunSuite -class BigObjectManagerSpec - extends AnyFunSuite - with MockTexeraDB - with S3StorageTestBase - with BeforeAndAfterAll - with BeforeAndAfterEach { - - override def beforeAll(): Unit = { - super.beforeAll() - initializeDBAndReplaceDSLContext() - } - - override def afterAll(): Unit = { - shutdownDB() - super.afterAll() - } - - override def beforeEach(): Unit = { - getDSLContext.deleteFrom(BIG_OBJECT).execute() - } - - /** Creates mock workflow execution records needed for foreign key constraints. 
*/ - private def createMockExecution(executionId: Int): Unit = { - val dsl = getDSLContext - val id = Int.box(executionId) - - dsl - .insertInto(USER) - .columns(USER.UID, USER.NAME, USER.EMAIL, USER.PASSWORD) - .values(id, s"test_user_$executionId", s"test$executionId@test.com", "password") - .onConflictDoNothing() - .execute() - - dsl - .insertInto(WORKFLOW) - .columns(WORKFLOW.WID, WORKFLOW.NAME, WORKFLOW.CONTENT) - .values(id, s"test_workflow_$executionId", "{}") - .onConflictDoNothing() - .execute() - - dsl - .insertInto(WORKFLOW_OF_USER) - .columns(WORKFLOW_OF_USER.UID, WORKFLOW_OF_USER.WID) - .values(id, id) - .onConflictDoNothing() - .execute() - - dsl - .insertInto(WORKFLOW_VERSION) - .columns(WORKFLOW_VERSION.VID, WORKFLOW_VERSION.WID, WORKFLOW_VERSION.CONTENT) - .values(id, id, "{}") - .onConflictDoNothing() - .execute() - - dsl - .insertInto(WORKFLOW_EXECUTIONS) - .columns( - WORKFLOW_EXECUTIONS.EID, - WORKFLOW_EXECUTIONS.VID, - WORKFLOW_EXECUTIONS.UID, - WORKFLOW_EXECUTIONS.STATUS, - WORKFLOW_EXECUTIONS.ENVIRONMENT_VERSION - ) - .values(id, id, id, Short.box(1.toShort), "test") - .onConflictDoNothing() - .execute() - } - - /** Creates a mock OperatorExecutor for testing. */ - private def createMockExecutor(execId: Int, opId: String): OperatorExecutor = { - val executor = new OperatorExecutor { - override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = Iterator.empty - } - executor.initializeExecutionContext(execId, opId) - executor - } +class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { /** Creates a big object from string data and returns it. */ - private def createBigObject( - data: String, - execId: Int, - opId: String = "test-op" - ): BigObject = { - createMockExecution(execId) - val executor = createMockExecutor(execId, opId) - val bigObject = new BigObject(executor) + private def createBigObject(data: String): BigObject = { + val bigObject = new BigObject() val out = new BigObjectOutputStream(bigObject) try { out.write(data.getBytes) @@ -133,17 +48,17 @@ class BigObjectManagerSpec test("BigObjectInputStream should read all bytes from stream") { val data = "Hello, World! This is a test." 
- val bigObject = createBigObject(data, execId = 100) + val bigObject = createBigObject(data) val stream = new BigObjectInputStream(bigObject) assert(stream.readAllBytes().sameElements(data.getBytes)) stream.close() - BigObjectManager.delete(100) + BigObjectManager.delete() } test("BigObjectInputStream should read exact number of bytes") { - val bigObject = createBigObject("0123456789ABCDEF", execId = 101) + val bigObject = createBigObject("0123456789ABCDEF") val stream = new BigObjectInputStream(bigObject) val result = stream.readNBytes(10) @@ -152,12 +67,12 @@ class BigObjectManagerSpec assert(result.sameElements("0123456789".getBytes)) stream.close() - BigObjectManager.delete(101) + BigObjectManager.delete() } test("BigObjectInputStream should handle reading more bytes than available") { val data = "Short" - val bigObject = createBigObject(data, execId = 102) + val bigObject = createBigObject(data) val stream = new BigObjectInputStream(bigObject) val result = stream.readNBytes(100) @@ -166,11 +81,11 @@ class BigObjectManagerSpec assert(result.sameElements(data.getBytes)) stream.close() - BigObjectManager.delete(102) + BigObjectManager.delete() } test("BigObjectInputStream should support standard single-byte read") { - val bigObject = createBigObject("ABC", execId = 103) + val bigObject = createBigObject("ABC") val stream = new BigObjectInputStream(bigObject) assert(stream.read() == 65) // 'A' @@ -179,22 +94,22 @@ class BigObjectManagerSpec assert(stream.read() == -1) // EOF stream.close() - BigObjectManager.delete(103) + BigObjectManager.delete() } test("BigObjectInputStream should return -1 at EOF") { - val bigObject = createBigObject("EOF", execId = 104) + val bigObject = createBigObject("EOF") val stream = new BigObjectInputStream(bigObject) stream.readAllBytes() // Read all data assert(stream.read() == -1) stream.close() - BigObjectManager.delete(104) + BigObjectManager.delete() } test("BigObjectInputStream should throw exception when reading from closed stream") { - val bigObject = createBigObject("test", execId = 105) + val bigObject = createBigObject("test") val stream = new BigObjectInputStream(bigObject) stream.close() @@ -202,24 +117,22 @@ class BigObjectManagerSpec assertThrows[java.io.IOException](stream.read()) assertThrows[java.io.IOException](stream.readAllBytes()) - BigObjectManager.delete(105) + BigObjectManager.delete() } test("BigObjectInputStream should handle multiple close calls") { - val bigObject = createBigObject("test", execId = 106) + val bigObject = createBigObject("test") val stream = new BigObjectInputStream(bigObject) stream.close() stream.close() // Should not throw - BigObjectManager.delete(106) + BigObjectManager.delete() } test("BigObjectInputStream should read large data correctly") { val largeData = Array.fill[Byte](20000)((scala.util.Random.nextInt(256) - 128).toByte) - createMockExecution(107) - val executor = createMockExecutor(107, "test-op") - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val out = new BigObjectOutputStream(bigObject) try { out.write(largeData) @@ -232,30 +145,22 @@ class BigObjectManagerSpec assert(result.sameElements(largeData)) stream.close() - BigObjectManager.delete(107) + BigObjectManager.delete() } // ======================================== // BigObjectManager Tests // ======================================== - test("BigObjectManager should create and register a big object") { - val pointer = createBigObject("Test big object data", execId = 1, opId = "operator-1") + test("BigObjectManager 
should create a big object") { + val pointer = createBigObject("Test big object data") assertStandardBucket(pointer) - - val record = getDSLContext - .selectFrom(BIG_OBJECT) - .where(BIG_OBJECT.EXECUTION_ID.eq(1).and(BIG_OBJECT.OPERATOR_ID.eq("operator-1"))) - .fetchOne() - - assert(record != null) - assert(record.getUri == pointer.getUri) } test("BigObjectInputStream should open and read a big object") { val data = "Hello from big object!" - val pointer = createBigObject(data, execId = 2) + val pointer = createBigObject(data) val stream = new BigObjectInputStream(pointer) val readData = stream.readAllBytes() @@ -278,12 +183,8 @@ class BigObjectManagerSpec } } - test("BigObjectManager should delete big objects by execution ID") { - val execId = 3 - createMockExecution(execId) - - val executor1 = createMockExecutor(execId, "op-1") - val pointer1 = new BigObject(executor1) + test("BigObjectManager should delete all big objects") { + val pointer1 = new BigObject() val out1 = new BigObjectOutputStream(pointer1) try { out1.write("Object 1".getBytes) @@ -291,8 +192,7 @@ class BigObjectManagerSpec out1.close() } - val executor2 = createMockExecutor(execId, "op-2") - val pointer2 = new BigObject(executor2) + val pointer2 = new BigObject() val out2 = new BigObjectOutputStream(pointer2) try { out2.write("Object 2".getBytes) @@ -300,37 +200,31 @@ class BigObjectManagerSpec out2.close() } - BigObjectManager.delete(execId) - assert( - getDSLContext.selectFrom(BIG_OBJECT).where(BIG_OBJECT.EXECUTION_ID.eq(execId)).fetch().isEmpty - ) + BigObjectManager.delete() } test("BigObjectManager should handle delete with no objects gracefully") { - BigObjectManager.delete(9999) // Should not throw exception + BigObjectManager.delete() // Should not throw exception } - test("BigObjectManager should not delete objects from different executions") { - val pointer1 = createBigObject("Test data", execId = 4) - val pointer2 = createBigObject("Test data", execId = 5) + test("BigObjectManager should delete all objects") { + val pointer1 = createBigObject("Test data") + val pointer2 = createBigObject("Test data") - BigObjectManager.delete(4) - BigObjectManager.delete(5) + BigObjectManager.delete() } test("BigObjectManager should create bucket if it doesn't exist") { - val pointer = createBigObject("Test bucket creation", execId = 6) + val pointer = createBigObject("Test bucket creation") assertStandardBucket(pointer) - BigObjectManager.delete(6) + BigObjectManager.delete() } test("BigObjectManager should handle large objects correctly") { val largeData = Array.fill[Byte](6 * 1024 * 1024)((scala.util.Random.nextInt(256) - 128).toByte) - createMockExecution(7) - val executor = createMockExecutor(7, "large-op") - val pointer = new BigObject(executor) + val pointer = new BigObject() val out = new BigObjectOutputStream(pointer) try { out.write(largeData) @@ -343,14 +237,12 @@ class BigObjectManagerSpec stream.close() assert(readData.sameElements(largeData)) - BigObjectManager.delete(7) + BigObjectManager.delete() } test("BigObjectManager should generate unique URIs for different objects") { - createMockExecution(8) val testData = "Unique URI test".getBytes - val executor = createMockExecutor(8, "test-op") - val pointer1 = new BigObject(executor) + val pointer1 = new BigObject() val out1 = new BigObjectOutputStream(pointer1) try { out1.write(testData) @@ -358,8 +250,7 @@ class BigObjectManagerSpec out1.close() } - val executor2 = createMockExecutor(8, "test-op") - val pointer2 = new BigObject(executor2) + val pointer2 = new 
BigObject() val out2 = new BigObjectOutputStream(pointer2) try { out2.write(testData) @@ -370,12 +261,12 @@ class BigObjectManagerSpec assert(pointer1.getUri != pointer2.getUri) assert(pointer1.getObjectKey != pointer2.getObjectKey) - BigObjectManager.delete(8) + BigObjectManager.delete() } test("BigObjectInputStream should handle multiple reads from the same big object") { val data = "Multiple reads test data" - val pointer = createBigObject(data, execId = 9) + val pointer = createBigObject(data) val stream1 = new BigObjectInputStream(pointer) val readData1 = stream1.readAllBytes() @@ -388,29 +279,27 @@ class BigObjectManagerSpec assert(readData1.sameElements(data.getBytes)) assert(readData2.sameElements(data.getBytes)) - BigObjectManager.delete(9) + BigObjectManager.delete() } test("BigObjectManager should properly parse bucket name and object key from big object") { - val bigObject = createBigObject("URI parsing test", execId = 10) + val bigObject = createBigObject("URI parsing test") assertStandardBucket(bigObject) assert(bigObject.getObjectKey.nonEmpty) assert(!bigObject.getObjectKey.startsWith("/")) - BigObjectManager.delete(10) + BigObjectManager.delete() } // ======================================== // Object-Oriented API Tests // ======================================== - test("BigObject with BigObjectOutputStream should create and register a big object") { - createMockExecution(11) + test("BigObject with BigObjectOutputStream should create a big object") { val data = "Test data for BigObject with BigObjectOutputStream" - val executor = createMockExecutor(11, "operator-11") - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val out = new BigObjectOutputStream(bigObject) try { out.write(data.getBytes) @@ -420,20 +309,12 @@ class BigObjectManagerSpec assertStandardBucket(bigObject) - val record = getDSLContext - .selectFrom(BIG_OBJECT) - .where(BIG_OBJECT.EXECUTION_ID.eq(11).and(BIG_OBJECT.OPERATOR_ID.eq("operator-11"))) - .fetchOne() - - assert(record != null) - assert(record.getUri == bigObject.getUri) - - BigObjectManager.delete(11) + BigObjectManager.delete() } test("BigObjectInputStream constructor should read big object contents") { val data = "Test data for BigObjectInputStream constructor" - val bigObject = createBigObject(data, execId = 12) + val bigObject = createBigObject(data) val stream = new BigObjectInputStream(bigObject) val readData = stream.readAllBytes() @@ -441,16 +322,14 @@ class BigObjectManagerSpec assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete(12) + BigObjectManager.delete() } test("BigObjectOutputStream and BigObjectInputStream should work together end-to-end") { - createMockExecution(13) val data = "End-to-end test data" - val executor = createMockExecutor(13, "operator-13") // Create using streaming API - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val out = new BigObjectOutputStream(bigObject) try { out.write(data.getBytes) @@ -465,7 +344,7 @@ class BigObjectManagerSpec assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete(13) + BigObjectManager.delete() } // ======================================== @@ -473,11 +352,9 @@ class BigObjectManagerSpec // ======================================== test("BigObjectOutputStream should write and upload data to S3") { - createMockExecution(200) - val executor = createMockExecutor(200, "operator-200") val data = "Test data for BigObjectOutputStream" - val bigObject = new BigObject(executor) + val bigObject = new BigObject() 
val outStream = new BigObjectOutputStream(bigObject) outStream.write(data.getBytes) outStream.close() @@ -491,36 +368,26 @@ class BigObjectManagerSpec assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete(200) + BigObjectManager.delete() } - test("BigObjectOutputStream should register big object in database") { - createMockExecution(201) - val executor = createMockExecutor(201, "operator-201") + test("BigObjectOutputStream should create big object") { val data = "Database registration test" - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val outStream = new BigObjectOutputStream(bigObject) outStream.write(data.getBytes) outStream.close() - val record = getDSLContext - .selectFrom(BIG_OBJECT) - .where(BIG_OBJECT.EXECUTION_ID.eq(201).and(BIG_OBJECT.OPERATOR_ID.eq("operator-201"))) - .fetchOne() - - assert(record != null) - assert(record.getUri == bigObject.getUri) + assertStandardBucket(bigObject) - BigObjectManager.delete(201) + BigObjectManager.delete() } test("BigObjectOutputStream should handle large data correctly") { - createMockExecution(202) - val executor = createMockExecutor(202, "operator-202") val largeData = Array.fill[Byte](8 * 1024 * 1024)((scala.util.Random.nextInt(256) - 128).toByte) - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val outStream = new BigObjectOutputStream(bigObject) outStream.write(largeData) outStream.close() @@ -532,14 +399,11 @@ class BigObjectManagerSpec assert(readData.sameElements(largeData)) - BigObjectManager.delete(202) + BigObjectManager.delete() } test("BigObjectOutputStream should handle multiple writes") { - createMockExecution(203) - val executor = createMockExecutor(203, "operator-203") - - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val outStream = new BigObjectOutputStream(bigObject) outStream.write("Hello ".getBytes) outStream.write("World".getBytes) @@ -552,57 +416,45 @@ class BigObjectManagerSpec assert(readData.sameElements("Hello World!".getBytes)) - BigObjectManager.delete(203) + BigObjectManager.delete() } test("BigObjectOutputStream should throw exception when writing to closed stream") { - createMockExecution(204) - val executor = createMockExecutor(204, "operator-204") - - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val outStream = new BigObjectOutputStream(bigObject) outStream.write("test".getBytes) outStream.close() assertThrows[java.io.IOException](outStream.write("more".getBytes)) - BigObjectManager.delete(204) + BigObjectManager.delete() } test("BigObjectOutputStream should handle close() being called multiple times") { - createMockExecution(205) - val executor = createMockExecutor(205, "operator-205") - - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val outStream = new BigObjectOutputStream(bigObject) outStream.write("test".getBytes) outStream.close() outStream.close() // Should not throw - BigObjectManager.delete(205) + BigObjectManager.delete() } - test("New BigObject(executor) constructor should create unique URIs") { - createMockExecution(206) - val executor1 = createMockExecutor(206, "operator-206") - val executor2 = createMockExecutor(206, "operator-206") - - val bigObject1 = new BigObject(executor1) - val bigObject2 = new BigObject(executor2) + test("New BigObject() constructor should create unique URIs") { + val bigObject1 = new BigObject() + val bigObject2 = new BigObject() assert(bigObject1.getUri != bigObject2.getUri) assert(bigObject1.getObjectKey != 
bigObject2.getObjectKey) - BigObjectManager.delete(206) + BigObjectManager.delete() } - test("BigObject(executor) and BigObjectOutputStream API should be symmetric with input") { - createMockExecution(207) - val executor = createMockExecutor(207, "operator-207") + test("BigObject() and BigObjectOutputStream API should be symmetric with input") { val data = "Symmetric API test" // Write using new symmetric API - val bigObject = new BigObject(executor) + val bigObject = new BigObject() val outStream = new BigObjectOutputStream(bigObject) outStream.write(data.getBytes) outStream.close() @@ -614,6 +466,6 @@ class BigObjectManagerSpec assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete(207) + BigObjectManager.delete() } } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala index 55a09f9ebef..c039b6e2d8a 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExec.scala @@ -87,7 +87,7 @@ class FileScanSourceOpExec private[scan] ( new String(toByteArray(entry), desc.fileEncoding.getCharset) case FileAttributeType.BIG_OBJECT => // For big objects, create reference and upload via streaming - val bigObject = new BigObject(FileScanSourceOpExec.this) + val bigObject = new BigObject() val out = new BigObjectOutputStream(bigObject) try { val buffer = new Array[Byte](8192) diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala index fe205d17ddf..07b09f0a268 100644 --- a/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/source/scan/FileScanSourceOpExecSpec.scala @@ -125,7 +125,6 @@ class FileScanSourceOpExecSpec extends AnyFlatSpec with BeforeAndAfterAll { desc.setResolvedFileName(URI.create(testFile.toUri.toString)) val executor = new FileScanSourceOpExec(objectMapper.writeValueAsString(desc)) - executor.initializeExecutionContext(1, "test-op") try { executor.open() diff --git a/sql/texera_ddl.sql b/sql/texera_ddl.sql index a7db9ebe15f..5a122b19eb7 100644 --- a/sql/texera_ddl.sql +++ b/sql/texera_ddl.sql @@ -442,12 +442,4 @@ BEGIN END LOOP; END $$; --- END Fulltext search index creation (DO NOT EDIT THIS LINE) - -CREATE TABLE big_object ( - execution_id INT NOT NULL, - operator_id VARCHAR(100) NOT NULL, - uri TEXT NOT NULL UNIQUE, - creation_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - FOREIGN KEY (execution_id) REFERENCES workflow_executions(eid) ON DELETE CASCADE -); \ No newline at end of file +-- END Fulltext search index creation (DO NOT EDIT THIS LINE) \ No newline at end of file diff --git a/sql/updates/16.sql b/sql/updates/16.sql deleted file mode 100644 index bc762bb3469..00000000000 --- a/sql/updates/16.sql +++ /dev/null @@ -1,34 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. 
The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- ============================================ --- 1. Connect to the texera_db database --- ============================================ -\c texera_db - -SET search_path TO texera_db; - --- ============================================ --- 2. Update the table schema --- ============================================ -CREATE TABLE big_object ( - execution_id INT NOT NULL, - operator_id VARCHAR(100) NOT NULL, - uri TEXT NOT NULL UNIQUE, - creation_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - FOREIGN KEY (execution_id) REFERENCES workflow_executions(eid) ON DELETE CASCADE -); \ No newline at end of file From 8e79ec9c39b0e53260236323ca02eb2ac8219f84 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Thu, 20 Nov 2025 17:38:10 -0800 Subject: [PATCH 07/10] Revert SQL --- sql/texera_ddl.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/texera_ddl.sql b/sql/texera_ddl.sql index 5a122b19eb7..7b0f9b9063d 100644 --- a/sql/texera_ddl.sql +++ b/sql/texera_ddl.sql @@ -442,4 +442,4 @@ BEGIN END LOOP; END $$; --- END Fulltext search index creation (DO NOT EDIT THIS LINE) \ No newline at end of file +-- END Fulltext search index creation (DO NOT EDIT THIS LINE) From 8c58a04312b86fd753f3dc200189b54461559db7 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Thu, 20 Nov 2025 18:20:54 -0800 Subject: [PATCH 08/10] Fix test --- .../org/apache/texera/service/util/BigObjectManager.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala index efa12ad5226..7171a639c22 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala @@ -60,8 +60,7 @@ object BigObjectManager extends LazyLogging { logger.info(s"Successfully deleted all big objects from bucket: $DEFAULT_BUCKET") } catch { case e: Exception => - logger.error(s"Failed to delete big objects from bucket: $DEFAULT_BUCKET", e) - throw e + logger.warn(s"Failed to delete big objects from bucket: $DEFAULT_BUCKET", e) } } From 3f045322bc04e941c1b14cb01eaf4fb6fedecf99 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Fri, 21 Nov 2025 13:51:54 -0800 Subject: [PATCH 09/10] Update comments and rename function --- .../user/workflow/WorkflowResource.scala | 2 +- .../texera/web/service/WorkflowService.scala | 2 +- .../service/util/BigObjectManager.scala | 7 ++- .../service/util/BigObjectManagerSpec.scala | 54 +++++++++---------- 4 files changed, 32 insertions(+), 33 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala index 81081c636f7..c6917d9390a 100644 --- 
a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala @@ -601,7 +601,7 @@ class WorkflowResource extends LazyLogging { .asScala .toList - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() // Collect all URIs related to executions for cleanup val uris = eids.flatMap { eid => diff --git a/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala b/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala index 38d0ae93279..c9f1bee3463 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/WorkflowService.scala @@ -346,6 +346,6 @@ class WorkflowService( } } // Delete big objects - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala index 7171a639c22..a6a273eb304 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectManager.scala @@ -26,9 +26,8 @@ import java.util.UUID /** * Manages the lifecycle of BigObjects stored in S3. * - * Handles creation, tracking, and cleanup of large objects that exceed - * normal tuple size limits. Objects are automatically cleaned up when - * their associated workflow execution completes. + * Handles creation and deletion of large objects that exceed + * normal tuple size limits. */ object BigObjectManager extends LazyLogging { private val DEFAULT_BUCKET = "texera-big-objects" @@ -54,7 +53,7 @@ object BigObjectManager extends LazyLogging { * @throws Exception if the deletion fails * @return Unit */ - def delete(): Unit = { + def deleteAllObjects(): Unit = { try { S3StorageClient.deleteDirectory(DEFAULT_BUCKET, "objects") logger.info(s"Successfully deleted all big objects from bucket: $DEFAULT_BUCKET") diff --git a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala index ff003005ff7..ce1d4f4e691 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/service/util/BigObjectManagerSpec.scala @@ -54,7 +54,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(stream.readAllBytes().sameElements(data.getBytes)) stream.close() - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should read exact number of bytes") { @@ -67,7 +67,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(result.sameElements("0123456789".getBytes)) stream.close() - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should handle reading more bytes than available") { @@ -81,7 +81,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(result.sameElements(data.getBytes)) stream.close() - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should support standard single-byte read") { @@ -94,7 +94,7 @@ class BigObjectManagerSpec 
extends AnyFunSuite with S3StorageTestBase { assert(stream.read() == -1) // EOF stream.close() - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should return -1 at EOF") { @@ -105,7 +105,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(stream.read() == -1) stream.close() - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should throw exception when reading from closed stream") { @@ -117,7 +117,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assertThrows[java.io.IOException](stream.read()) assertThrows[java.io.IOException](stream.readAllBytes()) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should handle multiple close calls") { @@ -127,7 +127,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { stream.close() stream.close() // Should not throw - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should read large data correctly") { @@ -145,7 +145,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(result.sameElements(largeData)) stream.close() - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } // ======================================== @@ -200,18 +200,18 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { out2.close() } - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectManager should handle delete with no objects gracefully") { - BigObjectManager.delete() // Should not throw exception + BigObjectManager.deleteAllObjects() // Should not throw exception } test("BigObjectManager should delete all objects") { val pointer1 = createBigObject("Test data") val pointer2 = createBigObject("Test data") - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectManager should create bucket if it doesn't exist") { @@ -219,7 +219,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assertStandardBucket(pointer) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectManager should handle large objects correctly") { @@ -237,7 +237,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { stream.close() assert(readData.sameElements(largeData)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectManager should generate unique URIs for different objects") { @@ -261,7 +261,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(pointer1.getUri != pointer2.getUri) assert(pointer1.getObjectKey != pointer2.getObjectKey) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream should handle multiple reads from the same big object") { @@ -279,7 +279,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData1.sameElements(data.getBytes)) assert(readData2.sameElements(data.getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectManager should properly parse bucket name and object key from big object") { @@ -289,7 +289,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(bigObject.getObjectKey.nonEmpty) assert(!bigObject.getObjectKey.startsWith("/")) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } // ======================================== 
@@ -309,7 +309,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assertStandardBucket(bigObject) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectInputStream constructor should read big object contents") { @@ -322,7 +322,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectOutputStream and BigObjectInputStream should work together end-to-end") { @@ -344,7 +344,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } // ======================================== @@ -368,7 +368,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectOutputStream should create big object") { @@ -381,7 +381,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assertStandardBucket(bigObject) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectOutputStream should handle large data correctly") { @@ -399,7 +399,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData.sameElements(largeData)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectOutputStream should handle multiple writes") { @@ -416,7 +416,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData.sameElements("Hello World!".getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectOutputStream should throw exception when writing to closed stream") { @@ -427,7 +427,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assertThrows[java.io.IOException](outStream.write("more".getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObjectOutputStream should handle close() being called multiple times") { @@ -437,7 +437,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { outStream.close() outStream.close() // Should not throw - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("New BigObject() constructor should create unique URIs") { @@ -447,7 +447,7 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(bigObject1.getUri != bigObject2.getUri) assert(bigObject1.getObjectKey != bigObject2.getObjectKey) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } test("BigObject() and BigObjectOutputStream API should be symmetric with input") { @@ -466,6 +466,6 @@ class BigObjectManagerSpec extends AnyFunSuite with S3StorageTestBase { assert(readData.sameElements(data.getBytes)) - BigObjectManager.delete() + BigObjectManager.deleteAllObjects() } } From 437832b216c07984c7c1c459bdd5548a1e5782a0 Mon Sep 17 00:00:00 2001 From: Kunwoo Park Date: Tue, 25 Nov 2025 11:27:08 -0800 Subject: [PATCH 10/10] Address comments --- .../service/util/BigObjectInputStream.scala | 49 +++++-------------- .../service/util/BigObjectOutputStream.scala | 14 ++---- .../texera/service/util/S3StorageClient.scala | 12 +++-- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala 
b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala index 841bdc8cb27..cdc7e5b77af 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectInputStream.scala @@ -47,35 +47,18 @@ class BigObjectInputStream(bigObject: BigObject) extends InputStream { @volatile private var closed = false - override def read(): Int = { - ensureOpen() - underlying.read() - } + override def read(): Int = whenOpen(underlying.read()) - override def read(b: Array[Byte], off: Int, len: Int): Int = { - ensureOpen() - underlying.read(b, off, len) - } + override def read(b: Array[Byte], off: Int, len: Int): Int = + whenOpen(underlying.read(b, off, len)) - override def readAllBytes(): Array[Byte] = { - ensureOpen() - underlying.readAllBytes() - } + override def readAllBytes(): Array[Byte] = whenOpen(underlying.readAllBytes()) - override def readNBytes(n: Int): Array[Byte] = { - ensureOpen() - underlying.readNBytes(n) - } + override def readNBytes(n: Int): Array[Byte] = whenOpen(underlying.readNBytes(n)) - override def skip(n: Long): Long = { - ensureOpen() - underlying.skip(n) - } + override def skip(n: Long): Long = whenOpen(underlying.skip(n)) - override def available(): Int = { - ensureOpen() - underlying.available() - } + override def available(): Int = whenOpen(underlying.available()) override def close(): Unit = { if (!closed) { @@ -86,22 +69,14 @@ class BigObjectInputStream(bigObject: BigObject) extends InputStream { } } - override def markSupported(): Boolean = { - ensureOpen() - underlying.markSupported() - } + override def markSupported(): Boolean = whenOpen(underlying.markSupported()) - override def mark(readlimit: Int): Unit = { - ensureOpen() - underlying.mark(readlimit) - } + override def mark(readlimit: Int): Unit = whenOpen(underlying.mark(readlimit)) - override def reset(): Unit = { - ensureOpen() - underlying.reset() - } + override def reset(): Unit = whenOpen(underlying.reset()) - private def ensureOpen(): Unit = { + private def whenOpen[T](f: => T): T = { if (closed) throw new java.io.IOException("Stream is closed") + f } } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala index 00d44ab03ac..80214a973f9 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/BigObjectOutputStream.scala @@ -78,15 +78,10 @@ class BigObjectOutputStream(bigObject: BigObject) extends OutputStream with Lazy } } - override def write(b: Int): Unit = { - ensureOpen() - pipedOut.write(b) - } + override def write(b: Int): Unit = whenOpen(pipedOut.write(b)) - override def write(b: Array[Byte], off: Int, len: Int): Unit = { - ensureOpen() - pipedOut.write(b, off, len) - } + override def write(b: Array[Byte], off: Int, len: Int): Unit = + whenOpen(pipedOut.write(b, off, len)) override def flush(): Unit = { if (!closed) pipedOut.flush() @@ -112,9 +107,10 @@ class BigObjectOutputStream(bigObject: BigObject) extends OutputStream with Lazy } } - private def ensureOpen(): Unit = { + private def whenOpen[T](f: => T): T = { if (closed) throw new IOException("Stream is closed") checkUploadSuccess() + f } private def checkUploadSuccess(): Unit = { diff --git 
a/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala b/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala index e2e4979299e..8c3bc2f5f33 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/service/util/S3StorageClient.scala @@ -180,6 +180,7 @@ object S3StorageClient { ) .uploadId() + var uploadSuccess = false try { // Upload all parts using an iterator val allParts = Iterator @@ -203,7 +204,7 @@ object S3StorageClient { } .toList - s3Client + val result = s3Client .completeMultipartUpload( CompleteMultipartUploadRequest .builder() @@ -215,8 +216,11 @@ object S3StorageClient { ) .eTag() - } catch { - case e: Exception => + uploadSuccess = true + result + + } finally { + if (!uploadSuccess) { try { s3Client.abortMultipartUpload( AbortMultipartUploadRequest @@ -227,7 +231,7 @@ object S3StorageClient { .build() ) } catch { case _: Exception => } - throw e + } } }
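
Taken together, the series leaves BigObject creation decoupled from any execution context: a no-argument BigObject() reserves a unique URI under s3://texera-big-objects/objects/, BigObjectOutputStream streams the payload to S3, BigObjectInputStream reads it back (with whenOpen guards rejecting use after close), and BigObjectManager.deleteAllObjects() clears the whole objects/ prefix. A minimal end-to-end sketch of that API follows, assuming the package layout shown in the diffs and a reachable S3-compatible backend; it is written for illustration only and is not part of the patch series, and error-handling policy (retries, partial failures) is deliberately omitted.

import org.apache.amber.core.tuple.BigObject
import org.apache.texera.service.util.{BigObjectInputStream, BigObjectManager, BigObjectOutputStream}

object BigObjectRoundTrip {
  def main(args: Array[String]): Unit = {
    // Reserve an S3 URI; no data is written until the output stream is used.
    val bigObject = new BigObject()

    // Stream the payload to S3; close() finalizes the upload.
    val out = new BigObjectOutputStream(bigObject)
    try out.write("hello big object".getBytes("UTF-8"))
    finally out.close()

    // Read it back; reads after close() raise IOException via the whenOpen guard.
    val in = new BigObjectInputStream(bigObject)
    val payload =
      try new String(in.readAllBytes(), "UTF-8")
      finally in.close()
    assert(payload == "hello big object")

    // Cleanup is global after the rename in PATCH 09: everything under objects/ is deleted.
    BigObjectManager.deleteAllObjects()
  }
}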