Description
Search before asking
- I searched in the issues and found nothing similar.
Paimon version
paimon-spark-3.5-1.1-20250206.002614-45.jar
Compute Engine
Spark 3.4.4
Spark 3.5.4
Minimal reproduce step
I have a dataframe with 900 fields and attempted to write it to a Paimon table. A small amount of data writes without errors, but attempting to write more than 100,000 records results in an error.
In addition, in this case the Spark stages all succeed and only the driver reports an exception. However, when I check the data files under the HDFS path, they still exist and have not been cleaned up, which is probably not the expected behavior.
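For reference, the write roughly looks like the sketch below. The table name, column names, constant values, and row count are placeholders I'm using for illustration, not the real job; the actual write goes through a SQL INSERT from a console function (insertToUserLake in the stack trace).

```scala
// Hypothetical repro sketch: build a ~900-column dataframe and insert it
// into an existing Paimon table via Spark SQL. Names are placeholders.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

val spark = SparkSession.builder()
  .appName("paimon-wide-table-repro")
  .getOrCreate()

// More than 100,000 records is where the failure shows up for me.
val base = spark.range(150000L).toDF("id")

// Add ~900 columns to mimic the wide schema of the real dataframe.
val wide = (1 to 900).foldLeft(base) { (df, i) =>
  df.withColumn(s"col_$i", lit(s"value_$i"))
}

wide.createOrReplaceTempView("wide_source")

// The target Paimon table already exists with the same 900-field schema.
spark.sql("INSERT INTO paimon_catalog.db.wide_table SELECT * FROM wide_source")
```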
25/02/07 16:24:52 INFO [main] DAGScheduler: Job 4 finished: collect at PaimonSparkWriter.scala:235, took 19.418619 s
25/02/07 16:24:52 INFO [main] CodeGenerator: Code generated in 23.710223 ms
java.lang.ArrayIndexOutOfBoundsException: 39182
at org.apache.paimon.memory.MemorySegmentUtils.getIntMultiSegments(MemorySegmentUtils.java:685)
at org.apache.paimon.memory.MemorySegmentUtils.getInt(MemorySegmentUtils.java:675)
at org.apache.paimon.data.BinaryArray.pointTo(BinaryArray.java:129)
at org.apache.paimon.memory.MemorySegmentUtils.readArrayData(MemorySegmentUtils.java:1152)
at org.apache.paimon.data.BinaryRow.getArray(BinaryRow.java:350)
at org.apache.paimon.io.DataFileMetaSerializer.fromRow(DataFileMetaSerializer.java:84)
at org.apache.paimon.io.DataFileMetaSerializer.fromRow(DataFileMetaSerializer.java:34)
at org.apache.paimon.utils.ObjectSerializer.deserialize(ObjectSerializer.java:81)
at org.apache.paimon.utils.ObjectSerializer.deserializeList(ObjectSerializer.java:104)
at org.apache.paimon.table.sink.CommitMessageSerializer.lambda$fileDeserializer$0(CommitMessageSerializer.java:135)
at org.apache.paimon.table.sink.CommitMessageSerializer.deserialize(CommitMessageSerializer.java:124)
at org.apache.paimon.table.sink.CommitMessageSerializer.deserialize(CommitMessageSerializer.java:103)
at org.apache.paimon.spark.commands.PaimonSparkWriter.deserializeCommitMessage(PaimonSparkWriter.scala:408)
at org.apache.paimon.spark.commands.PaimonSparkWriter.$anonfun$write$17(PaimonSparkWriter.scala:237)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
at org.apache.paimon.spark.commands.PaimonSparkWriter.write(PaimonSparkWriter.scala:237)
at org.apache.paimon.spark.commands.WriteIntoPaimonTable.run(WriteIntoPaimonTable.scala:67)
at org.apache.paimon.spark.SparkWrite.$anonfun$toInsertableRelation$1(SparkWrite.scala:35)
at org.apache.spark.sql.execution.datasources.v2.SupportsV1Write.writeWithV1(V1FallbackWriters.scala:79)
at org.apache.spark.sql.execution.datasources.v2.SupportsV1Write.writeWithV1$(V1FallbackWriters.scala:78)
at org.apache.spark.sql.execution.datasources.v2.AppendDataExecV1.writeWithV1(V1FallbackWriters.scala:34)
at org.apache.spark.sql.execution.datasources.v2.V1FallbackWriters.run(V1FallbackWriters.scala:66)
at org.apache.spark.sql.execution.datasources.v2.V1FallbackWriters.run$(V1FallbackWriters.scala:65)
at org.apache.spark.sql.execution.datasources.v2.AppendDataExecV1.run(V1FallbackWriters.scala:34)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
at org.apache.spark.sql.SparkSession.$anonfun$sql$4(SparkSession.scala:691)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:682)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:713)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:744)
at insertToUserLake(<console>:31)
... 47 elided
What doesn't meet your expectations?
- The write should succeed.
- If the job fails during deserializeCommitMessage, all of the data files that were already written should be cleaned up.
Anything else?
No response
Are you willing to submit a PR?
- I'm willing to submit a PR!