From f747f7f063bf4c0b06fb2bb5e2a72d591aabccb3 Mon Sep 17 00:00:00 2001 From: zouxxyy Date: Mon, 20 Jan 2025 15:26:27 +0800 Subject: [PATCH 1/2] 1 --- docs/content/spark/sql-query.md | 12 ++++++++++++ .../paimon/spark/sql/PaimonQueryTest.scala | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/docs/content/spark/sql-query.md b/docs/content/spark/sql-query.md index c97b6d3341b7..a2b49fd8eaee 100644 --- a/docs/content/spark/sql-query.md +++ b/docs/content/spark/sql-query.md @@ -32,6 +32,18 @@ Just like all other tables, Paimon tables can be queried with `SELECT` statement Paimon's batch read returns all the data in a snapshot of the table. By default, batch reads return the latest snapshot. +```sql +-- read all columns +SELECT * FROM t; +``` + +Paimon also supports reading some hidden metadata columns, such as `__paimon_file_path`, `__paimon_partition`, `__paimon_bucket`. + +```sql +-- read all columns and the corresponding file path, partition, bucket of the record +SELECT *, __paimon_file_path, __paimon_partition, __paimon_bucket FROM t; +``` + ### Batch Time Travel Paimon batch reads with time travel can specify a snapshot or a tag and read the corresponding data. 
diff --git a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/PaimonQueryTest.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/PaimonQueryTest.scala index 08f5275f01b5..d8d621a0e690 100644 --- a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/PaimonQueryTest.scala +++ b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/PaimonQueryTest.scala @@ -368,6 +368,22 @@ class PaimonQueryTest extends PaimonSparkTestBase { } } + test("Paimon Query: query metadata columns") { + sql("CREATE TABLE T (a INT, p1 INT, p2 INT) PARTITIONED BY (p1, p2)") + sql("INSERT INTO T VALUES (1, 1, 1), (2, 1, 2)") + checkAnswer( + sql(""" + |SELECT + |*, + |element_at(split(__paimon_file_path, '\\.'), -1), + |__paimon_partition, + |__paimon_bucket + |FROM T ORDER BY a + |""".stripMargin), + Seq(Row(1, 1, 1, "parquet", Row(1, 1), 0), Row(2, 1, 2, "parquet", Row(1, 2), 0)) + ) + } + private def getAllFiles( tableName: String, partitions: Seq[String], From 64ba9e1d6fc88c0534f19679b94190e9bb7d13fc Mon Sep 17 00:00:00 2001 From: zouxxyy Date: Mon, 20 Jan 2025 20:13:53 +0800 Subject: [PATCH 2/2] update --- docs/content/spark/sql-query.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/content/spark/sql-query.md b/docs/content/spark/sql-query.md index a2b49fd8eaee..cc420b4534f2 100644 --- a/docs/content/spark/sql-query.md +++ b/docs/content/spark/sql-query.md @@ -37,7 +37,11 @@ Paimon's batch read returns all the data in a snapshot of the table. By default, SELECT * FROM t; ``` -Paimon also supports reading some hidden metadata columns, such as `__paimon_file_path`, `__paimon_partition`, `__paimon_bucket`. +Paimon also supports reading hidden metadata columns; the following columns are currently supported: + +- `__paimon_file_path`: the file path of the record. +- `__paimon_partition`: the partition of the record. +- `__paimon_bucket`: the bucket of the record. 
```sql -- read all columns and the corresponding file path, partition, bucket of the record