From d2c8f8c5be2354fe771d9217b55a138d38cb4f11 Mon Sep 17 00:00:00 2001 From: klion26 Date: Tue, 12 Aug 2025 19:23:21 +0800 Subject: [PATCH 1/6] add primitive_time --- variant/primitive_time.metadata | Bin 0 -> 3 bytes variant/primitive_time.value | Bin 0 -> 9 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 variant/primitive_time.metadata create mode 100644 variant/primitive_time.value diff --git a/variant/primitive_time.metadata b/variant/primitive_time.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_time.value b/variant/primitive_time.value new file mode 100644 index 0000000000000000000000000000000000000000..9fa4fb3f9f7dfc59bbd29dd5c8832a04c04b0f81 GIT binary patch literal 9 QcmZ=c@JX|Si-Caw020vxvH$=8 literal 0 HcmV?d00001 From 04db7630c963e0a9e35a3b685b15c08b27d4dd9b Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 13 Aug 2025 08:58:46 +0800 Subject: [PATCH 2/6] update data_dictionary and regen script --- variant/data_dictionary.json | 5 +++-- variant/regen.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json index bdd16ef..a74f059 100644 --- a/variant/data_dictionary.json +++ b/variant/data_dictionary.json @@ -69,5 +69,6 @@ "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", "primitive_timestampntz": "2025-04-16 12:34:56.78", - "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)" -} \ No newline at end of file + "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)", + "primitive_time": "12:33:54:123456" +} diff --git a/variant/regen.py 
b/variant/regen.py index b776afd..d6da0a7 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -78,6 +78,7 @@ -- https://github.com/apache/parquet-testing/issues/79 -- is not clear how to create the following types using Spark SQL -- TODO TimeNTZ (Type ID 17) +-- binary artifacts of TimeNTZ was generated by the iceberg test code, please ref to https://github.com/apache/parquet-testing/pull/92 for more detail -- TODO 'timestamp with timezone (NANOS)' (Type ID 18) -- TODO 'timestamp with time zone (NANOS)' (Type ID 19) -- TODO 'UUID' (Type ID 20) @@ -170,4 +171,4 @@ # Note: It is possible to write the output to a single parquet file, using a command # such as: # spark.sql("SELECT * FROM output").repartition(1).write.parquet('variant.parquet') -# At the time of writing, this file does not have the logical type annotation for VARIANT \ No newline at end of file +# At the time of writing, this file does not have the logical type annotation for VARIANT From d33d5074517af4d5f05dcf5b9e93d6f2f06e2eb7 Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 13 Aug 2025 11:08:36 +0800 Subject: [PATCH 3/6] add timestamp_nanos(tz&ntz) and uuid binary artifacts --- variant/data_dictionary.json | 5 ++++- variant/primitive_timestamp_nanos.metadata | Bin 0 -> 3 bytes variant/primitive_timestamp_nanos.value | 1 + variant/primitive_timestampntz_nanos.metadata | Bin 0 -> 3 bytes variant/primitive_timestampntz_nanos.value | 1 + variant/primitive_uuid.metadata | Bin 0 -> 3 bytes variant/primitive_uuid.value | 1 + variant/regen.py | 8 +------- 8 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 variant/primitive_timestamp_nanos.metadata create mode 100644 variant/primitive_timestamp_nanos.value create mode 100644 variant/primitive_timestampntz_nanos.metadata create mode 100644 variant/primitive_timestampntz_nanos.value create mode 100644 variant/primitive_uuid.metadata create mode 100644 variant/primitive_uuid.value diff --git a/variant/data_dictionary.json 
b/variant/data_dictionary.json index a74f059..a2d173d 100644 --- a/variant/data_dictionary.json +++ b/variant/data_dictionary.json @@ -69,6 +69,9 @@ "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", "primitive_timestampntz": "2025-04-16 12:34:56.78", + "primitive_timestamp_nanos": "2024-11-07T12:33:54.123456789+00:00", + "primitive_timestampntz_nanos": "2024-11-07T12:33:54.123456789", "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)", - "primitive_time": "12:33:54:123456" + "primitive_time": "12:33:54:123456", + "primitive_uuid": "f24f9b64-81fa-49d1-b74e-8c09a6e31c56", } diff --git a/variant/primitive_timestamp_nanos.metadata b/variant/primitive_timestamp_nanos.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_timestamp_nanos.value b/variant/primitive_timestamp_nanos.value new file mode 100644 index 0000000..2e7e246 --- /dev/null +++ b/variant/primitive_timestamp_nanos.value @@ -0,0 +1 @@ +HA:l \ No newline at end of file diff --git a/variant/primitive_timestampntz_nanos.metadata b/variant/primitive_timestampntz_nanos.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_timestampntz_nanos.value b/variant/primitive_timestampntz_nanos.value new file mode 100644 index 0000000..1910207 --- /dev/null +++ b/variant/primitive_timestampntz_nanos.value @@ -0,0 +1 @@ +LA:l \ No newline at end of file diff --git a/variant/primitive_uuid.metadata b/variant/primitive_uuid.metadata new file mode 100644 index 
0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_uuid.value b/variant/primitive_uuid.value new file mode 100644 index 0000000..314f3a6 --- /dev/null +++ b/variant/primitive_uuid.value @@ -0,0 +1 @@ +POdIѷN V \ No newline at end of file diff --git a/variant/regen.py b/variant/regen.py index d6da0a7..d39c366 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -75,13 +75,7 @@ INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant); INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant); --- https://github.com/apache/parquet-testing/issues/79 --- is not clear how to create the following types using Spark SQL --- TODO TimeNTZ (Type ID 17) --- binary artifacts of TimeNTZ was generated by the iceberg test code, please ref to https://github.com/apache/parquet-testing/pull/92 for more detail --- TODO 'timestamp with timezone (NANOS)' (Type ID 18) --- TODO 'timestamp with time zone (NANOS)' (Type ID 19) --- TODO 'UUID' (Type ID 20) +-- binary artifacts of 'TimeNTZ'/'timestamp with timezone (NANOS)'/'timestamp with time zone (NANOS)'/'UUID' was generated by the iceberg test code, please ref to https://github.com/apache/parquet-testing/pull/92 for more detail ------------------------------- -- Short string (basic_type=1) From 0123a86e7a95f5759a42019af855571ca6e26c4c Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 13 Aug 2025 21:04:58 +0800 Subject: [PATCH 4/6] fix typo --- variant/regen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variant/regen.py b/variant/regen.py index d39c366..d2e14f8 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -75,7 +75,7 @@ INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant); INSERT INTO T 
VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant); --- binary artifacts of 'TimeNTZ'/'timestamp with timezone (NANOS)'/'timestamp with time zone (NANOS)'/'UUID' was generated by the iceberg test code, please ref to https://github.com/apache/parquet-testing/pull/92 for more detail +-- binary artifacts of 'TimeNTZ'/'timestamp with timezone (NANOS)'/'timestamp without time zone (NANOS)'/'UUID' were generated by the Iceberg test code; please refer to https://github.com/apache/parquet-testing/pull/92 for more details ------------------------------- -- Short string (basic_type=1) From 9867649e9f9df0d2ff725b52a78ba11e2bd52723 Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 13 Aug 2025 21:49:07 +0800 Subject: [PATCH 5/6] update the primitive data order in data_dictionary.json --- variant/data_dictionary.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json index a2d173d..8a1faae 100644 --- a/variant/data_dictionary.json +++ b/variant/data_dictionary.json @@ -67,11 +67,11 @@ "primitive_int8": 42, "primitive_null": null, "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", + "primitive_time": "12:33:54.123456", "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", "primitive_timestampntz": "2025-04-16 12:34:56.78", "primitive_timestamp_nanos": "2024-11-07T12:33:54.123456789+00:00", "primitive_timestampntz_nanos": "2024-11-07T12:33:54.123456789", - "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)", - "primitive_time": "12:33:54:123456", "primitive_uuid": "f24f9b64-81fa-49d1-b74e-8c09a6e31c56", + "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)" }
From c3bfa1a50d746acbfbebd8a520abe242eaec5ab2 Mon Sep 17 00:00:00 2001 From: klion26 Date: Thu, 14 Aug 2025 13:21:09 +0800 Subject: [PATCH 6/6] update readme --- variant/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/variant/README.md b/variant/README.md index 418dfa7..c09c49a 100644 --- a/variant/README.md +++ b/variant/README.md @@ -69,5 +69,9 @@ resulting in a single `0` byte: echo -n 'a' | tr a '\0' > primitive_null.value ``` +### Modification 2: Created `TimeNTZ/Timestamp with timezone nanos/Timestamp without timezone nanos/UUID` with Iceberg test code + +Currently, Spark [does not support](https://github.com/apache/spark/blob/master/common/variant/README.md) Variant values containing UUID, Time, or nanosecond-precision Timestamp. The `primitive_time.[metadata/value]`, `primitive_timestamp_nanos.[metadata/value]`, `primitive_timestampntz_nanos.[metadata/value]` and `primitive_uuid.[metadata/value]` files were generated by [Iceberg test code](https://github.com/apache/iceberg/blob/3a4215dbb714477c89681ab94f1197b6ebcbdfff/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantReaders.java#L355). + [Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md [primitive types listed in the spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-primitive-type-basic_type0