diff --git a/variant/README.md b/variant/README.md index 418dfa7..c09c49a 100644 --- a/variant/README.md +++ b/variant/README.md @@ -69,5 +69,9 @@ resulting in a single `0` byte: echo -n 'a' | tr a '\0' > primitive_null.value ``` +### Modification 2: Created `TimeNTZ/Timestamp with timezone nanos/Timestamp without timezone nanos/UUID` with Iceberg test code + +Currently, Spark [does not support](https://github.com/apache/spark/blob/master/common/variant/README.md) Variant values containing UUID, Time, or nanosecond-precision Timestamp. The `primitive_time.[metadata/value]`, `primitive_timestamp_nanos.[metadata/value]`, `primitive_timestampntz_nanos.[metadata/value]`, and `primitive_uuid.[metadata/value]` files were generated by the [Iceberg test code](https://github.com/apache/iceberg/blob/3a4215dbb714477c89681ab94f1197b6ebcbdfff/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantReaders.java#L355). + [Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md [primitive types listed in the spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-primitive-type-basic_type0 diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json index bdd16ef..8a1faae 100644 --- a/variant/data_dictionary.json +++ b/variant/data_dictionary.json @@ -67,7 +67,11 @@ "primitive_int8": 42, "primitive_null": null, "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", + "primitive_time": "12:33:54.123456", "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", "primitive_timestampntz": "2025-04-16 12:34:56.78", - "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)" -} \ No newline at end of file + "primitive_timestamp_nanos": "2024-11-07T12:33:54.123456789+00:00", + "primitive_timestampntz_nanos": "2024-11-07T12:33:54.123456789", + 
"primitive_uuid": "f24f9b64-81fa-49d1-b74e-8c09a6e31c56", + "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)" +} diff --git a/variant/primitive_time.metadata b/variant/primitive_time.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_time.metadata differ diff --git a/variant/primitive_time.value b/variant/primitive_time.value new file mode 100644 index 0000000..9fa4fb3 Binary files /dev/null and b/variant/primitive_time.value differ diff --git a/variant/primitive_timestamp_nanos.metadata b/variant/primitive_timestamp_nanos.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_timestamp_nanos.metadata differ diff --git a/variant/primitive_timestamp_nanos.value b/variant/primitive_timestamp_nanos.value new file mode 100644 index 0000000..2e7e246 --- /dev/null +++ b/variant/primitive_timestamp_nanos.value @@ -0,0 +1 @@ +HA:l \ No newline at end of file diff --git a/variant/primitive_timestampntz_nanos.metadata b/variant/primitive_timestampntz_nanos.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_timestampntz_nanos.metadata differ diff --git a/variant/primitive_timestampntz_nanos.value b/variant/primitive_timestampntz_nanos.value new file mode 100644 index 0000000..1910207 --- /dev/null +++ b/variant/primitive_timestampntz_nanos.value @@ -0,0 +1 @@ +LA:l \ No newline at end of file diff --git a/variant/primitive_uuid.metadata b/variant/primitive_uuid.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_uuid.metadata differ diff --git a/variant/primitive_uuid.value b/variant/primitive_uuid.value new file mode 100644 index 0000000..314f3a6 --- /dev/null +++ b/variant/primitive_uuid.value @@ -0,0 +1 @@ +POdIѷN V \ No newline at end of file diff --git a/variant/regen.py b/variant/regen.py index b776afd..d2e14f8 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ 
-75,12 +75,7 @@ INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant); INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant); --- https://github.com/apache/parquet-testing/issues/79 --- is not clear how to create the following types using Spark SQL --- TODO TimeNTZ (Type ID 17) --- TODO 'timestamp with timezone (NANOS)' (Type ID 18) --- TODO 'timestamp with time zone (NANOS)' (Type ID 19) --- TODO 'UUID' (Type ID 20) +-- Binary artifacts for 'TimeNTZ'/'timestamp with timezone (NANOS)'/'timestamp without time zone (NANOS)'/'UUID' were generated by the Iceberg test code; please refer to https://github.com/apache/parquet-testing/pull/92 for more details ------------------------------- -- Short string (basic_type=1) @@ -170,4 +165,4 @@ # Note: It is possible to write the output to a single parquet file, using a command # such as: # spark.sql("SELECT * FROM output").repartition(1).write.parquet('variant.parquet') -# At the time of writing, this file does not have the logical type annotation for VARIANT \ No newline at end of file +# At the time of writing, this file does not have the logical type annotation for VARIANT