diff --git a/README.md b/README.md index a11eb6a..07fd928 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,9 @@ ~ under the License. --> # Testing Data and Utilities for Apache Parquet + +- [data](data/README.md) - Sample Parquet data files for testing +- [bad_data](bad_data/README.md) - Reproducers for bad data files for testing +- [variant](variant/README.md) - Sample [Variant] binary values + +[Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md \ No newline at end of file diff --git a/variant/README.md b/variant/README.md new file mode 100644 index 0000000..71e4da8 --- /dev/null +++ b/variant/README.md @@ -0,0 +1,52 @@ + + +# Variant Binary Encoding + +This directory contains binary artifacts encoded using the Parquet [Variant] +binary encoding. These files are **not** valid Parquet files, but rather +raw binary data. + +## Structure + +* `data_dictionary.json` - contains the JSON representation for each example + +Each example consists of 2 files: + +* `<name>.metadata` -- the binary contents of the `metadata` field +* `<name>.value` -- the binary contents of the `value` field + +## Descriptions + +1. `primitive_<type>` -- Examples of primitive values (`basic_type` = 0), one for each of the [primitive types listed in the spec] +2. `short_string` -- Example of short string (`basic_type` = 1) +3. `object_empty` -- Example of object (`basic_type` = 2) with no fields +4. `object_primitive` -- Example of object with only primitive fields +5. `object_nested` -- Example of object with other objects in fields +6. `array_empty` -- Example of array (`basic_type` = 3) with no elements +7. `array_primitive` -- Example of array with only primitive elements +8. `array_nested` -- Example of an array with objects and other arrays in the elements + +## Regenerating these files + +The files were generated by running the [`regen.py`](regen.py) script that uses Apache Spark to +generate the files. 
+ +[Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +[primitive types listed in the spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-primitive-type-basic_type0 diff --git a/variant/array_empty.metadata b/variant/array_empty.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/array_empty.metadata differ diff --git a/variant/array_empty.value b/variant/array_empty.value new file mode 100644 index 0000000..fd76034 Binary files /dev/null and b/variant/array_empty.value differ diff --git a/variant/array_nested.metadata b/variant/array_nested.metadata new file mode 100644 index 0000000..6b96dea Binary files /dev/null and b/variant/array_nested.metadata differ diff --git a/variant/array_nested.value b/variant/array_nested.value new file mode 100644 index 0000000..e7bd59a Binary files /dev/null and b/variant/array_nested.value differ diff --git a/variant/array_primitive.metadata b/variant/array_primitive.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/array_primitive.metadata differ diff --git a/variant/array_primitive.value b/variant/array_primitive.value new file mode 100644 index 0000000..882fda1 Binary files /dev/null and b/variant/array_primitive.value differ diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json new file mode 100644 index 0000000..02361b7 --- /dev/null +++ b/variant/data_dictionary.json @@ -0,0 +1,73 @@ +{ + "array_empty": [], + "array_nested": [ + { + "id": 1, + "thing": { + "names": [ + "Contrarian", + "Spider" + ] + } + }, + null, + { + "id": 2, + "names": [ + "Apple", + "Ray", + null + ], + "type": "if" + } + ], + "array_primitive": [ + 2, + 1, + 5, + 9 + ], + "object_empty": {}, + "object_nested": { + "id": 1, + "observation": { + "location": "In the Volcano", + "time": "12:34:56", + "value": { + "humidity": 456, + "temperature": 123 + } + }, + "species": { + "name": 
"lava monster", + "population": 6789 + } + }, + "object_primitive": { + "boolean_false_field": false, + "boolean_true_field": true, + "double_field": 1.23456789, + "int_field": 1, + "null_field": null, + "string_field": "Apache Parquet", + "timestamp_field": "2025-04-16T12:34:56.78" + }, + "primitive_binary": "AxM33q2+78r+", + "primitive_boolean_false": false, + "primitive_boolean_true": true, + "primitive_date": "2025-04-16", + "primitive_decimal16": 1.2345678912345678e+16, + "primitive_decimal4": 12.34, + "primitive_decimal8": 12345678.9, + "primitive_double": 1234567890.1234, + "primitive_float": 1234567940.0, + "primitive_int16": 1234, + "primitive_int32": 123456, + "primitive_int64": 12345678, + "primitive_int8": 42, + "primitive_null": null, + "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", + "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", + "primitive_timestampntz": "2025-04-16 12:34:56.78", + "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)" +} \ No newline at end of file diff --git a/variant/long_string.metadata b/variant/long_string.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/long_string.metadata differ diff --git a/variant/long_string.value b/variant/long_string.value new file mode 100644 index 0000000..2749c8e Binary files /dev/null and b/variant/long_string.value differ diff --git a/variant/object_empty.metadata b/variant/object_empty.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/object_empty.metadata differ diff --git a/variant/object_empty.value b/variant/object_empty.value new file mode 100644 index 0000000..aa7650d Binary files /dev/null and b/variant/object_empty.value differ diff --git a/variant/object_nested.metadata b/variant/object_nested.metadata new 
file mode 100644 index 0000000..71548c3 Binary files /dev/null and b/variant/object_nested.metadata differ diff --git a/variant/object_nested.value b/variant/object_nested.value new file mode 100644 index 0000000..7d05f4b Binary files /dev/null and b/variant/object_nested.value differ diff --git a/variant/object_primitive.metadata b/variant/object_primitive.metadata new file mode 100644 index 0000000..1855a30 Binary files /dev/null and b/variant/object_primitive.metadata differ diff --git a/variant/object_primitive.value b/variant/object_primitive.value new file mode 100644 index 0000000..eacd3d9 Binary files /dev/null and b/variant/object_primitive.value differ diff --git a/variant/primitive_binary.metadata b/variant/primitive_binary.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_binary.metadata differ diff --git a/variant/primitive_binary.value b/variant/primitive_binary.value new file mode 100644 index 0000000..a874b03 Binary files /dev/null and b/variant/primitive_binary.value differ diff --git a/variant/primitive_boolean_false.metadata b/variant/primitive_boolean_false.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_boolean_false.metadata differ diff --git a/variant/primitive_boolean_false.value b/variant/primitive_boolean_false.value new file mode 100644 index 0000000..5a77f05 --- /dev/null +++ b/variant/primitive_boolean_false.value @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/variant/primitive_boolean_true.metadata b/variant/primitive_boolean_true.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_boolean_true.metadata differ diff --git a/variant/primitive_boolean_true.value b/variant/primitive_boolean_true.value new file mode 100644 index 0000000..45a8ca0 --- /dev/null +++ b/variant/primitive_boolean_true.value @@ -0,0 +1 @@ + \ No newline at end of file diff --git 
a/variant/primitive_date.metadata b/variant/primitive_date.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_date.metadata differ diff --git a/variant/primitive_date.value b/variant/primitive_date.value new file mode 100644 index 0000000..bf2bb07 Binary files /dev/null and b/variant/primitive_date.value differ diff --git a/variant/primitive_decimal16.metadata b/variant/primitive_decimal16.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_decimal16.metadata differ diff --git a/variant/primitive_decimal16.value b/variant/primitive_decimal16.value new file mode 100644 index 0000000..3b441af Binary files /dev/null and b/variant/primitive_decimal16.value differ diff --git a/variant/primitive_decimal4.metadata b/variant/primitive_decimal4.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_decimal4.metadata differ diff --git a/variant/primitive_decimal4.value b/variant/primitive_decimal4.value new file mode 100644 index 0000000..a8dc7f1 Binary files /dev/null and b/variant/primitive_decimal4.value differ diff --git a/variant/primitive_decimal8.metadata b/variant/primitive_decimal8.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_decimal8.metadata differ diff --git a/variant/primitive_decimal8.value b/variant/primitive_decimal8.value new file mode 100644 index 0000000..41744b5 Binary files /dev/null and b/variant/primitive_decimal8.value differ diff --git a/variant/primitive_double.metadata b/variant/primitive_double.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_double.metadata differ diff --git a/variant/primitive_double.value b/variant/primitive_double.value new file mode 100644 index 0000000..d8eedde --- /dev/null +++ b/variant/primitive_double.value @@ -0,0 +1 @@ +凴eA \ No newline at end of file diff --git 
a/variant/primitive_float.metadata b/variant/primitive_float.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_float.metadata differ diff --git a/variant/primitive_float.value b/variant/primitive_float.value new file mode 100644 index 0000000..aeb162a --- /dev/null +++ b/variant/primitive_float.value @@ -0,0 +1 @@ +8,N \ No newline at end of file diff --git a/variant/primitive_int16.metadata b/variant/primitive_int16.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_int16.metadata differ diff --git a/variant/primitive_int16.value b/variant/primitive_int16.value new file mode 100644 index 0000000..ac699f7 --- /dev/null +++ b/variant/primitive_int16.value @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/variant/primitive_int32.metadata b/variant/primitive_int32.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_int32.metadata differ diff --git a/variant/primitive_int32.value b/variant/primitive_int32.value new file mode 100644 index 0000000..f6ec475 Binary files /dev/null and b/variant/primitive_int32.value differ diff --git a/variant/primitive_int64.metadata b/variant/primitive_int64.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_int64.metadata differ diff --git a/variant/primitive_int64.value b/variant/primitive_int64.value new file mode 100644 index 0000000..098d0ec Binary files /dev/null and b/variant/primitive_int64.value differ diff --git a/variant/primitive_int8.metadata b/variant/primitive_int8.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_int8.metadata differ diff --git a/variant/primitive_int8.value b/variant/primitive_int8.value new file mode 100644 index 0000000..77f9ec5 --- /dev/null +++ b/variant/primitive_int8.value @@ -0,0 +1 @@ + * \ No newline at end of file diff --git 
a/variant/primitive_null.metadata b/variant/primitive_null.metadata new file mode 100644 index 0000000..e69de29 diff --git a/variant/primitive_null.value b/variant/primitive_null.value new file mode 100644 index 0000000..e69de29 diff --git a/variant/primitive_string.metadata b/variant/primitive_string.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_string.metadata differ diff --git a/variant/primitive_string.value b/variant/primitive_string.value new file mode 100644 index 0000000..40da840 Binary files /dev/null and b/variant/primitive_string.value differ diff --git a/variant/primitive_timestamp.metadata b/variant/primitive_timestamp.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_timestamp.metadata differ diff --git a/variant/primitive_timestamp.value b/variant/primitive_timestamp.value new file mode 100644 index 0000000..a489ecb Binary files /dev/null and b/variant/primitive_timestamp.value differ diff --git a/variant/primitive_timestampntz.metadata b/variant/primitive_timestampntz.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/primitive_timestampntz.metadata differ diff --git a/variant/primitive_timestampntz.value b/variant/primitive_timestampntz.value new file mode 100644 index 0000000..47125dd Binary files /dev/null and b/variant/primitive_timestampntz.value differ diff --git a/variant/regen.py b/variant/regen.py new file mode 100644 index 0000000..ae9cb28 --- /dev/null +++ b/variant/regen.py @@ -0,0 +1,173 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This program uses Apache Spark to generate example binary Variant data
#
# Requirements
# pip install pyarrow
# pip install pyspark
#
# Last run with Spark 4.0 preview 2:
# https://spark.apache.org/news/spark-4.0.0-preview2.html

import json
import os
import shutil

import pyarrow.parquet as pq
from pyspark.sql import SparkSession

# Directory Spark uses for managed tables, and the directory of the `output`
# table created by the last statement of the SQL script below.
WAREHOUSE_DIR = 'spark-warehouse'
OUTPUT_TABLE_DIR = os.path.join(WAREHOUSE_DIR, 'output')

# Create a table with variant and insert various types into it
#
# This writes data files into spark-warehouse/output
#
# The script is executed one line at a time (see _run_statements), so every
# statement must fit on a single line; lines starting with "--" are skipped.
#
# NOTE(review): 'primitive_null' inserts a SQL NULL rather than a
# Variant-encoded null, so the metadata/value buffers appear to come back as
# null and the two primitive_null files are written empty. Confirm whether a
# Variant null (e.g. parse_json('null')) was intended.
SQL = """
CREATE TABLE T (name VARCHAR(2000), variant_col VARIANT);

-------------------------------
-- Primitive type (basic_type=0)
-------------------------------
-- One row with a value from each type listed in
-- https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
--
-- Spark Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
-- Note: must use explicit typecasts as Spark returns an error for implicit casts
INSERT INTO T VALUES ('primitive_null', NULL);
INSERT INTO T VALUES ('primitive_boolean_true', true::Variant);
INSERT INTO T VALUES ('primitive_boolean_false', false::Variant);
INSERT INTO T VALUES ('primitive_int8', 42::Byte::Variant);
INSERT INTO T VALUES ('primitive_int16', 1234::Short::Variant);
INSERT INTO T VALUES ('primitive_int32', 123456::Integer::Variant);
INSERT INTO T VALUES ('primitive_int64', 12345678::Long::Variant);
INSERT INTO T VALUES ('primitive_double', 1234567890.1234::Double::Variant);
INSERT INTO T VALUES ('primitive_decimal4', 12.34::Decimal(8,2)::Variant);
INSERT INTO T VALUES ('primitive_decimal8', 12345678.90::Decimal(12,2)::Variant);
INSERT INTO T VALUES ('primitive_decimal16', 12345678912345678.90::Decimal(30,2)::Variant);
INSERT INTO T VALUES ('primitive_date', '2025-04-16'::Date::Variant);
INSERT INTO T VALUES ('primitive_timestamp', '2025-04-16T12:34:56.78'::Timestamp::Variant);
INSERT INTO T VALUES ('primitive_timestampntz', '2025-04-16T12:34:56.78'::Timestamp_NTZ::Variant);
INSERT INTO T VALUES ('primitive_float', 1234567890.1234::Float::Variant);
INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant);
INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant);

-- https://github.com/apache/parquet-testing/issues/79
-- is not clear how to create the following types using Spark SQL
-- TODO TimeNTZ (Type ID 17)
-- TODO 'timestamp with timezone (NANOS)' (Type ID 18)
-- TODO 'timestamp without time zone (NANOS)' (Type ID 19)
-- TODO 'UUID' (Type ID 20)

-------------------------------
-- Short string (basic_type=1)
-------------------------------
INSERT INTO T VALUES ('short_string', 'Less than 64 bytes (❤️ with utf8)'::Variant);

-------------------------------
-- Object (basic_type=2)
-------------------------------
-- Use parse_json to create Variant, as spark does not seem to support casting structs --> Variant.
INSERT INTO T VALUES ('object_empty', parse_json('{}')::Variant);
INSERT INTO T VALUES ('object_primitive', parse_json('{"int_field" : 1, "double_field": 1.23456789, "boolean_true_field": true, "boolean_false_field": false, "string_field": "Apache Parquet", "null_field": null, "timestamp_field": "2025-04-16T12:34:56.78"}')::Variant);
INSERT INTO T VALUES ('object_nested', parse_json('{ "id" : 1, "species" : { "name": "lava monster", "population": 6789}, "observation" : { "time": "12:34:56", "location": "In the Volcano", "value" : { "temperature": 123, "humidity": 456 } } }')::Variant);

-- https://github.com/apache/parquet-testing/issues/77
-- TODO create example variant objects with fields that non-json types (like timestamp, date, etc)
-- Casting from "STRUCT<...>" to "VARIANT"" is not yet supported
-- INSERT INTO T VALUES ('object_primitive', struct(1234.56::Double as double_field, true as boolean_true_field, false as boolean_false_field, '2025-04-16T12:34:56.78'::Timestamp as timestamp_field, 'Apache Parquet' as string_field, null as null_field)::Variant);
--TODO objects with more than 2**8 distinct fields (that require using more than one byte for field offset)
--TODO objects with more than 2**16 distinct fields (that require using more than 2 bytes for field offset)
--TODO objects with more than 2**24 distinct fields (that require using more than 3 bytes for field offset)

-------------------------------
-- Array (basic_type=3)
-------------------------------
INSERT INTO T VALUES ('array_empty', parse_json('[]')::Variant);
INSERT INTO T VALUES ('array_primitive', parse_json('[2, 1, 5, 9]')::Variant);
INSERT INTO T VALUES ('array_nested', parse_json('[ { "id": 1, "thing": { "names": ["Contrarian", "Spider"] } }, null, { "id": 2, "type": "if", "names": ["Apple", "Ray", null] } ]')::Variant);

-- https://github.com/apache/parquet-testing/issues/78
-- TODO arrays with more than 2**8 distinct elements (that require using more than one byte for count)
-- TODO arrays where the total length of all values is greater than 2**8, 2**16, and 2**24 bytes (that require using more than one byte for the offsets)

-------------------------------
-- Output the value to a new table that also has the JSON representation of the variant column
-------------------------------
DROP TABLE IF EXISTS output;
CREATE TABLE output AS SELECT name, variant_col, to_json(variant_col) as json_col FROM T;
"""


def _clean_warehouse(path=WAREHOUSE_DIR):
    """Recursively delete the warehouse directory left by a previous run."""
    if os.path.exists(path):
        shutil.rmtree(path)


def _run_statements(spark, script):
    """Run each non-blank, non-comment line of `script` as one SQL statement."""
    for statement in script.split("\n"):
        statement = statement.strip()
        if not statement or statement.startswith("--"):
            continue
        print("Running SQL:", statement)
        spark.sql(statement)


def _write_binary_scalar(path, scalar):
    """Write the bytes of a pyarrow binary scalar to `path`.

    A null scalar has no buffer; the file is still created, but left empty.
    """
    with open(path, "wb") as out:
        buffer = scalar.as_buffer()
        if buffer is not None:
            out.write(buffer)


def _export_examples(table_dir):
    """Write `<name>.metadata` / `<name>.value` for every row of the table.

    Reads every ``.parquet`` file in `table_dir`; the columns are, in order,
    the example name, the variant column and its JSON rendering. Files are
    written to the current working directory.

    Returns a dict mapping each example name to its decoded JSON value
    (``None`` when the JSON column is null).
    """
    data_dictionary = {}
    for file_name in os.listdir(table_dir):
        if not file_name.endswith('.parquet'):
            continue
        table = pq.read_table(os.path.join(table_dir, file_name))
        for name_scalar, variant_col, json_scalar in zip(table[0], table[1], table[2]):
            # Convert to a plain str up front so that the file names do not
            # depend on how pyarrow formats a scalar.
            name = name_scalar.as_py()
            print("Writing metadata for", name)

            # variants are stored as StructArrays with two fields:
            # metadata, and value
            _write_binary_scalar(f"{name}.metadata", variant_col['metadata'])
            _write_binary_scalar(f"{name}.value", variant_col['value'])

            # Add the JSON representation to the data dictionary
            json_value = json_scalar.as_py()
            data_dictionary[name] = (
                json.loads(json_value) if json_value is not None else None
            )
    return data_dictionary


def main():
    """Regenerate the Variant example files and data_dictionary.json."""
    # Initialize Spark session and create variant data via SQL
    spark = SparkSession.builder \
        .appName("PySpark SQL Example") \
        .getOrCreate()

    _clean_warehouse()
    _run_statements(spark, SQL)

    # extract the values from the parquet files
    data_dictionary = _export_examples(OUTPUT_TABLE_DIR)

    with open("data_dictionary.json", "w") as out:
        out.write(json.dumps(data_dictionary, sort_keys=True, indent=4))


if __name__ == "__main__":
    main()

# Note: It is possible to write the output to a single parquet file, using a command
# such as:
# spark.sql("SELECT * FROM
output").repartition(1).write.parquet('variant.parquet') +# At the time of writing, this file does not have the logical type annotation for VARIANT \ No newline at end of file diff --git a/variant/short_string.metadata b/variant/short_string.metadata new file mode 100644 index 0000000..12db478 Binary files /dev/null and b/variant/short_string.metadata differ diff --git a/variant/short_string.value b/variant/short_string.value new file mode 100644 index 0000000..c403fe2 --- /dev/null +++ b/variant/short_string.value @@ -0,0 +1 @@ +Less than 64 bytes (❤️ with utf8) \ No newline at end of file