diff --git a/tpcds/data/sf1/call_center.parquet b/tpcds/data/sf1/call_center.parquet index a6579e4..b25f50f 100644 Binary files a/tpcds/data/sf1/call_center.parquet and b/tpcds/data/sf1/call_center.parquet differ diff --git a/tpcds/data/sf1/catalog_page.parquet b/tpcds/data/sf1/catalog_page.parquet index 6440c1b..723d135 100644 Binary files a/tpcds/data/sf1/catalog_page.parquet and b/tpcds/data/sf1/catalog_page.parquet differ diff --git a/tpcds/data/sf1/catalog_returns.parquet b/tpcds/data/sf1/catalog_returns.parquet index 58014bf..9919265 100644 Binary files a/tpcds/data/sf1/catalog_returns.parquet and b/tpcds/data/sf1/catalog_returns.parquet differ diff --git a/tpcds/data/sf1/catalog_sales.parquet b/tpcds/data/sf1/catalog_sales.parquet index cc6c517..34aaa1a 100644 Binary files a/tpcds/data/sf1/catalog_sales.parquet and b/tpcds/data/sf1/catalog_sales.parquet differ diff --git a/tpcds/data/sf1/customer.parquet b/tpcds/data/sf1/customer.parquet index 6273e19..19b988d 100644 Binary files a/tpcds/data/sf1/customer.parquet and b/tpcds/data/sf1/customer.parquet differ diff --git a/tpcds/data/sf1/customer_address.parquet b/tpcds/data/sf1/customer_address.parquet index 360ef32..e446605 100644 Binary files a/tpcds/data/sf1/customer_address.parquet and b/tpcds/data/sf1/customer_address.parquet differ diff --git a/tpcds/data/sf1/customer_demographics.parquet b/tpcds/data/sf1/customer_demographics.parquet index 2a2cfbf..87e7d5d 100644 Binary files a/tpcds/data/sf1/customer_demographics.parquet and b/tpcds/data/sf1/customer_demographics.parquet differ diff --git a/tpcds/data/sf1/date_dim.parquet b/tpcds/data/sf1/date_dim.parquet index 46fbbce..5d757e7 100644 Binary files a/tpcds/data/sf1/date_dim.parquet and b/tpcds/data/sf1/date_dim.parquet differ diff --git a/tpcds/data/sf1/household_demographics.parquet b/tpcds/data/sf1/household_demographics.parquet index 9a59a22..ecdb56a 100644 Binary files a/tpcds/data/sf1/household_demographics.parquet and b/tpcds/data/sf1/household_demographics.parquet differ diff --git a/tpcds/data/sf1/income_band.parquet b/tpcds/data/sf1/income_band.parquet index db1838f..5995900 100644 Binary files a/tpcds/data/sf1/income_band.parquet and b/tpcds/data/sf1/income_band.parquet differ diff --git a/tpcds/data/sf1/inventory.parquet b/tpcds/data/sf1/inventory.parquet index 13270c2..cbfbff5 100644 Binary files a/tpcds/data/sf1/inventory.parquet and b/tpcds/data/sf1/inventory.parquet differ diff --git a/tpcds/data/sf1/item.parquet b/tpcds/data/sf1/item.parquet index 8c12ee6..484be9a 100644 Binary files a/tpcds/data/sf1/item.parquet and b/tpcds/data/sf1/item.parquet differ diff --git a/tpcds/data/sf1/promotion.parquet b/tpcds/data/sf1/promotion.parquet index 31ffe38..5f17ddd 100644 Binary files a/tpcds/data/sf1/promotion.parquet and b/tpcds/data/sf1/promotion.parquet differ diff --git a/tpcds/data/sf1/reason.parquet b/tpcds/data/sf1/reason.parquet index 1b41b04..98686db 100644 Binary files a/tpcds/data/sf1/reason.parquet and b/tpcds/data/sf1/reason.parquet differ diff --git a/tpcds/data/sf1/ship_mode.parquet b/tpcds/data/sf1/ship_mode.parquet index 1d41ec4..e697111 100644 Binary files a/tpcds/data/sf1/ship_mode.parquet and b/tpcds/data/sf1/ship_mode.parquet differ diff --git a/tpcds/data/sf1/store.parquet b/tpcds/data/sf1/store.parquet index ad1e234..02dc00b 100644 Binary files a/tpcds/data/sf1/store.parquet and b/tpcds/data/sf1/store.parquet differ diff --git a/tpcds/data/sf1/store_returns.parquet b/tpcds/data/sf1/store_returns.parquet index d9c6a52..0059d36 100644 Binary files a/tpcds/data/sf1/store_returns.parquet and b/tpcds/data/sf1/store_returns.parquet differ diff --git a/tpcds/data/sf1/store_sales.parquet b/tpcds/data/sf1/store_sales.parquet index 5219340..dd99978 100644 Binary files a/tpcds/data/sf1/store_sales.parquet and b/tpcds/data/sf1/store_sales.parquet differ diff --git a/tpcds/data/sf1/time_dim.parquet b/tpcds/data/sf1/time_dim.parquet index 6c59272..025f089 100644 Binary files a/tpcds/data/sf1/time_dim.parquet and b/tpcds/data/sf1/time_dim.parquet differ diff --git a/tpcds/data/sf1/warehouse.parquet b/tpcds/data/sf1/warehouse.parquet index 856d07a..3cadc10 100644 Binary files a/tpcds/data/sf1/warehouse.parquet and b/tpcds/data/sf1/warehouse.parquet differ diff --git a/tpcds/data/sf1/web_page.parquet b/tpcds/data/sf1/web_page.parquet index f350a2d..a0b607d 100644 Binary files a/tpcds/data/sf1/web_page.parquet and b/tpcds/data/sf1/web_page.parquet differ diff --git a/tpcds/data/sf1/web_returns.parquet b/tpcds/data/sf1/web_returns.parquet index d0b5b77..91da0a4 100644 Binary files a/tpcds/data/sf1/web_returns.parquet and b/tpcds/data/sf1/web_returns.parquet differ diff --git a/tpcds/data/sf1/web_sales.parquet b/tpcds/data/sf1/web_sales.parquet index fdcd5c3..648cd31 100644 Binary files a/tpcds/data/sf1/web_sales.parquet and b/tpcds/data/sf1/web_sales.parquet differ diff --git a/tpcds/data/sf1/web_site.parquet b/tpcds/data/sf1/web_site.parquet index ee9c79c..0771c33 100644 Binary files a/tpcds/data/sf1/web_site.parquet and b/tpcds/data/sf1/web_site.parquet differ diff --git a/tpcds/tpcdsgen.py b/tpcds/tpcdsgen.py index fa8f55c..f456bc9 100644 --- a/tpcds/tpcdsgen.py +++ b/tpcds/tpcdsgen.py @@ -565,19 +565,24 @@ def convert_dat_to_parquet(ctx: SessionContext, table: str, dat_filename: str, f print(f"Converting {dat_filename} to {parquet_filename} ...") table_schema = all_schemas[table].copy() - - # Pre-collect the output columns so we can ignore the null field we add - # in to handle the trailing | in the file output_cols = [r.name for r in table_schema] - # Trailing | requires extra field for in processing - table_schema.append(pyarrow.field("some_null", pyarrow.null(), nullable=True)) + # Detect trailing | delimiter (older dsdgen versions add it as a field + # terminator, creating an extra empty field beyond the schema columns) + with open(dat_filename, 'r') as f: + first_line = f.readline().rstrip('\n') + num_csv_fields = len(first_line.split('|')) + has_trailing_pipe = num_csv_fields > len(table_schema) + + if has_trailing_pipe: + # Trailing | requires extra field for processing + table_schema.append(pyarrow.field("some_null", pyarrow.null(), nullable=True)) schema = pyarrow.schema(table_schema) df = ctx.read_csv(dat_filename, schema=schema, has_header=False, file_extension=file_extension, delimiter="|") - df = df.select_columns(*output_cols) - df.write_parquet(parquet_filename, compression="snappy") + df = df.select(*output_cols) + df.write_parquet(parquet_filename, compression="zstd", compression_level=19) def generate_tpcds(scale_factor: int, partitions: int): pass