Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions paimon-python/pypaimon/table/row/generic_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,7 @@ def from_bytes(
arity = len(data_fields)
actual_data = bytes_data
if len(bytes_data) >= 4:
arity_from_bytes = struct.unpack('>i', bytes_data[:4])[0]
if 0 < arity_from_bytes < 1000:
actual_data = bytes_data[4:]
actual_data = bytes_data[4:]

fields = []
null_bits_size_in_bytes = cls._calculate_bit_set_width_in_bytes(arity)
Expand Down
76 changes: 76 additions & 0 deletions paimon-python/pypaimon/tests/py36/ao_read_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
from unittest.mock import Mock

import pandas as pd
import numpy as np
import pyarrow as pa

from pypaimon.api.options import Options
from pypaimon.catalog.catalog_context import CatalogContext
from pypaimon import CatalogFactory
Expand Down Expand Up @@ -307,6 +309,80 @@ def test_append_only_multi_write_once_commit(self):
actual = table_sort_by(self._read_test_table(read_builder), 'user_id')
self.assertEqual(actual, self.expected)

def test_over1000cols_read(self):
    """Regression test: round-trip a table with more than 1000 columns.

    Guards against the former GenericRow.from_bytes heuristic that only
    stripped the 4-byte arity header when the decoded arity was < 1000,
    which corrupted rows read from wider tables.
    """
    num_rows = 1
    # Must exceed 1000, otherwise the removed `arity_from_bytes < 1000`
    # heuristic would never have misfired and this test covers nothing.
    num_cols = 1001
    table_name = "default.testBug"

    # Build a schema whose column types cycle by position; c001-c003 are
    # strings, and c003 doubles as the partition key.
    schema_fields = []
    for i in range(1, num_cols + 1):
        col_name = f'c{i:03d}'
        if i in (1, 2, 3):
            schema_fields.append((col_name, pa.string()))  # id / name / category (partition)
        elif i % 4 == 0:
            schema_fields.append((col_name, pa.float64()))
        elif i % 4 == 1:
            schema_fields.append((col_name, pa.int32()))
        elif i % 4 == 2:
            schema_fields.append((col_name, pa.string()))
        else:
            schema_fields.append((col_name, pa.int64()))

    pa_schema = pa.schema(schema_fields)
    schema = Schema.from_pyarrow_schema(
        pa_schema,
        partition_keys=['c003'],  # use c003 as partition key
    )

    self.rest_catalog.create_table(table_name, schema, False)
    table = self.rest_catalog.get_table(table_name)

    # Deterministic test data matching the schema's type cycle.
    np.random.seed(42)
    categories = ['Electronics', 'Clothing', 'Books', 'Home', 'Sports', 'Food', 'Toys', 'Beauty', 'Health', 'Auto']
    statuses = ['Active', 'Inactive', 'Pending', 'Completed']

    test_data = {}
    for i in range(1, num_cols + 1):
        col_name = f'c{i:03d}'
        if i in (1, 2):
            test_data[col_name] = [f'Product_{j}' for j in range(1, num_rows + 1)]
        elif i == 3:
            test_data[col_name] = np.random.choice(categories, num_rows)
        elif i % 4 == 0:
            test_data[col_name] = np.random.uniform(1.0, 1000.0, num_rows).round(2)
        elif i % 4 == 1:
            # Cast explicitly: np.random.randint's default dtype is
            # platform-dependent, while the schema declares int32.
            test_data[col_name] = np.random.randint(1, 100, num_rows).astype(np.int32)
        elif i % 4 == 2:
            test_data[col_name] = np.random.choice(statuses, num_rows)
        else:
            # Epoch-second-like values; pin int64 to match the schema.
            test_data[col_name] = np.random.randint(1640995200, 1672531200, num_rows).astype(np.int64)

    test_df = pd.DataFrame(test_data)

    write_builder = table.new_batch_write_builder()
    table_write = write_builder.new_write()
    table_commit = write_builder.new_commit()

    table_write.write_pandas(test_df)
    table_commit.commit(table_write.prepare_commit())
    table_write.close()
    table_commit.close()

    # Read everything back and require an exact round-trip.
    read_builder = table.new_read_builder()
    table_scan = read_builder.new_scan()
    table_read = read_builder.new_read()
    result = table_read.to_pandas(table_scan.plan().splits())
    self.assertEqual(result.to_dict(), test_df.to_dict())

def testAppendOnlyReaderWithFilter(self):
schema = Schema.from_pyarrow_schema(self.pa_schema, partition_keys=['dt'])
self.rest_catalog.create_table('default.test_append_only_filter', schema, False)
Expand Down
76 changes: 76 additions & 0 deletions paimon-python/pypaimon/tests/reader_append_only_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import unittest

import pyarrow as pa
import numpy as np
import pandas as pd

from pypaimon import CatalogFactory
from pypaimon import Schema
Expand Down Expand Up @@ -113,6 +115,80 @@ def test_append_only_multi_write_once_commit(self):
actual = self._read_test_table(read_builder).sort_by('user_id')
self.assertEqual(actual, self.expected)

def test_over1000cols_read(self):
    """Regression test: round-trip a table with more than 1000 columns.

    Guards against the former GenericRow.from_bytes heuristic that only
    stripped the 4-byte arity header when the decoded arity was < 1000,
    which corrupted rows read from wider tables.
    """
    num_rows = 1
    # Must exceed 1000, otherwise the removed `arity_from_bytes < 1000`
    # heuristic would never have misfired and this test covers nothing.
    num_cols = 1001
    table_name = "default.testBug"

    # Build a schema whose column types cycle by position; c001-c003 are
    # strings, and c003 doubles as the partition key.
    schema_fields = []
    for i in range(1, num_cols + 1):
        col_name = f'c{i:03d}'
        if i in (1, 2, 3):
            schema_fields.append((col_name, pa.string()))  # id / name / category (partition)
        elif i % 4 == 0:
            schema_fields.append((col_name, pa.float64()))
        elif i % 4 == 1:
            schema_fields.append((col_name, pa.int32()))
        elif i % 4 == 2:
            schema_fields.append((col_name, pa.string()))
        else:
            schema_fields.append((col_name, pa.int64()))

    pa_schema = pa.schema(schema_fields)
    schema = Schema.from_pyarrow_schema(
        pa_schema,
        partition_keys=['c003'],  # use c003 as partition key
    )

    self.catalog.create_table(table_name, schema, False)
    table = self.catalog.get_table(table_name)

    # Deterministic test data matching the schema's type cycle.
    np.random.seed(42)
    categories = ['Electronics', 'Clothing', 'Books', 'Home', 'Sports', 'Food', 'Toys', 'Beauty', 'Health', 'Auto']
    statuses = ['Active', 'Inactive', 'Pending', 'Completed']

    test_data = {}
    for i in range(1, num_cols + 1):
        col_name = f'c{i:03d}'
        if i in (1, 2):
            test_data[col_name] = [f'Product_{j}' for j in range(1, num_rows + 1)]
        elif i == 3:
            test_data[col_name] = np.random.choice(categories, num_rows)
        elif i % 4 == 0:
            test_data[col_name] = np.random.uniform(1.0, 1000.0, num_rows).round(2)
        elif i % 4 == 1:
            # Cast explicitly: np.random.randint's default dtype is
            # platform-dependent, while the schema declares int32.
            test_data[col_name] = np.random.randint(1, 100, num_rows).astype(np.int32)
        elif i % 4 == 2:
            test_data[col_name] = np.random.choice(statuses, num_rows)
        else:
            # Epoch-second-like values; pin int64 to match the schema.
            test_data[col_name] = np.random.randint(1640995200, 1672531200, num_rows).astype(np.int64)

    test_df = pd.DataFrame(test_data)

    write_builder = table.new_batch_write_builder()
    table_write = write_builder.new_write()
    table_commit = write_builder.new_commit()

    table_write.write_pandas(test_df)
    table_commit.commit(table_write.prepare_commit())
    table_write.close()
    table_commit.close()

    # Read everything back and require an exact round-trip.
    read_builder = table.new_read_builder()
    table_scan = read_builder.new_scan()
    table_read = read_builder.new_read()
    result = table_read.to_pandas(table_scan.plan().splits())
    self.assertEqual(result.to_dict(), test_df.to_dict())

def testAppendOnlyReaderWithFilter(self):
schema = Schema.from_pyarrow_schema(self.pa_schema, partition_keys=['dt'])
self.catalog.create_table('default.test_append_only_filter', schema, False)
Expand Down