From cbf7b99ba159bb48adaf2989fbd7a01e62c2dd54 Mon Sep 17 00:00:00 2001 From: Diego Sevilla Ruiz Date: Wed, 3 Sep 2025 14:21:29 +0200 Subject: [PATCH 1/5] Update from_parquet and read_parquet method signatures. from_parquet and read_parquet are incorrectly described as receiving just a str parameter where they allow to receive also a list of files/globs as a list of str. This fixes #26, although more work is needed because this file is auto-generated. --- duckdb/__init__.pyi | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/duckdb/__init__.pyi b/duckdb/__init__.pyi index adf142dd..91945dfd 100644 --- a/duckdb/__init__.pyi +++ b/duckdb/__init__.pyi @@ -345,8 +345,10 @@ class DuckDBPyConnection: def from_csv_auto(self, path_or_buffer: Union[str, StringIO, TextIOBase], *, header: Optional[bool | int] = None, compression: Optional[str] = None, sep: Optional[str] = None, delimiter: Optional[str] = None, dtype: Optional[Dict[str, str] | List[str]] = None, na_values: Optional[str| List[str]] = None, skiprows: Optional[int] = None, quotechar: Optional[str] = None, escapechar: Optional[str] = None, encoding: Optional[str] = None, parallel: Optional[bool] = None, date_format: Optional[str] = None, timestamp_format: Optional[str] = None, sample_size: Optional[int] = None, all_varchar: Optional[bool] = None, normalize_names: Optional[bool] = None, null_padding: Optional[bool] = None, names: Optional[List[str]] = None, lineterminator: Optional[str] = None, columns: Optional[Dict[str, str]] = None, auto_type_candidates: Optional[List[str]] = None, max_line_size: Optional[int] = None, ignore_errors: Optional[bool] = None, store_rejects: Optional[bool] = None, rejects_table: Optional[str] = None, rejects_scan: Optional[str] = None, rejects_limit: Optional[int] = None, force_not_null: Optional[List[str]] = None, buffer_size: Optional[int] = None, decimal: Optional[str] = None, allow_quoted_nulls: Optional[bool] = None, filename: Optional[bool | str] = 
None, hive_partitioning: Optional[bool] = None, union_by_name: Optional[bool] = None, hive_types: Optional[Dict[str, str]] = None, hive_types_autocast: Optional[bool] = None) -> DuckDBPyRelation: ... def from_df(self, df: pandas.DataFrame) -> DuckDBPyRelation: ... def from_arrow(self, arrow_object: object) -> DuckDBPyRelation: ... - def from_parquet(self, file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... - def read_parquet(self, file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... + # stubgen override + def from_parquet(self, file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... + def read_parquet(self, file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... + # end stubgen override def get_table_names(self, query: str, *, qualified: bool = False) -> Set[str]: ... def install_extension(self, extension: str, *, force_install: bool = False, repository: Optional[str] = None, repository_url: Optional[str] = None, version: Optional[str] = None) -> None: ... def load_extension(self, extension: str) -> None: ... 
@@ -693,8 +695,10 @@ def read_csv(path_or_buffer: Union[str, StringIO, TextIOBase], *, header: Option def from_csv_auto(path_or_buffer: Union[str, StringIO, TextIOBase], *, header: Optional[bool | int] = None, compression: Optional[str] = None, sep: Optional[str] = None, delimiter: Optional[str] = None, dtype: Optional[Dict[str, str] | List[str]] = None, na_values: Optional[str| List[str]] = None, skiprows: Optional[int] = None, quotechar: Optional[str] = None, escapechar: Optional[str] = None, encoding: Optional[str] = None, parallel: Optional[bool] = None, date_format: Optional[str] = None, timestamp_format: Optional[str] = None, sample_size: Optional[int] = None, all_varchar: Optional[bool] = None, normalize_names: Optional[bool] = None, null_padding: Optional[bool] = None, names: Optional[List[str]] = None, lineterminator: Optional[str] = None, columns: Optional[Dict[str, str]] = None, auto_type_candidates: Optional[List[str]] = None, max_line_size: Optional[int] = None, ignore_errors: Optional[bool] = None, store_rejects: Optional[bool] = None, rejects_table: Optional[str] = None, rejects_scan: Optional[str] = None, rejects_limit: Optional[int] = None, force_not_null: Optional[List[str]] = None, buffer_size: Optional[int] = None, decimal: Optional[str] = None, allow_quoted_nulls: Optional[bool] = None, filename: Optional[bool | str] = None, hive_partitioning: Optional[bool] = None, union_by_name: Optional[bool] = None, hive_types: Optional[Dict[str, str]] = None, hive_types_autocast: Optional[bool] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def from_df(df: pandas.DataFrame, *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def from_arrow(arrow_object: object, *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... 
-def from_parquet(file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... -def read_parquet(file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... +# stubgen override +def from_parquet(file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... +def read_parquet(file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... +# end stubgen override def get_table_names(query: str, *, qualified: bool = False, connection: DuckDBPyConnection = ...) -> Set[str]: ... def install_extension(extension: str, *, force_install: bool = False, repository: Optional[str] = None, repository_url: Optional[str] = None, version: Optional[str] = None, connection: DuckDBPyConnection = ...) -> None: ... def load_extension(extension: str, *, connection: DuckDBPyConnection = ...) -> None: ... From b72c6e156af6ab1edf88628c4e06127fa5670485 Mon Sep 17 00:00:00 2001 From: Diego Sevilla Ruiz Date: Thu, 4 Sep 2025 11:25:13 +0200 Subject: [PATCH 2/5] Revert "Update from_parquet and read_parquet method signatures." This reverts commit cbf7b99ba159bb48adaf2989fbd7a01e62c2dd54. 
--- duckdb/__init__.pyi | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/duckdb/__init__.pyi b/duckdb/__init__.pyi index 91945dfd..adf142dd 100644 --- a/duckdb/__init__.pyi +++ b/duckdb/__init__.pyi @@ -345,10 +345,8 @@ class DuckDBPyConnection: def from_csv_auto(self, path_or_buffer: Union[str, StringIO, TextIOBase], *, header: Optional[bool | int] = None, compression: Optional[str] = None, sep: Optional[str] = None, delimiter: Optional[str] = None, dtype: Optional[Dict[str, str] | List[str]] = None, na_values: Optional[str| List[str]] = None, skiprows: Optional[int] = None, quotechar: Optional[str] = None, escapechar: Optional[str] = None, encoding: Optional[str] = None, parallel: Optional[bool] = None, date_format: Optional[str] = None, timestamp_format: Optional[str] = None, sample_size: Optional[int] = None, all_varchar: Optional[bool] = None, normalize_names: Optional[bool] = None, null_padding: Optional[bool] = None, names: Optional[List[str]] = None, lineterminator: Optional[str] = None, columns: Optional[Dict[str, str]] = None, auto_type_candidates: Optional[List[str]] = None, max_line_size: Optional[int] = None, ignore_errors: Optional[bool] = None, store_rejects: Optional[bool] = None, rejects_table: Optional[str] = None, rejects_scan: Optional[str] = None, rejects_limit: Optional[int] = None, force_not_null: Optional[List[str]] = None, buffer_size: Optional[int] = None, decimal: Optional[str] = None, allow_quoted_nulls: Optional[bool] = None, filename: Optional[bool | str] = None, hive_partitioning: Optional[bool] = None, union_by_name: Optional[bool] = None, hive_types: Optional[Dict[str, str]] = None, hive_types_autocast: Optional[bool] = None) -> DuckDBPyRelation: ... def from_df(self, df: pandas.DataFrame) -> DuckDBPyRelation: ... def from_arrow(self, arrow_object: object) -> DuckDBPyRelation: ... 
- # stubgen override - def from_parquet(self, file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... - def read_parquet(self, file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... - # end stubgen override + def from_parquet(self, file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... + def read_parquet(self, file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None) -> DuckDBPyRelation: ... def get_table_names(self, query: str, *, qualified: bool = False) -> Set[str]: ... def install_extension(self, extension: str, *, force_install: bool = False, repository: Optional[str] = None, repository_url: Optional[str] = None, version: Optional[str] = None) -> None: ... def load_extension(self, extension: str) -> None: ... 
@@ -695,10 +693,8 @@ def read_csv(path_or_buffer: Union[str, StringIO, TextIOBase], *, header: Option def from_csv_auto(path_or_buffer: Union[str, StringIO, TextIOBase], *, header: Optional[bool | int] = None, compression: Optional[str] = None, sep: Optional[str] = None, delimiter: Optional[str] = None, dtype: Optional[Dict[str, str] | List[str]] = None, na_values: Optional[str| List[str]] = None, skiprows: Optional[int] = None, quotechar: Optional[str] = None, escapechar: Optional[str] = None, encoding: Optional[str] = None, parallel: Optional[bool] = None, date_format: Optional[str] = None, timestamp_format: Optional[str] = None, sample_size: Optional[int] = None, all_varchar: Optional[bool] = None, normalize_names: Optional[bool] = None, null_padding: Optional[bool] = None, names: Optional[List[str]] = None, lineterminator: Optional[str] = None, columns: Optional[Dict[str, str]] = None, auto_type_candidates: Optional[List[str]] = None, max_line_size: Optional[int] = None, ignore_errors: Optional[bool] = None, store_rejects: Optional[bool] = None, rejects_table: Optional[str] = None, rejects_scan: Optional[str] = None, rejects_limit: Optional[int] = None, force_not_null: Optional[List[str]] = None, buffer_size: Optional[int] = None, decimal: Optional[str] = None, allow_quoted_nulls: Optional[bool] = None, filename: Optional[bool | str] = None, hive_partitioning: Optional[bool] = None, union_by_name: Optional[bool] = None, hive_types: Optional[Dict[str, str]] = None, hive_types_autocast: Optional[bool] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def from_df(df: pandas.DataFrame, *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def from_arrow(arrow_object: object, *, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... 
-# stubgen override -def from_parquet(file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... -def read_parquet(file_or_files_glob: Union[str, List[str]], binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... -# end stubgen override +def from_parquet(file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... +def read_parquet(file_glob: str, binary_as_string: bool = False, *, file_row_number: bool = False, filename: bool = False, hive_partitioning: bool = False, union_by_name: bool = False, compression: Optional[str] = None, connection: DuckDBPyConnection = ...) -> DuckDBPyRelation: ... def get_table_names(query: str, *, qualified: bool = False, connection: DuckDBPyConnection = ...) -> Set[str]: ... def install_extension(extension: str, *, force_install: bool = False, repository: Optional[str] = None, repository_url: Optional[str] = None, version: Optional[str] = None, connection: DuckDBPyConnection = ...) -> None: ... def load_extension(extension: str, *, connection: DuckDBPyConnection = ...) -> None: ... 
From b76a66aa12b5d6bd7bdc1f9a600d74d1a17144e5 Mon Sep 17 00:00:00 2001 From: Diego Sevilla Ruiz Date: Thu, 4 Sep 2025 17:58:28 +0200 Subject: [PATCH 3/5] Fix for #26: - Changed the type of the file_globs parameter of from_parquet and read_parquet in connection_methods.json - Added the generation of @overload functions in the generation wrappers python code. --- scripts/connection_methods.json | 4 ++-- scripts/generate_connection_stubs.py | 17 +++++++++-------- scripts/generate_connection_wrapper_stubs.py | 16 +++++++++------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/scripts/connection_methods.json b/scripts/connection_methods.json index 27705d6a..a87b992f 100644 --- a/scripts/connection_methods.json +++ b/scripts/connection_methods.json @@ -412,7 +412,7 @@ "fetch_record_batch", "arrow" ], - + "function": "FetchRecordBatchReader", "docs": "Fetch an Arrow RecordBatchReader following execute()", "args": [ @@ -992,7 +992,7 @@ "args": [ { "name": "file_globs", - "type": "str" + "type": "List[str]" }, { "name": "binary_as_string", diff --git a/scripts/generate_connection_stubs.py b/scripts/generate_connection_stubs.py index 563ade3d..32831134 100644 --- a/scripts/generate_connection_stubs.py +++ b/scripts/generate_connection_stubs.py @@ -51,8 +51,12 @@ def create_arguments(arguments) -> list: result.append(argument) return result - def create_definition(name, method) -> str: - definition = f"def {name}(" + def create_definition(name, method, overloaded: bool) -> str: + if overloaded: + definition: str = "@overload\n" + else: + definition: str = "" + definition += f"def {name}(" arguments = ['self'] if 'args' in method: arguments.extend(create_arguments(method['args'])) @@ -66,8 +70,8 @@ def create_definition(name, method) -> str: return definition # We have "duplicate" methods, which are overloaded - # maybe we should add @overload to these instead, but this is easier - written_methods = set() + # We keep note of them to add the @overload 
decorator. + overloaded_methods: set[str] = {name for m in connection_methods if isinstance(m['name'], list) for name in m['name']} for method in connection_methods: @@ -75,10 +79,7 @@ def create_definition(name, method) -> str: else: names = [method['name']] for name in names: - if name in written_methods: - continue - body.append(create_definition(name, method)) - written_methods.add(name) + body.append(create_definition(name, method, name in overloaded_methods)) # ---- End of generation code ---- diff --git a/scripts/generate_connection_wrapper_stubs.py b/scripts/generate_connection_wrapper_stubs.py index 94b0e0ee..d1ce50e3 100644 --- a/scripts/generate_connection_wrapper_stubs.py +++ b/scripts/generate_connection_wrapper_stubs.py @@ -66,8 +66,12 @@ def create_arguments(arguments) -> list: result.append(argument) return result - def create_definition(name, method) -> str: - definition = f"def {name}(" + def create_definition(name, method, overloaded: bool) -> str: + if overloaded: + definition: str = "@overload\n" + else: + definition: str = "" + definition += f"def {name}(" arguments = [] if name in SPECIAL_METHOD_NAMES: arguments.append('df: pandas.DataFrame') @@ -84,7 +88,8 @@ def create_definition(name, method) -> str: # We have "duplicate" methods, which are overloaded # maybe we should add @overload to these instead, but this is easier - written_methods = set() + # We keep note of them to add the @overload decorator. 
+ overloaded_methods: set[str] = {name for m in connection_methods if isinstance(m['name'], list) for name in m['name']} body = [] for method in methods: @@ -99,10 +104,7 @@ def create_definition(name, method) -> str: method['kwargs'].append({'name': 'connection', 'type': 'DuckDBPyConnection', 'default': '...'}) for name in names: - if name in written_methods: - continue - body.append(create_definition(name, method)) - written_methods.add(name) + body.append(create_definition(name, method, name in overloaded_methods)) # ---- End of generation code ---- From 65e5e4dc1d942c376d0157762b925d9945725878 Mon Sep 17 00:00:00 2001 From: Diego Sevilla Ruiz Date: Thu, 4 Sep 2025 18:26:20 +0200 Subject: [PATCH 4/5] Fix comment. --- scripts/generate_connection_wrapper_stubs.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/generate_connection_wrapper_stubs.py b/scripts/generate_connection_wrapper_stubs.py index d1ce50e3..64912861 100644 --- a/scripts/generate_connection_wrapper_stubs.py +++ b/scripts/generate_connection_wrapper_stubs.py @@ -86,9 +86,7 @@ def create_definition(name, method, overloaded: bool) -> str: definition += f" -> {method['return']}: ..." return definition - # We have "duplicate" methods, which are overloaded - # maybe we should add @overload to these instead, but this is easier - # We keep note of them to add the @overload decorator. + # We have "duplicate" methods, which are overloaded. 
overloaded_methods: set[str] = {name for m in connection_methods if isinstance(m['name'], list) for name in m['name']} body = [] @@ -104,7 +102,6 @@ def create_definition(name, method, overloaded: bool) -> str: method['kwargs'].append({'name': 'connection', 'type': 'DuckDBPyConnection', 'default': '...'}) for name in names: - body.append(create_definition(name, method, name in overloaded_methods)) # ---- End of generation code ---- From c37d493c1e2b16cdda392202cadc1449382e3490 Mon Sep 17 00:00:00 2001 From: Diego Sevilla Ruiz Date: Thu, 4 Sep 2025 18:29:20 +0200 Subject: [PATCH 5/5] Fix comments again. --- scripts/generate_connection_stubs.py | 2 +- scripts/generate_connection_wrapper_stubs.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/generate_connection_stubs.py b/scripts/generate_connection_stubs.py index 32831134..fbb66c21 100644 --- a/scripts/generate_connection_stubs.py +++ b/scripts/generate_connection_stubs.py @@ -69,7 +69,7 @@ def create_definition(name, method, overloaded: bool) -> str: definition += f" -> {method['return']}: ..." return definition - # We have "duplicate" methods, which are overloaded + # We have "duplicate" methods, which are overloaded. # We keep note of them to add the @overload decorator. overloaded_methods: set[str] = {name for m in connection_methods if isinstance(m['name'], list) for name in m['name']} diff --git a/scripts/generate_connection_wrapper_stubs.py b/scripts/generate_connection_wrapper_stubs.py index 64912861..62c60a84 100644 --- a/scripts/generate_connection_wrapper_stubs.py +++ b/scripts/generate_connection_wrapper_stubs.py @@ -87,6 +87,7 @@ def create_definition(name, method, overloaded: bool) -> str: return definition # We have "duplicate" methods, which are overloaded. + # We keep note of them to add the @overload decorator. 
overloaded_methods: set[str] = {name for m in connection_methods if isinstance(m['name'], list) for name in m['name']} body = [] @@ -102,6 +103,7 @@ def create_definition(name, method, overloaded: bool) -> str: method['kwargs'].append({'name': 'connection', 'type': 'DuckDBPyConnection', 'default': '...'}) for name in names: + body.append(create_definition(name, method, name in overloaded_methods)) # ---- End of generation code ----