From 8d862344975839d868d2828538b9f4e08208a7df Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 15 Apr 2026 09:51:09 -0400 Subject: [PATCH 01/10] Add AGENTS.md and enrich __init__.py module docstring Add python/datafusion/AGENTS.md as a comprehensive DataFrame API guide for AI agents and users. It ships with pip automatically (Maturin includes everything under python-source = "python"). Covers core abstractions, import conventions, data loading, all DataFrame operations, expression building, a SQL-to-DataFrame reference table, common pitfalls, idiomatic patterns, and a categorized function index. Enrich the __init__.py module docstring from 2 lines to a full overview with core abstractions, a quick-start example, and a pointer to AGENTS.md. Closes #1394 (PR 1a) Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/AGENTS.md | 497 ++++++++++++++++++++++++++++++++++ python/datafusion/__init__.py | 43 ++- 2 files changed, 536 insertions(+), 4 deletions(-) create mode 100644 python/datafusion/AGENTS.md diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md new file mode 100644 index 000000000..0c168706e --- /dev/null +++ b/python/datafusion/AGENTS.md @@ -0,0 +1,497 @@ +# DataFusion Python DataFrame API Guide + +## What Is DataFusion? + +DataFusion is an **in-process query engine** built on Apache Arrow. It is not a +database -- there is no server, no connection string, and no external +dependencies. You create a `SessionContext`, point it at data (Parquet, CSV, +JSON, Arrow IPC, Pandas, Polars, or raw Python dicts/lists), and run queries +using either SQL or the DataFrame API described below. + +All data flows through **PyArrow** (`pyarrow.RecordBatch` / `pyarrow.Table`), +so any library that speaks Arrow can interoperate with DataFusion. + +## Core Abstractions + +| Abstraction | Role | Key import | +|---|---|---| +| `SessionContext` | Entry point. Loads data, runs SQL, produces DataFrames. 
| `from datafusion import SessionContext` | +| `DataFrame` | Lazy query builder. Each method returns a new DataFrame. | Returned by context methods | +| `Expr` | Expression tree node (column ref, literal, function call, ...). | `from datafusion import col, lit` | +| `functions` | 290+ built-in scalar, aggregate, and window functions. | `from datafusion import functions as F` | + +## Import Conventions + +```python +from datafusion import SessionContext, col, lit +from datafusion import functions as F +``` + +## Data Loading + +```python +ctx = SessionContext() + +# From files +df = ctx.read_parquet("path/to/data.parquet") +df = ctx.read_csv("path/to/data.csv") +df = ctx.read_json("path/to/data.json") + +# From Python objects +df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "y", "z"]}) +df = ctx.from_pylist([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]) +df = ctx.from_pandas(pandas_df) +df = ctx.from_polars(polars_df) +df = ctx.from_arrow(arrow_table) + +# From SQL +df = ctx.sql("SELECT a, b FROM my_table WHERE a > 1") +``` + +To make a DataFrame queryable by name in SQL, register it first: + +```python +ctx.register_parquet("my_table", "path/to/data.parquet") +ctx.register_csv("my_table", "path/to/data.csv") +``` + +## DataFrame Operations Quick Reference + +Every method returns a **new** DataFrame (immutable/lazy). Chain them fluently. 
+ +### Projection + +```python +df.select("a", "b") # by column name +df.select(col("a"), col("b")) # by Expr +df.select(col("a"), (col("b") + 1).alias("b_plus_1")) # with expression + +df.with_column("new_col", col("a") + lit(10)) # add one column +df.with_columns( + col("a").alias("x"), + y=col("b") + lit(1), # named keyword form +) + +df.drop("unwanted_col") +df.with_column_renamed("old_name", "new_name") +``` + +### Filtering + +```python +df.filter(col("a") > lit(10)) +df.filter(col("a") > lit(10), col("b") == lit("x")) # multiple = AND +df.filter("a > 10") # SQL expression string +``` + +### Aggregation + +```python +# GROUP BY a, compute sum(b) and count(*) +df.aggregate([col("a")], [F.sum(col("b")), F.count(col("a"))]) + +# HAVING: filter after aggregate +df.aggregate( + [col("region")], + [F.sum(col("sales")).alias("total_sales")], +).filter(col("total_sales") > lit(1000)) +``` + +### Sorting + +```python +df.sort(col("a").sort(ascending=True, nulls_first=False)) +df.sort(col("a").sort(ascending=False)) # descending +``` + +### Joining + +```python +# Equi-join on shared column name +df1.join(df2, on="key") +df1.join(df2, on="key", how="left") + +# Different column names +df1.join(df2, left_on="id", right_on="fk_id", how="inner") + +# Expression-based join (supports inequality predicates) +df1.join_on(df2, col("a") == col("b"), how="inner") + +# Semi join: keep rows from left where a match exists in right (like EXISTS) +df1.join(df2, on="key", how="semi") + +# Anti join: keep rows from left where NO match exists in right (like NOT EXISTS) +df1.join(df2, on="key", how="anti") +``` + +Join types: `"inner"`, `"left"`, `"right"`, `"full"`, `"semi"`, `"anti"`. 
+ +### Window Functions + +```python +from datafusion import WindowFrame + +# Row number partitioned by group, ordered by value +df.window( + F.row_number( + partition_by=[col("group")], + order_by=[col("value")], + ).alias("rn") +) + +# Using a Window object for reuse +from datafusion.expr import Window + +win = Window( + partition_by=[col("group")], + order_by=[col("value").sort(ascending=True)], +) +df.select( + col("group"), + col("value"), + F.sum(col("value")).over(win).alias("running_total"), +) + +# With explicit frame bounds +win = Window( + partition_by=[col("group")], + order_by=[col("value").sort(ascending=True)], + window_frame=WindowFrame("rows", 0, None), # current row to unbounded following +) +``` + +### Set Operations + +```python +df1.union(df2) # UNION ALL (by position) +df1.union(df2, distinct=True) # UNION DISTINCT +df1.union_by_name(df2) # match columns by name, not position +df1.intersect(df2) # INTERSECT ALL +df1.except_all(df2) # EXCEPT ALL +``` + +### Limit and Offset + +```python +df.limit(10) # first 10 rows +df.limit(10, offset=20) # skip 20, then take 10 +``` + +### Deduplication + +```python +df.distinct() # remove duplicate rows +df.distinct_on( # keep first row per group (like DISTINCT ON in Postgres) + [col("a")], # uniqueness columns + [col("a"), col("b")], # output columns + [col("b").sort(ascending=True)], # which row to keep +) +``` + +## Executing and Collecting Results + +DataFrames are lazy until you collect. 
+ +```python +batches = df.collect() # list[pa.RecordBatch] +table = df.to_arrow_table() # pa.Table +pandas_df = df.to_pandas() # pd.DataFrame +polars_df = df.to_polars() # pl.DataFrame +py_dict = df.to_pydict() # dict[str, list] +py_list = df.to_pylist() # list[dict] +count = df.count() # int + +# Streaming +stream = df.execute_stream() # RecordBatchStream (single partition) +for batch in stream: + process(batch) +``` + +### Writing Results + +```python +df.write_parquet("output/") +df.write_csv("output/") +df.write_json("output/") +``` + +## Expression Building + +### Column References and Literals + +```python +col("column_name") # reference a column +lit(42) # integer literal +lit("hello") # string literal +lit(3.14) # float literal +``` + +### Arithmetic + +```python +col("price") * col("quantity") # multiplication +col("a") + lit(1) # addition +col("a") - col("b") # subtraction +col("a") / lit(2) # division +col("a") % lit(3) # modulo +``` + +### Comparisons + +```python +col("a") > lit(10) +col("a") >= lit(10) +col("a") < lit(10) +col("a") <= lit(10) +col("a") == lit("x") +col("a") != lit("x") +col("a") == None # same as col("a").is_null() +col("a") != None # same as col("a").is_not_null() +``` + +### Boolean Logic + +**Important**: Python's `and`, `or`, `not` keywords do NOT work with Expr +objects. You must use the bitwise operators: + +```python +(col("a") > lit(1)) & (col("b") < lit(10)) # AND +(col("a") > lit(1)) | (col("b") < lit(10)) # OR +~(col("a") > lit(1)) # NOT +``` + +Always wrap each comparison in parentheses when combining with `&`, `|`, `~` +because Python's operator precedence for bitwise operators is different from +logical operators. 
+ +### Null Handling + +```python +col("a").is_null() +col("a").is_not_null() +col("a").fill_null(lit(0)) # replace NULL with a value +F.coalesce(col("a"), col("b")) # first non-null value +F.nullif(col("a"), lit(0)) # return NULL if a == 0 +``` + +### CASE / WHEN + +```python +# Simple CASE (matching on a single expression) +F.case(col("status")) + .when(lit("A"), lit("Active")) + .when(lit("I"), lit("Inactive")) + .otherwise(lit("Unknown")) + +# Searched CASE (each branch has its own predicate) +F.when(col("value") > lit(100), lit("high")) + .when(col("value") > lit(50), lit("medium")) + .otherwise(lit("low")) +``` + +### Casting + +```python +import pyarrow as pa + +col("a").cast(pa.float64()) +col("a").cast(pa.utf8()) +col("a").cast(pa.date32()) +``` + +### Aliasing + +```python +(col("a") + col("b")).alias("total") +``` + +### BETWEEN and IN + +```python +col("a").between(lit(1), lit(10)) # 1 <= a <= 10 +F.in_list(col("a"), [lit(1), lit(2), lit(3)]) # a IN (1, 2, 3) +F.in_list(col("a"), [lit(1), lit(2)], negated=True) # a NOT IN (1, 2) +``` + +### Struct and Array Access + +```python +col("struct_col")["field_name"] # access struct field +col("array_col")[0] # access array element (0-indexed) +col("array_col")[1:3] # array slice (0-indexed) +``` + +## SQL-to-DataFrame Reference + +| SQL | DataFrame API | +|---|---| +| `SELECT a, b` | `df.select("a", "b")` | +| `SELECT a, b + 1 AS c` | `df.select(col("a"), (col("b") + lit(1)).alias("c"))` | +| `SELECT *, a + 1 AS c` | `df.with_column("c", col("a") + lit(1))` | +| `WHERE a > 10` | `df.filter(col("a") > lit(10))` | +| `GROUP BY a` with `SUM(b)` | `df.aggregate([col("a")], [F.sum(col("b"))])` | +| `HAVING sum_b > 100` | `.filter(col("sum_b") > lit(100))` (after aggregate) | +| `ORDER BY a DESC` | `df.sort(col("a").sort(ascending=False))` | +| `LIMIT 10 OFFSET 5` | `df.limit(10, offset=5)` | +| `DISTINCT` | `df.distinct()` | +| `a INNER JOIN b ON a.id = b.id` | `a.join(b, on="id")` | +| `a LEFT JOIN b ON a.id = 
b.fk` | `a.join(b, left_on="id", right_on="fk", how="left")` | +| `WHERE EXISTS (SELECT ...)` | `a.join(b, on="key", how="semi")` | +| `WHERE NOT EXISTS (SELECT ...)` | `a.join(b, on="key", how="anti")` | +| `UNION ALL` | `df1.union(df2)` | +| `UNION` (distinct) | `df1.union(df2, distinct=True)` | +| `INTERSECT` | `df1.intersect(df2)` | +| `EXCEPT` | `df1.except_all(df2)` | +| `CASE x WHEN 1 THEN 'a' END` | `F.case(col("x")).when(lit(1), lit("a")).end()` | +| `CASE WHEN x > 1 THEN 'a' END` | `F.when(col("x") > lit(1), lit("a")).end()` | +| `x IN (1, 2, 3)` | `F.in_list(col("x"), [lit(1), lit(2), lit(3)])` | +| `x BETWEEN 1 AND 10` | `col("x").between(lit(1), lit(10))` | +| `CAST(x AS DOUBLE)` | `col("x").cast(pa.float64())` | +| `ROW_NUMBER() OVER (...)` | `F.row_number(partition_by=[...], order_by=[...])` | +| `SUM(x) OVER (...)` | `F.sum(col("x")).over(window)` | +| `x IS NULL` | `col("x").is_null()` | +| `COALESCE(a, b)` | `F.coalesce(col("a"), col("b"))` | + +## Common Pitfalls + +1. **Boolean operators**: Use `&`, `|`, `~` -- not Python's `and`, `or`, `not`. + Always parenthesize: `(col("a") > lit(1)) & (col("b") < lit(2))`. + +2. **Wrapping scalars with `lit()`**: Comparisons with raw Python values work + (e.g., `col("a") > 10`) because the Expr operators auto-wrap them, but + standalone scalars in function calls need `lit()`: + `F.coalesce(col("a"), lit(0))`, not `F.coalesce(col("a"), 0)`. + +3. **Column name quoting**: Column names are normalized to lowercase by default. + To reference a column with uppercase letters, use double quotes inside the + string: `col('"MyColumn"')`. + +4. **DataFrames are immutable**: Every method returns a **new** DataFrame. You + must capture the return value: + ```python + df = df.filter(col("a") > lit(1)) # correct + df.filter(col("a") > lit(1)) # WRONG -- result is discarded + ``` + +5. **Window frame defaults**: When using `order_by` in a window, the default + frame is `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`. 
For a full
+   partition frame, set `window_frame=WindowFrame("rows", None, None)`.
+
+6. **Aggregate then filter for HAVING**: There is no separate `.having()` method.
+   Use `.filter()` after `.aggregate()`:
+   ```python
+   df.aggregate([col("g")], [F.sum(col("v")).alias("s")]).filter(col("s") > lit(100))
+   ```
+
+## Idiomatic Patterns
+
+### Fluent Chaining
+
+```python
+result = (
+    ctx.read_parquet("data.parquet")
+    .filter(col("year") >= lit(2020))
+    .select(col("region"), col("sales"))
+    .aggregate([col("region")], [F.sum(col("sales")).alias("total")])
+    .sort(col("total").sort(ascending=False))
+    .limit(10)
+)
+```
+
+### Using Variables as CTEs
+
+Instead of SQL CTEs (`WITH ... AS`), assign intermediate DataFrames to
+variables:
+
+```python
+base = ctx.read_parquet("orders.parquet").filter(col("status") == lit("shipped"))
+by_region = base.aggregate([col("region")], [F.sum(col("amount")).alias("total")])
+top_regions = by_region.filter(col("total") > lit(10000))
+```
+
+### Window Functions for Scalar Subqueries
+
+Where SQL uses a correlated scalar subquery, the idiomatic DataFrame approach
+is a window function:
+
+```sql
+-- SQL scalar subquery
+SELECT *, (SELECT SUM(b) FROM t WHERE t.group = s.group) AS group_total FROM s
+```
+
+```python
+# DataFrame: window function
+win = Window(partition_by=[col("group")])
+df = df.with_column("group_total", F.sum(col("b")).over(win))
+```
+
+### Semi/Anti Joins for EXISTS / NOT EXISTS
+
+```python
+# SQL: WHERE EXISTS (SELECT 1 FROM other WHERE other.key = main.key)
+# DataFrame:
+result = main.join(other, on="key", how="semi")
+
+# SQL: WHERE NOT EXISTS (SELECT 1 FROM other WHERE other.key = main.key)
+# DataFrame:
+result = main.join(other, on="key", how="anti")
+```
+
+### Computed Columns
+
+```python
+# Add computed columns while keeping all originals
+df = df.with_column("full_name", F.concat(col("first"), lit(" "), col("last")))
+df = df.with_column("discounted", col("price") * lit(0.9))
+```
+
+## Available 
Functions (Categorized) + +The `functions` module (imported as `F`) provides 290+ functions. Key categories: + +**Aggregate**: `sum`, `avg`, `min`, `max`, `count`, `count_star`, `median`, +`stddev`, `stddev_pop`, `var_samp`, `var_pop`, `corr`, `covar`, `approx_distinct`, +`approx_median`, `approx_percentile_cont`, `array_agg`, `string_agg`, +`first_value`, `last_value`, `bit_and`, `bit_or`, `bit_xor`, `bool_and`, +`bool_or`, `grouping`, `regr_*` (9 regression functions) + +**Window**: `row_number`, `rank`, `dense_rank`, `percent_rank`, `cume_dist`, +`ntile`, `lag`, `lead`, `first_value`, `last_value`, `nth_value` + +**String**: `length`, `lower`, `upper`, `trim`, `ltrim`, `rtrim`, `lpad`, +`rpad`, `starts_with`, `ends_with`, `contains`, `substr`, `replace`, `reverse`, +`repeat`, `split_part`, `concat`, `concat_ws`, `initcap`, `ascii`, `chr`, +`left`, `right`, `strpos`, `translate`, `overlay`, `levenshtein` + +**Math**: `abs`, `ceil`, `floor`, `round`, `trunc`, `sqrt`, `cbrt`, `exp`, +`ln`, `log`, `log2`, `log10`, `pow`, `signum`, `pi`, `random`, `factorial`, +`gcd`, `lcm`, `greatest`, `least`, sin/cos/tan and inverse/hyperbolic variants + +**Date/Time**: `now`, `today`, `current_date`, `current_time`, +`current_timestamp`, `date_part`, `date_trunc`, `date_bin`, `extract`, +`to_timestamp`, `to_timestamp_millis`, `to_timestamp_micros`, +`to_timestamp_nanos`, `to_timestamp_seconds`, `to_unixtime`, `from_unixtime`, +`make_date`, `make_time`, `to_date`, `to_time`, `to_local_time`, `date_format` + +**Conditional**: `case`, `when`, `coalesce`, `nullif`, `ifnull`, `nvl`, `nvl2` + +**Array/List**: `array`, `make_array`, `array_agg`, `array_length`, +`array_element`, `array_slice`, `array_append`, `array_prepend`, +`array_concat`, `array_has`, `array_has_all`, `array_has_any`, `array_position`, +`array_remove`, `array_distinct`, `array_sort`, `array_reverse`, `flatten`, +`array_to_string`, `array_intersect`, `array_union`, `array_except`, +`generate_series` +(Most `array_*` 
functions also have `list_*` aliases.) + +**Struct/Map**: `struct`, `named_struct`, `get_field`, `make_map`, `map_keys`, +`map_values`, `map_entries`, `map_extract` + +**Regex**: `regexp_like`, `regexp_match`, `regexp_replace`, `regexp_count`, +`regexp_instr` + +**Hash**: `md5`, `sha224`, `sha256`, `sha384`, `sha512`, `digest` + +**Type**: `arrow_typeof`, `arrow_cast`, `arrow_metadata` + +**Other**: `in_list`, `order_by`, `alias`, `col`, `encode`, `decode`, +`to_hex`, `to_char`, `uuid`, `version`, `bit_length`, `octet_length` diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 80dfa2fab..b246f065b 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -15,10 +15,45 @@ # specific language governing permissions and limitations # under the License. -"""DataFusion python package. - -This is a Python library that binds to Apache Arrow in-memory query engine DataFusion. -See https://datafusion.apache.org/python for more information. +"""DataFusion: an in-process query engine built on Apache Arrow. + +DataFusion is not a database -- it has no server and no external dependencies. +You create a :py:class:`SessionContext`, point it at data sources (Parquet, CSV, +JSON, Arrow IPC, Pandas, Polars, or raw Python dicts/lists), and run queries +using either SQL or the DataFrame API. + +Core abstractions +----------------- +- **SessionContext** -- entry point for loading data, running SQL, and creating + DataFrames. +- **DataFrame** -- lazy query builder. Every method returns a new DataFrame; + call :py:meth:`~datafusion.dataframe.DataFrame.collect` or a ``to_*`` + method to execute. +- **Expr** -- expression tree node for column references, literals, and function + calls. Build with :py:func:`col` and :py:func:`lit`. +- **functions** -- 290+ built-in scalar, aggregate, and window functions. 
+ +Quick start +----------- +:: + + from datafusion import SessionContext, col, lit + from datafusion import functions as F + + ctx = SessionContext() + df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = ( + df.filter(col("a") > lit(1)) + .with_column("total", col("a") + col("b")) + .aggregate([], [F.sum(col("total")).alias("grand_total")]) + ) + print(result.to_pydict()) # {'grand_total': [16]} + +For a comprehensive guide to the DataFrame API -- including a SQL-to-DataFrame +reference table, expression building, idiomatic patterns, and common pitfalls -- +see the ``AGENTS.md`` file shipped alongside this package. + +Full documentation: https://datafusion.apache.org/python """ from __future__ import annotations From 11bb9b355d8104c03687aa4abc38cef34bee0482 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 15 Apr 2026 09:54:33 -0400 Subject: [PATCH 02/10] Clarify audience of root vs package AGENTS.md The root AGENTS.md (symlinked as CLAUDE.md) is for contributors working on the project. Add a pointer to python/datafusion/AGENTS.md which is the user-facing DataFrame API guide shipped with the package. Also add the Apache license header to the package AGENTS.md. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 9 ++++++++- python/datafusion/AGENTS.md | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 86c2e9c3b..39e39ea38 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,7 +17,14 @@ under the License. --> -# Agent Instructions +# Agent Instructions for Contributors + +This file is for agents working **on** the datafusion-python project (developing, +testing, reviewing). If you need to **use** the DataFusion DataFrame API (write +queries, build expressions, understand available functions), see the user-facing +guide at [`python/datafusion/AGENTS.md`](python/datafusion/AGENTS.md). + +## Skills This project uses AI agent skills stored in `.ai/skills/`. 
Each skill is a directory containing a `SKILL.md` file with instructions for performing a specific task. diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md index 0c168706e..7abc7f57f 100644 --- a/python/datafusion/AGENTS.md +++ b/python/datafusion/AGENTS.md @@ -1,3 +1,22 @@ + + # DataFusion Python DataFrame API Guide ## What Is DataFusion? From 0f601791d31f072b85404480629202b4ebe6eb4e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 15 Apr 2026 10:02:13 -0400 Subject: [PATCH 03/10] Add PR template and pre-commit check guidance to AGENTS.md Document that all PRs must follow .github/pull_request_template.md and that pre-commit hooks must pass before committing. List all configured hooks (actionlint, ruff, ruff-format, cargo fmt, cargo clippy, codespell, uv-lock) and the command to run them manually. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 39e39ea38..c8f984fdd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,6 +33,41 @@ Skills follow the [Agent Skills](https://agentskills.io) open standard. Each ski - `SKILL.md` — The skill definition with YAML frontmatter (name, description, argument-hint) and detailed instructions. - Additional supporting files as needed. +## Pull Requests + +Every pull request must follow the template in +`.github/pull_request_template.md`. The description must include these sections: + +1. **Which issue does this PR close?** — Link the issue with `Closes #NNN`. +2. **Rationale for this change** — Why the change is needed (skip if the issue + already explains it clearly). +3. **What changes are included in this PR?** — Summarize the individual changes. +4. **Are there any user-facing changes?** — Note any changes visible to users + (new APIs, changed behavior, new files shipped in the package, etc.). If + there are breaking changes to public APIs, add the `api change` label. 
+ +## Pre-commit Checks + +Always run pre-commit checks **before** committing. The hooks are defined in +`.pre-commit-config.yaml` and include: + +- **actionlint** — lint GitHub Actions workflow files +- **ruff** — Python linter +- **ruff-format** — Python formatter +- **cargo fmt** — Rust formatter (nightly) +- **cargo clippy** — Rust linter +- **codespell** — spell checker +- **uv-lock** — keep the uv lockfile in sync + +Run all hooks with: + +```bash +pre-commit run --all-files +``` + +Or they will run automatically on `git commit` if pre-commit is installed as a +git hook. Fix any failures before committing. + ## Python Function Docstrings Every Python function must include a docstring with usage examples. From c5f75f5a74c90936b4176ba33f5e8955faa262e7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 15 Apr 2026 10:02:57 -0400 Subject: [PATCH 04/10] Remove duplicated hook list from AGENTS.md Let the hooks be discoverable from .pre-commit-config.yaml rather than maintaining a separate list that can drift. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c8f984fdd..72c830f57 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -49,24 +49,14 @@ Every pull request must follow the template in ## Pre-commit Checks Always run pre-commit checks **before** committing. The hooks are defined in -`.pre-commit-config.yaml` and include: - -- **actionlint** — lint GitHub Actions workflow files -- **ruff** — Python linter -- **ruff-format** — Python formatter -- **cargo fmt** — Rust formatter (nightly) -- **cargo clippy** — Rust linter -- **codespell** — spell checker -- **uv-lock** — keep the uv lockfile in sync - -Run all hooks with: +`.pre-commit-config.yaml` and run automatically on `git commit` if pre-commit +is installed as a git hook. 
To run all hooks manually: ```bash pre-commit run --all-files ``` -Or they will run automatically on `git commit` if pre-commit is installed as a -git hook. Fix any failures before committing. +Fix any failures before committing. ## Python Function Docstrings From 7238c73ed87812fd76c0e3e4a9a2e29ee0a4b2cf Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 15 Apr 2026 12:25:31 -0400 Subject: [PATCH 05/10] Fix AGENTS.md: Arrow C Data Interface, aggregate filter, fluent example - Clarify that DataFusion works with any Arrow C Data Interface implementation, not just PyArrow. - Show the filter keyword argument on aggregate functions (the idiomatic HAVING equivalent) instead of the post-aggregate .filter() pattern. - Update the SQL reference table to show FILTER (WHERE ...) syntax. - Remove the now-incorrect "Aggregate then filter for HAVING" pitfall. - Add .collect() to the fluent chaining example so the result is clearly materialized. Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/AGENTS.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md index 7abc7f57f..7f8c5c0a1 100644 --- a/python/datafusion/AGENTS.md +++ b/python/datafusion/AGENTS.md @@ -27,8 +27,10 @@ dependencies. You create a `SessionContext`, point it at data (Parquet, CSV, JSON, Arrow IPC, Pandas, Polars, or raw Python dicts/lists), and run queries using either SQL or the DataFrame API described below. -All data flows through **PyArrow** (`pyarrow.RecordBatch` / `pyarrow.Table`), -so any library that speaks Arrow can interoperate with DataFusion. +All data flows through **Apache Arrow**. The canonical Python implementation is +PyArrow (`pyarrow.RecordBatch` / `pyarrow.Table`), but any library that +conforms to the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) +can interoperate with DataFusion. 
## Core Abstractions @@ -109,13 +111,17 @@ df.filter("a > 10") # SQL expression string # GROUP BY a, compute sum(b) and count(*) df.aggregate([col("a")], [F.sum(col("b")), F.count(col("a"))]) -# HAVING: filter after aggregate +# HAVING equivalent: use the filter keyword on the aggregate function df.aggregate( [col("region")], - [F.sum(col("sales")).alias("total_sales")], -).filter(col("total_sales") > lit(1000)) + [F.sum(col("sales"), filter=col("sales") > lit(1000)).alias("large_sales")], +) ``` +Most aggregate functions accept an optional `filter` keyword argument. When +provided, only rows where the filter expression is true contribute to the +aggregate. + ### Sorting ```python @@ -349,7 +355,7 @@ col("array_col")[1:3] # array slice (0-indexed) | `SELECT *, a + 1 AS c` | `df.with_column("c", col("a") + lit(1))` | | `WHERE a > 10` | `df.filter(col("a") > lit(10))` | | `GROUP BY a` with `SUM(b)` | `df.aggregate([col("a")], [F.sum(col("b"))])` | -| `HAVING sum_b > 100` | `.filter(col("sum_b") > lit(100))` (after aggregate) | +| `SUM(b) FILTER (WHERE b > 100)` | `F.sum(col("b"), filter=col("b") > lit(100))` | | `ORDER BY a DESC` | `df.sort(col("a").sort(ascending=False))` | | `LIMIT 10 OFFSET 5` | `df.limit(10, offset=5)` | | `DISTINCT` | `df.distinct()` | @@ -396,12 +402,6 @@ col("array_col")[1:3] # array slice (0-indexed) frame is `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`. For a full partition frame, set `window_frame=WindowFrame("rows", None, None)`. -6. **Aggregate then filter for HAVING**: There is no separate `.having()` method. 
- Use `.filter()` after `.aggregate()`: - ```python - df.aggregate([col("g")], [F.sum(col("v")).alias("s")]).filter(col("s") > lit(100)) - ``` - ## Idiomatic Patterns ### Fluent Chaining @@ -414,6 +414,7 @@ result = ( .aggregate([col("region")], [F.sum(col("sales")).alias("total")]) .sort(col("total").sort(ascending=False)) .limit(10) + .collect() ) ``` From 4429a0816eca5e41042613a24fc98500b5721b87 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 16 Apr 2026 10:20:51 -0400 Subject: [PATCH 06/10] Update agents file after working through the first tpc-h query using only the text description --- python/datafusion/AGENTS.md | 52 ++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md index 7f8c5c0a1..b10034dda 100644 --- a/python/datafusion/AGENTS.md +++ b/python/datafusion/AGENTS.md @@ -218,6 +218,7 @@ df.distinct_on( # keep first row per group (like DISTINCT ON in Postgres DataFrames are lazy until you collect. ```python +df.show() # print formatted table to stdout batches = df.collect() # list[pa.RecordBatch] table = df.to_arrow_table() # pa.Table pandas_df = df.to_pandas() # pd.DataFrame @@ -249,8 +250,12 @@ col("column_name") # reference a column lit(42) # integer literal lit("hello") # string literal lit(3.14) # float literal +lit(pa.scalar(value)) # PyArrow scalar (preserves Arrow type) ``` +`lit()` accepts PyArrow scalars directly -- prefer this over converting Arrow +data to Python and back when working with values extracted from query results. + ### Arithmetic ```python @@ -261,6 +266,26 @@ col("a") / lit(2) # division col("a") % lit(3) # modulo ``` +### Date Arithmetic + +`Date32` columns require `Interval` types for arithmetic, not `Duration`. 
Use +PyArrow's `month_day_nano_interval` type, which takes a `(months, days, nanos)` +tuple: + +```python +import pyarrow as pa + +# Subtract 90 days from a date column +col("ship_date") - lit(pa.scalar((0, 90, 0), type=pa.month_day_nano_interval())) + +# Subtract 3 months +col("ship_date") - lit(pa.scalar((3, 0, 0), type=pa.month_day_nano_interval())) +``` + +**Important**: `lit(datetime.timedelta(days=90))` creates a `Duration(µs)` +literal, which is **not** compatible with `Date32` arithmetic. Always use +`pa.month_day_nano_interval()` for date operations. + ### Comparisons ```python @@ -414,8 +439,8 @@ result = ( .aggregate([col("region")], [F.sum(col("sales")).alias("total")]) .sort(col("total").sort(ascending=False)) .limit(10) - .collect() ) +result.show() ``` ### Using Variables as CTEs @@ -429,6 +454,31 @@ by_region = base.aggregate([col("region")], [F.sum(col("amount")).alias("total") top_regions = by_region.filter(col("total") > lit(10000)) ``` +### Reusing Expressions as Variables + +Just like DataFrames, expressions (`Expr`) can be stored in variables and used +anywhere an `Expr` is expected. This is useful for building up complex +expressions or reusing a computed value across multiple operations: + +```python +# Build an expression and reuse it +disc_price = col("price") * (lit(1) - col("discount")) +df = df.select( + col("id"), + disc_price.alias("disc_price"), + (disc_price * (lit(1) + col("tax"))).alias("total"), +) + +# Use a collected scalar as an expression +max_val = result_batch[0].column("max_price")[0] # PyArrow scalar +cutoff = lit(max_val) - lit(pa.scalar((0, 90, 0), type=pa.month_day_nano_interval())) +df = df.filter(col("ship_date") <= cutoff) # cutoff is already an Expr +``` + +**Important**: Do not wrap an `Expr` in `lit()`. `lit()` is for converting +Python/PyArrow values into expressions. If a value is already an `Expr`, use it +directly. 
+ ### Window Functions for Scalar Subqueries Where SQL uses a correlated scalar subquery, the idiomatic DataFrame approach From c620a801584b9c328b1e7a64478dc754d929d19c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 17 Apr 2026 14:03:45 -0400 Subject: [PATCH 07/10] Add feedback from working through each of the TPC-H queries --- python/datafusion/AGENTS.md | 168 ++++++++++++++++++++++++++++-------- 1 file changed, 132 insertions(+), 36 deletions(-) diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md index b10034dda..b1102d438 100644 --- a/python/datafusion/AGENTS.md +++ b/python/datafusion/AGENTS.md @@ -83,9 +83,8 @@ Every method returns a **new** DataFrame (immutable/lazy). Chain them fluently. ### Projection ```python -df.select("a", "b") # by column name -df.select(col("a"), col("b")) # by Expr -df.select(col("a"), (col("b") + 1).alias("b_plus_1")) # with expression +df.select("a", "b") # preferred: plain names as strings +df.select(col("a"), (col("b") + 1).alias("b_plus_1")) # use col()/Expr only when you need an expression df.with_column("new_col", col("a") + lit(10)) # add one column df.with_columns( @@ -97,27 +96,45 @@ df.drop("unwanted_col") df.with_column_renamed("old_name", "new_name") ``` +When a column is referenced by name alone, pass the name as a string rather +than wrapping it in `col()`. Reach for `col()` only when the projection needs +arithmetic, aliasing, casting, or another expression operation. + +**Case sensitivity**: both `select("Name")` and `col("Name")` lowercase the +identifier. For a column whose real name has uppercase letters, embed double +quotes inside the string: `select('"MyCol"')` or `col('"MyCol"')`. Without the +inner quotes the lookup will fail with `No field named mycol`. 
+ ### Filtering ```python -df.filter(col("a") > lit(10)) -df.filter(col("a") > lit(10), col("b") == lit("x")) # multiple = AND -df.filter("a > 10") # SQL expression string +df.filter(col("a") > 10) +df.filter(col("a") > 10, col("b") == "x") # multiple = AND +df.filter("a > 10") # SQL expression string ``` +Raw Python values on the right-hand side of a comparison are auto-wrapped +into literals by the `Expr` operators, so prefer `col("a") > 10` over +`col("a") > lit(10)`. See the Comparisons section and pitfall #2 for the +full rule. + ### Aggregation ```python # GROUP BY a, compute sum(b) and count(*) -df.aggregate([col("a")], [F.sum(col("b")), F.count(col("a"))]) +df.aggregate(["a"], [F.sum(col("b")), F.count(col("a"))]) # HAVING equivalent: use the filter keyword on the aggregate function df.aggregate( - [col("region")], + ["region"], [F.sum(col("sales"), filter=col("sales") > lit(1000)).alias("large_sales")], ) ``` +As with `select()`, group keys can be passed as plain name strings. Reach for +`col(...)` only when the grouping expression needs arithmetic, aliasing, +casting, or another expression operation. + Most aggregate functions accept an optional `filter` keyword argument. When provided, only rows where the filter expression is true contribute to the aggregate. @@ -125,10 +142,15 @@ aggregate. ### Sorting ```python -df.sort(col("a").sort(ascending=True, nulls_first=False)) +df.sort(col("a")) # ascending (default) df.sort(col("a").sort(ascending=False)) # descending +df.sort(col("a").sort(nulls_first=False)) # override null placement ``` +A plain expression passed to `sort()` is already treated as ascending. Only +reach for `col(...).sort(...)` when you need to override a default (descending +order or null placement). Writing `col("a").sort(ascending=True)` is redundant. + ### Joining ```python @@ -151,6 +173,14 @@ df1.join(df2, on="key", how="anti") Join types: `"inner"`, `"left"`, `"right"`, `"full"`, `"semi"`, `"anti"`. +Inner is the default `how`. 
Prefer `df1.join(df2, on="key")` over +`df1.join(df2, on="key", how="inner")` — drop `how=` unless you need a +non-inner join type. + +When the two sides' join columns have different native names, use +`left_on=`/`right_on=` with the original names rather than aliasing one side +to match the other — see pitfall #7. + ### Window Functions ```python @@ -220,6 +250,7 @@ DataFrames are lazy until you collect. ```python df.show() # print formatted table to stdout batches = df.collect() # list[pa.RecordBatch] +arr = df.collect_column("col_name") # pa.Array | pa.ChunkedArray (single column) table = df.to_arrow_table() # pa.Table pandas_df = df.to_pandas() # pd.DataFrame polars_df = df.to_polars() # pl.DataFrame @@ -289,25 +320,30 @@ literal, which is **not** compatible with `Date32` arithmetic. Always use ### Comparisons ```python -col("a") > lit(10) -col("a") >= lit(10) -col("a") < lit(10) -col("a") <= lit(10) -col("a") == lit("x") -col("a") != lit("x") +col("a") > 10 +col("a") >= 10 +col("a") < 10 +col("a") <= 10 +col("a") == "x" +col("a") != "x" col("a") == None # same as col("a").is_null() col("a") != None # same as col("a").is_not_null() ``` +Comparison operators auto-wrap the right-hand Python value into a literal, +so writing `col("a") > lit(10)` is redundant. Drop the `lit()` in +comparisons. Reach for `lit()` only when auto-wrapping does not apply — see +pitfall #2. + ### Boolean Logic **Important**: Python's `and`, `or`, `not` keywords do NOT work with Expr objects. 
You must use the bitwise operators: ```python -(col("a") > lit(1)) & (col("b") < lit(10)) # AND -(col("a") > lit(1)) | (col("b") < lit(10)) # OR -~(col("a") > lit(1)) # NOT +(col("a") > 1) & (col("b") < 10) # AND +(col("a") > 1) | (col("b") < 10) # OR +~(col("a") > 1) # NOT ``` Always wrap each comparison in parentheses when combining with `&`, `|`, `~` @@ -379,7 +415,7 @@ col("array_col")[1:3] # array slice (0-indexed) | `SELECT a, b + 1 AS c` | `df.select(col("a"), (col("b") + lit(1)).alias("c"))` | | `SELECT *, a + 1 AS c` | `df.with_column("c", col("a") + lit(1))` | | `WHERE a > 10` | `df.filter(col("a") > lit(10))` | -| `GROUP BY a` with `SUM(b)` | `df.aggregate([col("a")], [F.sum(col("b"))])` | +| `GROUP BY a` with `SUM(b)` | `df.aggregate(["a"], [F.sum(col("b"))])` | | `SUM(b) FILTER (WHERE b > 100)` | `F.sum(col("b"), filter=col("b") > lit(100))` | | `ORDER BY a DESC` | `df.sort(col("a").sort(ascending=False))` | | `LIMIT 10 OFFSET 5` | `df.limit(10, offset=5)` | @@ -407,26 +443,70 @@ col("array_col")[1:3] # array slice (0-indexed) 1. **Boolean operators**: Use `&`, `|`, `~` -- not Python's `and`, `or`, `not`. Always parenthesize: `(col("a") > lit(1)) & (col("b") < lit(2))`. -2. **Wrapping scalars with `lit()`**: Comparisons with raw Python values work - (e.g., `col("a") > 10`) because the Expr operators auto-wrap them, but - standalone scalars in function calls need `lit()`: - `F.coalesce(col("a"), lit(0))`, not `F.coalesce(col("a"), 0)`. - -3. **Column name quoting**: Column names are normalized to lowercase by default. - To reference a column with uppercase letters, use double quotes inside the - string: `col('"MyColumn"')`. +2. **Wrapping scalars with `lit()`**: Prefer raw Python values on the + right-hand side of comparisons — `col("a") > 10`, `col("name") == "Alice"` + — because the Expr comparison operators auto-wrap them. Writing + `col("a") > lit(10)` is redundant. 
Reserve `lit()` for places where
+   auto-wrapping does *not* apply:
+   - standalone scalars passed into function calls:
+     `F.coalesce(col("a"), lit(0))`, not `F.coalesce(col("a"), 0)`
+   - arithmetic where neither operand is an `Expr`:
+     `1 - col("discount")` auto-wraps, but `lit(1) - lit(2)` needs `lit()` on both
+   - values that must carry a specific Arrow type, via `lit(pa.scalar(...))`
+   - `.when(...)`, `.otherwise(...)`, `F.nullif(...)`, `.between(...)`,
+     `F.in_list(...)` and similar method/function arguments
+
+3. **Column name quoting**: Column names are normalized to lowercase by default
+   in both `select("...")` and `col("...")`. To reference a column with
+   uppercase letters, use double quotes inside the string:
+   `select('"MyColumn"')` or `col('"MyColumn"')`.
 
 4. **DataFrames are immutable**: Every method returns a **new** DataFrame. You
    must capture the return value:
    ```python
-   df = df.filter(col("a") > lit(1))  # correct
-   df.filter(col("a") > lit(1))  # WRONG -- result is discarded
+   df = df.filter(col("a") > 1)  # correct
+   df.filter(col("a") > 1)  # WRONG -- result is discarded
    ```
 
 5. **Window frame defaults**: When using `order_by` in a window, the default
    frame is `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`. For a full
    partition frame, set `window_frame=WindowFrame("rows", None, None)`.
 
+6. **Arithmetic on aggregates belongs in a later `select`, not inside
+   `aggregate`**: Each item in the aggregate list must be a single aggregate
+   call (optionally aliased). Combining aggregates with arithmetic inside
+   `aggregate(...)` fails with `Internal error: Invalid aggregate expression`.
+ Alias the aggregates, then compute the combination downstream: + ```python + # WRONG -- arithmetic wraps two aggregates + df.aggregate([], [(lit(100) * F.sum(col("a")) / F.sum(col("b"))).alias("ratio")]) + + # CORRECT -- aggregate first, then combine + (df.aggregate([], [F.sum(col("a")).alias("num"), F.sum(col("b")).alias("den")]) + .select((lit(100) * col("num") / col("den")).alias("ratio"))) + ``` + +7. **Don't alias a join column to match the other side**: When equi-joining + with `on="key"`, renaming the join column on one side via `.alias("key")` + in a fresh projection creates a schema where one side's `key` is + qualified (`?table?.key`) and the other is unqualified. The join then + fails with `Schema contains qualified field name ... and unqualified + field name ... which would be ambiguous`. Use `left_on=`/`right_on=` with + the native names, or use `join_on(...)` with an explicit equality. + ```python + # WRONG -- alias on one side produces ambiguous schema after join + failed = orders.select(col("o_orderkey").alias("l_orderkey")) + li.join(failed, on="l_orderkey") # ambiguous l_orderkey error + + # CORRECT -- keep native names, use left_on/right_on + failed = orders.select("o_orderkey") + li.join(failed, left_on="l_orderkey", right_on="o_orderkey") + + # ALSO CORRECT -- explicit predicate via join_on + # (note: join_on keeps both key columns in the output, unlike on="key") + li.join_on(failed, col("l_orderkey") == col("o_orderkey")) + ``` + ## Idiomatic Patterns ### Fluent Chaining @@ -436,7 +516,7 @@ result = ( ctx.read_parquet("data.parquet") .filter(col("year") >= lit(2020)) .select(col("region"), col("sales")) - .aggregate([col("region")], [F.sum(col("sales")).alias("total")]) + .aggregate(["region"], [F.sum(col("sales")).alias("total")]) .sort(col("total").sort(ascending=False)) .limit(10) ) @@ -450,7 +530,7 @@ variables: ```python base = ctx.read_parquet("orders.parquet").filter(col("status") == lit("shipped")) -by_region = 
base.aggregate([col("region")], [F.sum(col("amount")).alias("total")]) +by_region = base.aggregate(["region"], [F.sum(col("amount")).alias("total")]) top_regions = by_region.filter(col("total") > lit(10000)) ``` @@ -470,9 +550,9 @@ df = df.select( ) # Use a collected scalar as an expression -max_val = result_batch[0].column("max_price")[0] # PyArrow scalar +max_val = result_df.collect_column("max_price")[0] # PyArrow scalar cutoff = lit(max_val) - lit(pa.scalar((0, 90, 0), type=pa.month_day_nano_interval())) -df = df.filter(col("ship_date") <= cutoff) # cutoff is already an Expr +df = df.filter(col("ship_date") <= cutoff) # cutoff is already an Expr ``` **Important**: Do not wrap an `Expr` in `lit()`. `lit()` is for converting @@ -529,9 +609,25 @@ The `functions` module (imported as `F`) provides 290+ functions. Key categories `ntile`, `lag`, `lead`, `first_value`, `last_value`, `nth_value` **String**: `length`, `lower`, `upper`, `trim`, `ltrim`, `rtrim`, `lpad`, -`rpad`, `starts_with`, `ends_with`, `contains`, `substr`, `replace`, `reverse`, -`repeat`, `split_part`, `concat`, `concat_ws`, `initcap`, `ascii`, `chr`, -`left`, `right`, `strpos`, `translate`, `overlay`, `levenshtein` +`rpad`, `starts_with`, `ends_with`, `contains`, `substr`, `substring`, +`replace`, `reverse`, `repeat`, `split_part`, `concat`, `concat_ws`, +`initcap`, `ascii`, `chr`, `left`, `right`, `strpos`, `translate`, `overlay`, +`levenshtein` + +`F.substr(str, start)` takes **only two arguments** and returns the tail of +the string from `start` onward — passing a third length argument raises +`TypeError: substr() takes 2 positional arguments but 3 were given`. For the +SQL-style 3-arg form (`SUBSTRING(str FROM start FOR length)`), use +`F.substring(col("s"), lit(start), lit(length))`. For a fixed-length prefix, +`F.left(col("s"), lit(n))` is cleanest. 
+ +```python +# WRONG — substr does not accept a length argument +F.substr(col("c_phone"), lit(1), lit(2)) +# CORRECT +F.substring(col("c_phone"), lit(1), lit(2)) # explicit length +F.left(col("c_phone"), lit(2)) # prefix shortcut +``` **Math**: `abs`, `ceil`, `floor`, `round`, `trunc`, `sqrt`, `cbrt`, `exp`, `ln`, `log`, `log2`, `log10`, `pow`, `signum`, `pi`, `random`, `factorial`, From 0465d6e21f717fc066b7e4d6bed216c593f7dc09 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 18 Apr 2026 14:15:18 -0400 Subject: [PATCH 08/10] Address Copilot review feedback on AGENTS.md - Wrap CASE/WHEN method-chain examples in parentheses and assign to a variable so they are valid Python as shown (Copilot #1, #2). - Fix INTERSECT/EXCEPT mapping: the default distinct=False corresponds to INTERSECT ALL / EXCEPT ALL, not the distinct forms. Updated both the Set Operations section and the SQL reference table to show both the ALL and distinct variants (Copilot #4). - Change write_parquet / write_csv / write_json examples to file-style paths (output.parquet, etc.) to match the convention used in existing tests and examples. Note that a directory path is also valid for partitioned output (Copilot #5). 
Verified INTERSECT/EXCEPT semantics with a script: df1.intersect(df2) -> [1, 1, 2] (= INTERSECT ALL) df1.intersect(df2, distinct=True) -> [1, 2] (= INTERSECT) Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/AGENTS.md | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md index b1102d438..1ea2f8a1a 100644 --- a/python/datafusion/AGENTS.md +++ b/python/datafusion/AGENTS.md @@ -222,7 +222,9 @@ df1.union(df2) # UNION ALL (by position) df1.union(df2, distinct=True) # UNION DISTINCT df1.union_by_name(df2) # match columns by name, not position df1.intersect(df2) # INTERSECT ALL +df1.intersect(df2, distinct=True) # INTERSECT (distinct) df1.except_all(df2) # EXCEPT ALL +df1.except_all(df2, distinct=True) # EXCEPT (distinct) ``` ### Limit and Offset @@ -267,11 +269,14 @@ for batch in stream: ### Writing Results ```python -df.write_parquet("output/") -df.write_csv("output/") -df.write_json("output/") +df.write_parquet("output.parquet") +df.write_csv("output.csv") +df.write_json("output.json") ``` +You can also pass a directory path (e.g., `"output/"`) to write a multi-file +partitioned output. 
+ ## Expression Building ### Column References and Literals @@ -364,15 +369,19 @@ F.nullif(col("a"), lit(0)) # return NULL if a == 0 ```python # Simple CASE (matching on a single expression) -F.case(col("status")) +status_label = ( + F.case(col("status")) .when(lit("A"), lit("Active")) .when(lit("I"), lit("Inactive")) .otherwise(lit("Unknown")) +) # Searched CASE (each branch has its own predicate) -F.when(col("value") > lit(100), lit("high")) +severity = ( + F.when(col("value") > lit(100), lit("high")) .when(col("value") > lit(50), lit("medium")) .otherwise(lit("low")) +) ``` ### Casting @@ -426,8 +435,10 @@ col("array_col")[1:3] # array slice (0-indexed) | `WHERE NOT EXISTS (SELECT ...)` | `a.join(b, on="key", how="anti")` | | `UNION ALL` | `df1.union(df2)` | | `UNION` (distinct) | `df1.union(df2, distinct=True)` | -| `INTERSECT` | `df1.intersect(df2)` | -| `EXCEPT` | `df1.except_all(df2)` | +| `INTERSECT ALL` | `df1.intersect(df2)` | +| `INTERSECT` (distinct) | `df1.intersect(df2, distinct=True)` | +| `EXCEPT ALL` | `df1.except_all(df2)` | +| `EXCEPT` (distinct) | `df1.except_all(df2, distinct=True)` | | `CASE x WHEN 1 THEN 'a' END` | `F.case(col("x")).when(lit(1), lit("a")).end()` | | `CASE WHEN x > 1 THEN 'a' END` | `F.when(col("x") > lit(1), lit("a")).end()` | | `x IN (1, 2, 3)` | `F.in_list(col("x"), [lit(1), lit(2), lit(3)])` | From 540503d62243a1921c0d9a32f0c56718601b161c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 18 Apr 2026 14:19:54 -0400 Subject: [PATCH 09/10] Use short-form comparisons in AGENTS.md examples Drop lit() on the RHS of comparison operators since Expr auto-wraps raw Python values, matching the style the guide recommends (Copilot #3, #6). Updates examples in the Aggregation, CASE/WHEN, SQL reference table, Common Pitfalls, Fluent Chaining, and Variables-as-CTEs sections, plus the __init__.py quick-start snippet. Prose explanations of the rule (which cite the long form as the thing to avoid) are left unchanged. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/AGENTS.md | 20 ++++++++++---------- python/datafusion/__init__.py | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/datafusion/AGENTS.md b/python/datafusion/AGENTS.md index 1ea2f8a1a..43913986c 100644 --- a/python/datafusion/AGENTS.md +++ b/python/datafusion/AGENTS.md @@ -127,7 +127,7 @@ df.aggregate(["a"], [F.sum(col("b")), F.count(col("a"))]) # HAVING equivalent: use the filter keyword on the aggregate function df.aggregate( ["region"], - [F.sum(col("sales"), filter=col("sales") > lit(1000)).alias("large_sales")], + [F.sum(col("sales"), filter=col("sales") > 1000).alias("large_sales")], ) ``` @@ -378,8 +378,8 @@ status_label = ( # Searched CASE (each branch has its own predicate) severity = ( - F.when(col("value") > lit(100), lit("high")) - .when(col("value") > lit(50), lit("medium")) + F.when(col("value") > 100, lit("high")) + .when(col("value") > 50, lit("medium")) .otherwise(lit("low")) ) ``` @@ -423,9 +423,9 @@ col("array_col")[1:3] # array slice (0-indexed) | `SELECT a, b` | `df.select("a", "b")` | | `SELECT a, b + 1 AS c` | `df.select(col("a"), (col("b") + lit(1)).alias("c"))` | | `SELECT *, a + 1 AS c` | `df.with_column("c", col("a") + lit(1))` | -| `WHERE a > 10` | `df.filter(col("a") > lit(10))` | +| `WHERE a > 10` | `df.filter(col("a") > 10)` | | `GROUP BY a` with `SUM(b)` | `df.aggregate(["a"], [F.sum(col("b"))])` | -| `SUM(b) FILTER (WHERE b > 100)` | `F.sum(col("b"), filter=col("b") > lit(100))` | +| `SUM(b) FILTER (WHERE b > 100)` | `F.sum(col("b"), filter=col("b") > 100)` | | `ORDER BY a DESC` | `df.sort(col("a").sort(ascending=False))` | | `LIMIT 10 OFFSET 5` | `df.limit(10, offset=5)` | | `DISTINCT` | `df.distinct()` | @@ -440,7 +440,7 @@ col("array_col")[1:3] # array slice (0-indexed) | `EXCEPT ALL` | `df1.except_all(df2)` | | `EXCEPT` (distinct) | `df1.except_all(df2, distinct=True)` | | `CASE x WHEN 1 THEN 'a' END` | `F.case(col("x")).when(lit(1), 
lit("a")).end()` | -| `CASE WHEN x > 1 THEN 'a' END` | `F.when(col("x") > lit(1), lit("a")).end()` | +| `CASE WHEN x > 1 THEN 'a' END` | `F.when(col("x") > 1, lit("a")).end()` | | `x IN (1, 2, 3)` | `F.in_list(col("x"), [lit(1), lit(2), lit(3)])` | | `x BETWEEN 1 AND 10` | `col("x").between(lit(1), lit(10))` | | `CAST(x AS DOUBLE)` | `col("x").cast(pa.float64())` | @@ -452,7 +452,7 @@ col("array_col")[1:3] # array slice (0-indexed) ## Common Pitfalls 1. **Boolean operators**: Use `&`, `|`, `~` -- not Python's `and`, `or`, `not`. - Always parenthesize: `(col("a") > lit(1)) & (col("b") < lit(2))`. + Always parenthesize: `(col("a") > 1) & (col("b") < 2)`. 2. **Wrapping scalars with `lit()`**: Prefer raw Python values on the right-hand side of comparisons — `col("a") > 10`, `col("name") == "Alice"` @@ -525,7 +525,7 @@ col("array_col")[1:3] # array slice (0-indexed) ```python result = ( ctx.read_parquet("data.parquet") - .filter(col("year") >= lit(2020)) + .filter(col("year") >= 2020) .select(col("region"), col("sales")) .aggregate(["region"], [F.sum(col("sales")).alias("total")]) .sort(col("total").sort(ascending=False)) @@ -540,9 +540,9 @@ Instead of SQL CTEs (`WITH ... 
AS`), assign intermediate DataFrames to variables: ```python -base = ctx.read_parquet("orders.parquet").filter(col("status") == lit("shipped")) +base = ctx.read_parquet("orders.parquet").filter(col("status") == "shipped") by_region = base.aggregate(["region"], [F.sum(col("amount")).alias("total")]) -top_regions = by_region.filter(col("total") > lit(10000)) +top_regions = by_region.filter(col("total") > 10000) ``` ### Reusing Expressions as Variables diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index b246f065b..72a02fe15 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -37,13 +37,13 @@ ----------- :: - from datafusion import SessionContext, col, lit + from datafusion import SessionContext, col from datafusion import functions as F ctx = SessionContext() df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) result = ( - df.filter(col("a") > lit(1)) + df.filter(col("a") > 1) .with_column("total", col("a") + col("b")) .aggregate([], [F.sum(col("total")).alias("grand_total")]) ) From d4ca194486c0676b535ce063b66e9a26c2e57245 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 23 Apr 2026 16:10:07 -0400 Subject: [PATCH 10/10] Move user guide from python/datafusion/AGENTS.md to SKILL.md The in-wheel AGENTS.md was not a real distribution channel -- no shipping agent walks site-packages for AGENTS.md files. Moving to SKILL.md at the repo root, with YAML frontmatter, lets the skill ecosystems (npx skills, Claude Code plugin marketplaces, community aggregators) discover it. Update the pointers in the contributor AGENTS.md and the __init__.py module docstring accordingly. The docstring now references the GitHub URL since the file no longer ships with the wheel. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 2 +- python/datafusion/AGENTS.md => SKILL.md | 5 +++++ python/datafusion/__init__.py | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) rename python/datafusion/AGENTS.md => SKILL.md (98%) diff --git a/AGENTS.md b/AGENTS.md index 72c830f57..7d3262710 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -22,7 +22,7 @@ This file is for agents working **on** the datafusion-python project (developing, testing, reviewing). If you need to **use** the DataFusion DataFrame API (write queries, build expressions, understand available functions), see the user-facing -guide at [`python/datafusion/AGENTS.md`](python/datafusion/AGENTS.md). +skill at [`SKILL.md`](SKILL.md). ## Skills diff --git a/python/datafusion/AGENTS.md b/SKILL.md similarity index 98% rename from python/datafusion/AGENTS.md rename to SKILL.md index 43913986c..aa5cb8909 100644 --- a/python/datafusion/AGENTS.md +++ b/SKILL.md @@ -17,6 +17,11 @@ under the License. --> +--- +name: datafusion-python +description: Use when the user is writing datafusion-python (Apache DataFusion Python bindings) DataFrame or SQL code. Covers imports, data loading, DataFrame operations, expression building, SQL-to-DataFrame mappings, idiomatic patterns, and common pitfalls. +--- + # DataFusion Python DataFrame API Guide ## What Is DataFusion? diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 72a02fe15..1cdd84581 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -51,7 +51,8 @@ For a comprehensive guide to the DataFrame API -- including a SQL-to-DataFrame reference table, expression building, idiomatic patterns, and common pitfalls -- -see the ``AGENTS.md`` file shipped alongside this package. +see ``SKILL.md`` at the root of the datafusion-python repository: +https://github.com/apache/datafusion-python/blob/main/SKILL.md Full documentation: https://datafusion.apache.org/python """