Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ future contributors can work without hunting through old context.
## Key concepts

- `compare()` accepts DuckDB relations or pandas/polars DataFrames. The
optional `connection` parameter must be provided when the relations were
created on non-default connections so subsequent helper queries run in the
correct database.
optional `con` parameter must be provided when the relations were created
on non-default connections so subsequent helper queries run in the correct
database.
- The `Comparison` object stores:
- table metadata (`tables`, `by`, `unmatched_cols`, `intersection`)
- internal handles to the temp views plus a mapping of column name to
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,8 @@ comparison.summary()
## Usage

- Call `compare()` with DuckDB relations or pandas/polars DataFrames. If
your relations live on a custom DuckDB connection, pass it via
`connection=` so the comparison queries use the same database.
your relations live on a custom DuckDB connection, pass it via `con=`
so the comparison queries use the same database.
- The `by` columns must uniquely identify rows in each table. When they
do not, `compare()` raises `ComparisonError` and tells you which key
values repeat.
Expand Down
4 changes: 2 additions & 2 deletions README.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ comparison.summary()
## Usage

- Call `compare()` with DuckDB relations or pandas/polars DataFrames. If
your relations live on a custom DuckDB connection, pass it via
`connection=` so the comparison queries use the same database.
your relations live on a custom DuckDB connection, pass it via `con=`
so the comparison queries use the same database.
- The `by` columns must uniquely identify rows in each table. When they
do not, `compare()` raises `ComparisonError` and tells you which key
values repeat.
Expand Down
4 changes: 2 additions & 2 deletions docs_api/source/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Inputs

`compare()` accepts DuckDB relations (tables or views) or pandas/polars
DataFrames. If you provide relations created on a non-default connection,
pass that connection into `compare()` so helper queries run in the same
session.
pass that connection into `compare()` via `con=` so helper queries run in
the same session.

.. code-block:: pycon

Expand Down
5 changes: 2 additions & 3 deletions python/versus/comparison/_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,12 @@ def raise_relation_connection_error(
hint = (
f"`{arg_name}` appears to be bound to a different DuckDB "
"connection than the one passed to `compare()`. Pass the same "
"connection that created the relations via `connection=...`."
"connection that created the relations via `con=...`."
)
else:
hint = (
f"`{arg_name}` appears to be bound to a non-default DuckDB "
"connection. Pass that connection to `compare()` via "
"`connection=...`."
"connection. Pass that connection to `compare()` via `con=...`."
)
raise ComparisonError(hint) from exc

Expand Down
2 changes: 1 addition & 1 deletion python/versus/comparison/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def resolve_connection(
default_conn = duckdb.default_connection
conn_candidate = default_conn() if callable(default_conn) else default_conn
if not isinstance(conn_candidate, duckdb.DuckDBPyConnection):
raise ComparisonError("`connection` must be a DuckDB connection.")
raise ComparisonError("`con` must be a DuckDB connection.")
return VersusConn(conn_candidate)


Expand Down
12 changes: 6 additions & 6 deletions python/versus/comparison/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def compare(
allow_both_na: bool = True,
coerce: bool = True,
table_id: Tuple[str, str] = ("a", "b"),
connection: Optional[duckdb.DuckDBPyConnection] = None,
con: Optional[duckdb.DuckDBPyConnection] = None,
materialize: Literal["all", "summary", "none"] = "all",
) -> Comparison:
"""Compare two DuckDB relations by key columns.
Expand All @@ -45,7 +45,7 @@ def compare(
exact type matches for shared columns.
table_id : tuple[str, str], default ("a", "b")
Labels used in outputs for the two tables.
connection : duckdb.DuckDBPyConnection, optional
con : duckdb.DuckDBPyConnection, optional
DuckDB connection used to register the inputs and run queries.
materialize : {"all", "summary", "none"}, default "all"
Controls which helper tables are materialized upfront.
Expand Down Expand Up @@ -76,16 +76,16 @@ def compare(
"""
materialize_summary, materialize_keys = v.resolve_materialize(materialize)

conn = v.resolve_connection(connection)
conn = v.resolve_connection(con)
clean_ids = v.validate_table_id(table_id)
by_columns = v.normalize_column_list(by, "by", allow_empty=False)
connection_supplied = connection is not None
con_supplied = con is not None
handles = {
clean_ids[0]: i.build_table_handle(
conn, table_a, clean_ids[0], connection_supplied=connection_supplied
conn, table_a, clean_ids[0], connection_supplied=con_supplied
),
clean_ids[1]: i.build_table_handle(
conn, table_b, clean_ids[1], connection_supplied=connection_supplied
conn, table_b, clean_ids[1], connection_supplied=con_supplied
),
}
v.validate_tables(conn, handles, clean_ids, by_columns, coerce=coerce)
Expand Down
48 changes: 24 additions & 24 deletions tests/test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def comparison_from_sql(sql_a: str, sql_b: str, *, by, **kwargs):
con = duckdb.connect()
rel_a = con.sql(sql_a)
rel_b = con.sql(sql_b)
return compare(rel_a, rel_b, by=by, connection=con, **kwargs)
return compare(rel_a, rel_b, by=by, con=con, **kwargs)


def identical_comparison():
Expand All @@ -82,15 +82,15 @@ def identical_comparison():

def test_compare_summary():
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
assert rel_values(comp.tables, "nrow") == [3, 3]
value_row = rel_dicts(comp.intersection.filter("\"column\" = 'value'"))[0]
assert value_row["n_diffs"] == 1


def test_inputs_property_exposes_relations():
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
inputs = comp.inputs
assert isinstance(inputs, dict)
assert "a" in inputs and "b" in inputs
Expand All @@ -104,7 +104,7 @@ def test_compare_accepts_pandas_polars_frames():
con = duckdb.connect()
df_a = pandas.DataFrame({"id": [1, 2], "value": [10, 20]})
df_b = polars.DataFrame({"id": [1, 2], "value": [10, 22]})
comp = compare(df_a, df_b, by=["id"], connection=con)
comp = compare(df_a, df_b, by=["id"], con=con)
value_row = rel_dicts(comp.intersection.filter("\"column\" = 'value'"))[0]
assert value_row["n_diffs"] == 1
comp.close()
Expand All @@ -113,7 +113,7 @@ def test_compare_accepts_pandas_polars_frames():

def test_value_diffs_and_slice():
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
diffs = comp.value_diffs("value")
assert rel_first(diffs, "id") == 2
rows = comp.slice_diffs("a", ["value"])
Expand All @@ -122,14 +122,14 @@ def test_value_diffs_and_slice():

def test_weave_wide():
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
wide = comp.weave_diffs_wide(["value"])
assert "value_a" in wide.columns and "value_b" in wide.columns


def test_slice_unmatched():
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
unmatched = comp.slice_unmatched("a")
assert rel_first(unmatched, "id") == 1
comp.close()
Expand All @@ -141,7 +141,7 @@ def test_compare_accepts_dataframes(module_name):
df_a = module.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
df_b = module.DataFrame({"id": [2, 3, 4], "value": [22, 30, 40]})
con = duckdb.connect()
comp = compare(df_a, df_b, by=["id"], connection=con)
comp = compare(df_a, df_b, by=["id"], con=con)
diffs = comp.value_diffs("value")
assert rel_first(diffs, "id") == 2
comp.close()
Expand All @@ -151,7 +151,7 @@ def test_compare_accepts_dataframes(module_name):
@pytest.mark.parametrize("materialize", ["all", "summary", "none"])
def test_materialize_modes_helpers(materialize):
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con, materialize=materialize)
comp = compare(rel_a, rel_b, by=["id"], con=con, materialize=materialize)
assert rel_values(comp.tables, "nrow") == [3, 3]
diffs_row = rel_dicts(comp.intersection.filter("\"column\" = 'value'"))[0]
assert diffs_row["n_diffs"] == 1
Expand Down Expand Up @@ -183,7 +183,7 @@ def test_materialize_modes_helpers(materialize):
)
def test_materialize_modes_state(materialize, summary_materialized, has_diff_table):
con, rel_a, rel_b = build_connection()
comp = compare(rel_a, rel_b, by=["id"], connection=con, materialize=materialize)
comp = compare(rel_a, rel_b, by=["id"], con=con, materialize=materialize)
assert comp.intersection.materialized is summary_materialized
assert comp.unmatched_rows.materialized is summary_materialized
assert (comp.diff_table is not None) is has_diff_table
Expand Down Expand Up @@ -223,7 +223,7 @@ def test_summary_reports_difference_categories():
) AS t(id, value, note)
"""
)
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
summary = comp.summary()
assert summary.fetchall() == [
("value_diffs", True),
Expand All @@ -240,7 +240,7 @@ def test_summary_repr_shows_full_difference_labels():
examples.example_cars_a(con),
examples.example_cars_b(con),
by=["car"],
connection=con,
con=con,
)
rendered = str(comp.summary())
assert "unmatched_cols" in rendered
Expand Down Expand Up @@ -275,7 +275,7 @@ def test_duplicate_by_raises():
"""
)
with pytest.raises(ComparisonError):
compare(rel_dup, rel_other, by=["id"], connection=con)
compare(rel_dup, rel_other, by=["id"], con=con)


def test_examples_available():
Expand All @@ -284,7 +284,7 @@ def test_examples_available():
examples.example_cars_a(con),
examples.example_cars_b(con),
by=["car"],
connection=con,
con=con,
)
assert rel_dicts(comp.intersection.filter("\"column\" = 'mpg'"))[0]["n_diffs"] == 2
comp.close()
Expand All @@ -296,16 +296,16 @@ def test_compare_errors_when_by_column_missing():
rel_a = con.sql("SELECT 1 AS id, 10 AS value")
rel_b = con.sql("SELECT 1 AS other_id, 10 AS value")
with pytest.raises(ComparisonError):
compare(rel_a, rel_b, by=["id"], connection=con)
compare(rel_a, rel_b, by=["id"], con=con)


def test_compare_errors_on_string_inputs():
con = duckdb.connect()
rel = con.sql("SELECT 1 AS id")
with pytest.raises(ComparisonError, match=r"String inputs are not supported"):
compare(cast(Any, "SELECT 1 AS id"), rel, by=["id"], connection=con)
compare(cast(Any, "SELECT 1 AS id"), rel, by=["id"], con=con)
with pytest.raises(ComparisonError, match=r"String inputs are not supported"):
compare(rel, cast(Any, "SELECT 1 AS id"), by=["id"], connection=con)
compare(rel, cast(Any, "SELECT 1 AS id"), by=["id"], con=con)
con.close()


Expand All @@ -315,7 +315,7 @@ def test_compare_errors_on_duplicate_column_names():
df_b = pandas.DataFrame([[1, 2]], columns=["id", "value"])
con = duckdb.connect()
with pytest.raises(ComparisonError, match=r"duplicate column names"):
compare(df_a, df_b, by=["id"], connection=con)
compare(df_a, df_b, by=["id"], con=con)
con.close()


Expand All @@ -339,27 +339,27 @@ def test_compare_errors_when_table_id_invalid_length():
con, rel_a, rel_b = build_connection()
with pytest.raises(ComparisonError):
bad_table_id = cast(Any, ["x"])
compare(rel_a, rel_b, by=["id"], table_id=bad_table_id, connection=con)
compare(rel_a, rel_b, by=["id"], table_id=bad_table_id, con=con)


def test_compare_errors_when_table_id_duplicates():
con, rel_a, rel_b = build_connection()
with pytest.raises(ComparisonError):
compare(rel_a, rel_b, by=["id"], table_id=("dup", "dup"), connection=con)
compare(rel_a, rel_b, by=["id"], table_id=("dup", "dup"), con=con)


def test_compare_errors_when_table_id_blank():
con, rel_a, rel_b = build_connection()
with pytest.raises(ComparisonError):
compare(rel_a, rel_b, by=["id"], table_id=(" ", "b"), connection=con)
compare(rel_a, rel_b, by=["id"], table_id=(" ", "b"), con=con)


def test_compare_errors_when_materialize_invalid():
con, rel_a, rel_b = build_connection()
with pytest.raises(ComparisonError):
compare(rel_a, rel_b, by=["id"], connection=con, materialize=cast(Any, "nope"))
compare(rel_a, rel_b, by=["id"], con=con, materialize=cast(Any, "nope"))
with pytest.raises(ComparisonError):
compare(rel_a, rel_b, by=["id"], connection=con, materialize=cast(Any, True))
compare(rel_a, rel_b, by=["id"], con=con, materialize=cast(Any, True))
con.close()


Expand Down Expand Up @@ -552,7 +552,7 @@ def test_comparison_repr_snapshot():
con.execute(
"CREATE OR REPLACE TABLE bar AS SELECT * FROM (VALUES (2, 22, 'y'), (3, 30, 'z')) AS t(id, value, extra)"
)
comp = compare(con.table("foo"), con.table("bar"), by=["id"], connection=con)
comp = compare(con.table("foo"), con.table("bar"), by=["id"], con=con)
text = repr(comp)
assert "Comparison(tables=" in text
assert "by=" in text
Expand Down
2 changes: 1 addition & 1 deletion tests/test_slice_diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def comparison_for_slice():
"AS t(id, value, other, note)"
),
by=["id"],
connection=con,
con=con,
)
yield comp
comp.close()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_slice_unmatched.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def comparison_with_unmatched():
con.sql("SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30)) AS t(id, value)"),
con.sql("SELECT * FROM (VALUES (2, 20), (3, 30), (4, 40)) AS t(id, value)"),
by=["id"],
connection=con,
con=con,
)
yield comp
comp.close()
Expand Down Expand Up @@ -45,7 +45,7 @@ def test_slice_unmatched_respects_custom_table_id():
con.sql("SELECT * FROM (VALUES (2, 20), (3, 30), (4, 40)) AS t(id, value)"),
by=["id"],
table_id=("left", "right"),
connection=con,
con=con,
)
left = comp.slice_unmatched("left")
assert rel_values(left, "id") == [1]
Expand Down
8 changes: 4 additions & 4 deletions tests/test_value_diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def comparison_with_diffs():
) AS t(id, value, wind, note)
"""
)
comp = compare(rel_a, rel_b, by=["id"], connection=con)
comp = compare(rel_a, rel_b, by=["id"], con=con)
yield comp
comp.close()
con.close()
Expand Down Expand Up @@ -70,7 +70,7 @@ def test_value_diffs_stacked_errors_when_no_value_columns():
con.sql("SELECT * FROM (VALUES (1, 'x')) AS t(id, tag)"),
con.sql("SELECT * FROM (VALUES (1, 'x')) AS t(id, tag)"),
by=["id", "tag"],
connection=con,
con=con,
)
with pytest.raises(ComparisonError):
comp.value_diffs_stacked()
Expand Down Expand Up @@ -100,7 +100,7 @@ def test_value_diffs_stacked_handles_incompatible_types():
"AS t(id, alpha, beta)"
),
by=["id"],
connection=con,
con=con,
)
out = comp.value_diffs_stacked(["alpha", "beta"])
assert set(rel_values(out, "column")) == {"alpha", "beta"}
Expand All @@ -115,7 +115,7 @@ def test_value_diffs_respects_custom_table_ids():
con.sql("SELECT * FROM (VALUES (1, 15), (2, 20)) AS t(id, value)"),
by=["id"],
table_id=("original", "updated"),
connection=con,
con=con,
)
out = comp.value_diffs("value")
assert {"value_original", "value_updated"}.issubset(set(out.columns))
Expand Down
Loading