Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
Package: versus
Title: Compare Data Frames
Version: 0.3.0.9000
Authors@R:
person("Ryan", "Dickerson", , "fresh.tent5866@fastmail.com", role = c("aut", "cre", "cph"))
Authors@R: c(
person("Ryan", "Dickerson", , "fresh.tent5866@fastmail.com", role = c("aut", "cre", "cph")),
person("Eli", "Pousson", , "eli.pousson@gmail.com", role = "ctb",
comment = c(ORCID = "0000-0001-8280-1706"))
)
Description: A toolset for interactively exploring the differences between two data frames.
License: MIT + file LICENSE
Suggests:
Expand Down
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
# versus (development version)

* New `table_id` argument in `compare()` allows custom table identifiers.
By default, outputs identify tables as "a" and "b". You can now provide
meaningful names e.g. `compare(..., table_id = c("original", "updated"))`, which
are reflected in the output of `compare()` and related functions. @elipousson

* In the output of `compare()` with no `table_id` specified, the table summary
`compare()$tables` uses "a" and "b" as identifiers for the `table` column rather
than "table_a" and "table_b" as used in prior versions. This is for consistency
with the case when custom ids are provided using the new `table_id` argument,
so that `compare()$tables$table` always matches the `table_id` argument.

* `weave_diffs_wide()` gains a `suffix` argument so column names in the wide
output can use custom suffixes instead of `_{table_id}`. The default keeps the
behaviour of prior versions. @elipousson

# versus 0.3.0

# versus 0.3
Expand Down
93 changes: 67 additions & 26 deletions R/compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@
#' @param table_a A data frame
#' @param table_b A data frame
#' @param by <[`tidy-select`][versus_tidy_select]>. Selection of columns to use when matching rows between
#' \code{.data_a} and \code{.data_b}. Both data frames must be unique on \code{by}.
#' \code{table_a} and \code{table_b}. Both data frames must be unique on \code{by}.
#' @param allow_both_NA Logical. If \code{TRUE} a missing value in both data frames is
#' considered as equal
#' @param coerce Logical. If \code{FALSE} and columns from the input tables have
#' differing classes, the function throws an error.
#' @param table_id A character vector of length 2 providing custom identifiers for
#' \code{table_a} and \code{table_b} respectively. These identifiers are used in the
#' output instead of the default "a" and "b".
#'
#' @return
#' \describe{
Expand Down Expand Up @@ -58,22 +61,29 @@

#' @rdname compare
#' @export
compare <- function(table_a, table_b, by, allow_both_NA = TRUE, coerce = TRUE) {
compare <- function(
table_a,
table_b,
by,
allow_both_NA = TRUE,
coerce = TRUE,
table_id = c("a", "b")) {
check_required(by)
by <- enquo(by)
table_id <- clean_table_id(table_id)
table_chr <- names(enquos(table_a, table_b, .named = TRUE))
validate_tables(table_a, table_b, coerce = coerce)

by_names <- get_by_names(table_a, table_b, by = by)

table_summ <- tibble(
table = c("table_a", "table_b"),
table = table_id,
expr = table_chr,
nrow = c(nrow(table_a), nrow(table_b)),
ncol = c(ncol(table_a), ncol(table_b))
)

tbl_contents <- get_contents(table_a, table_b, by = by_names)
tbl_contents <- get_contents(table_a, table_b, by = by_names, table_id = table_id)

matches <- withCallingHandlers(
locate_matches(table_a, table_b, by = by_names),
Expand All @@ -87,29 +97,31 @@ compare <- function(table_a, table_b, by, allow_both_NA = TRUE, coerce = TRUE) {
table_a,
table_b,
by = by_names,
matches = matches
matches = matches,
table_id = table_id
)

tbl_contents$compare$diff_rows <- tbl_contents$compare$column %>%
lapply(get_diff_rows,
table_a = table_a,
table_b = table_b,
matches = matches,
allow_both_NA = allow_both_NA
allow_both_NA = allow_both_NA,
table_id = table_id
)

tbl_contents$compare <- tbl_contents$compare %>%
mutate(n_diffs = map_int(diff_rows, nrow), .after = column)

out <- list(
list(
tables = table_summ,
by = tbl_contents$by,
intersection = tbl_contents$compare,
unmatched_cols = tbl_contents$unmatched_cols,
unmatched_rows = unmatched_rows,
input = store_tables(table_a, table_b)
)
structure(out, class = "vs_comparison")
input = store_tables(table_a, table_b, table_id)
) %>%
structure(class = "vs_comparison")
}

# Methods -----------
Expand Down Expand Up @@ -175,18 +187,18 @@ split_matches <- function(matches) {
)
}

get_unmatched_rows <- function(table_a, table_b, by, matches) {
unmatched <- list(
a = fsubset(table_a, matches$a, by),
b = fsubset(table_b, matches$b, by)
)
unmatched %>%
get_unmatched_rows <- function(table_a, table_b, by, matches, table_id) {
list(
fsubset(table_a, matches$a, by),
fsubset(table_b, matches$b, by)
) %>%
setNames(table_id) %>%
bind_rows(.id = "table") %>%
mutate(row = with(matches, c(a, b))) %>%
as_tibble()
}

converge <- function(table_a, table_b, by, matches) {
converge <- function(table_a, table_b, by, matches, table_id) {
common_cols <- setdiff(intersect(names(table_a), names(table_b)), by)

by_a <- fsubset(table_a, matches$common$a, by)
Expand All @@ -195,20 +207,25 @@ converge <- function(table_a, table_b, by, matches) {

add_vars(
by_a,
frename(common_a, \(nm) paste0(nm, "_a")),
frename(common_b, \(nm) paste0(nm, "_b"))
frename(common_a, \(nm) paste0(nm, "_", table_id[1])),
frename(common_b, \(nm) paste0(nm, "_", table_id[2]))
)
}

join_split <- function(table_a, table_b, by) {
join_split <- function(table_a, table_b, by, table_id) {
matches <- locate_matches(table_a, table_b, by)
intersection <- converge(table_a, table_b, by, matches)
unmatched_rows <- get_unmatched_rows(table_a, table_b, by, matches)
intersection <- converge(table_a, table_b, by, matches, table_id)
unmatched_rows <- get_unmatched_rows(table_a, table_b, by, matches, table_id)
list(intersection = intersection, unmatched_rows = unmatched_rows)
}

get_contents <- function(table_a, table_b, by) {
tbl_contents <- join_split(contents(table_a), contents(table_b), by = "column")
get_contents <- function(table_a, table_b, by, table_id) {
tbl_contents <- join_split(
contents(table_a),
contents(table_b),
by = "column",
table_id = table_id
)
out <- list()

out$by <- tbl_contents$intersection %>%
Expand All @@ -223,9 +240,9 @@ get_contents <- function(table_a, table_b, by) {
out
}

store_tables <- function(table_a, table_b) {
store_tables <- function(table_a, table_b, table_id) {
env <- new_environment()
env$value <- list(a = table_a, b = table_b)
env$value <- list(table_a, table_b) %>% setNames(table_id)
dt_copy <- getOption("versus.copy_data_table", default = FALSE)
if (dt_copy) {
env$value <- env$value %>%
Expand Down Expand Up @@ -257,6 +274,30 @@ rethrow_match_relationship <- function(table_a, table_b, by) {
}
}

clean_table_id <- function(table_id, call = caller_env()) {
if (!is_character(table_id, n = 2)) {
message <- c(
"{.arg table_id} must be a character vector of length 2",
i = "{.arg table_id} is {.obj_type_friendly {table_id}} of length {length(table_id)}"
)
cli_abort(message, call = call)
}
attributes(table_id) <- NULL
new <- table_id %>%
vec_as_names(repair = "universal", quiet = TRUE) %>%
# second vec_as_names() is needed due to vctrs issue #1013
vec_as_names(repair = "unique", quiet = TRUE)
old <- table_id %|% ""
is_changed <- new != old
if (!any(is_changed)) {
return(table_id)
}
bullets <- paste0("`", old[is_changed], "` -> `", new[is_changed], "`")
message <- c("{.arg table_id} has been adjusted", set_names(bullets, "*"))
cli_inform(message = message)
new
}

validate_tables <- function(table_a, table_b, coerce, call = caller_env()) {
assert_data_frame(table_a, call = call)
assert_data_frame(table_b, call = call)
Expand Down
4 changes: 2 additions & 2 deletions R/get-diff-rows.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
get_diff_rows <- function(col, table_a, table_b, matches, allow_both_NA) {
get_diff_rows <- function(col, table_a, table_b, matches, allow_both_NA, table_id) {
col_a <- fsubset(table_a, matches$common$a, col)[[1]]
col_b <- fsubset(table_b, matches$common$b, col)[[1]]
matches$common %>%
fsubset(not_equal(col_a, col_b, allow_both_NA)) %>%
frename(c("row_a", "row_b"))
frename(paste0("row_", table_id))
}

not_equal <- function(col_a, col_b, allow_both_NA) {
Expand Down
17 changes: 9 additions & 8 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@ fsubset <- function(x, i, j, check = FALSE) {
ss(x, i, j, check = check)
}

assert_table_is_a_or_b <- function(table, call = caller_env()) {
check_table_arg <- function(table, comparison, call = caller_env()) {
if (identical(table, quo())) {
cli_abort("`table` is absent but must be supplied.", call = call)
}
table_id <- comparison$tables$table
table_expr <- quo_squash(table)
table_chr <- shorten(deparse(table_expr), 30)
top_msg <- "Problem with argument `table = {table_chr}`"
if (!is_character(table_expr)) {
info <- '`table` must be a single character value: "a" or "b"'
a_or_b <- paste0('"', table_id[1], '" or "', table_id[2], '"')
if (!is_string(table_expr)) {
info <- paste0("`table` must be a single character value: ", a_or_b)
cli_abort(c(top_msg, i = info), call = call)
}
if (!(identical(table_expr, "a") | identical(table_expr, "b"))) {
info <- '`table` must be either "a" or "b"'
if (!(identical(table_expr, table_id[1]) || identical(table_expr, table_id[2]))) {
info <- paste0("`table` must be either ", a_or_b)
cli_abort(c(top_msg, i = info), call = call)
}
}
Expand Down Expand Up @@ -45,10 +47,9 @@ is_ptype_compatible <- function(...) {
!incompatible
}

table_init <- function(comparison, cols = c("intersection", "by"), tbl = c("a", "b")) {
# simulate a data frame with the same classes as table_[tbl]
table_init <- function(comparison, cols = c("intersection", "by"), tbl = 1) {
# simulate a data frame with the same classes as an input table
cols <- arg_match(cols)
tbl <- arg_match(tbl)
fsubset(comparison$input$value[[tbl]], integer(0), comparison[[cols]]$column)
}

Expand Down
11 changes: 8 additions & 3 deletions R/slice-diffs.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#' Get rows with differing values
#'
#' @param comparison The output of \code{compare()}
#' @param table One of \code{"a"} or \code{"b"} indicating which of the tables used to
#' create \code{comparison} should be sliced
#' @param table A string matching one of the identifiers supplied via
#' \code{table_id} when calling \code{compare()} (defaults are \code{"a"} and
#' \code{"b"}). Within the comparison, these identifiers are stored in
#' \code{comparison$tables$table}.
#' @param column <[`tidy-select`][versus_tidy_select]>. A row will be in the output if
#' the comparison shows differing values for any columns matching this argument
#'
Expand All @@ -15,12 +17,15 @@
#' comp |> slice_diffs("a", mpg)
#' comp |> slice_diffs("b", mpg)
#' comp |> slice_diffs("a", c(mpg, disp))
#'
#' comp <- compare(example_df_a, example_df_b, by = car, table_id = c("old", "new"))
#' comp |> slice_diffs("old", mpg)

#' @rdname slice_diffs
#' @export
slice_diffs <- function(comparison, table, column = everything()) {
assert_is_comparison(enquo(comparison))
assert_table_is_a_or_b(enquo(table))
check_table_arg(enquo(table), comparison)
slice_diffs_impl(comparison, table, enquo(column))
}

Expand Down
9 changes: 7 additions & 2 deletions R/slice-unmatched.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,29 @@
#' # slice_unmatched(comp, "a") output is the same as
#' example_df_a |> dplyr::anti_join(example_df_b, by = comp$by$column)
#'
#' comp <- compare(example_df_a, example_df_b, by = car, table_id = c("old", "new"))
#' comp |> slice_unmatched("old")
#'
#' comp |> slice_unmatched_both()

#' @rdname slice_unmatched
#' @export
slice_unmatched <- function(comparison, table) {
assert_is_comparison(enquo(comparison))
assert_table_is_a_or_b(enquo(table))
check_table_arg(enquo(table), comparison)
slice_unmatched_impl(comparison, table)
}

#' @rdname slice_unmatched
#' @export
slice_unmatched_both <- function(comparison) {
assert_is_comparison(enquo(comparison))
table_id <- comparison$tables$table

out_cols <- with(comparison, c(by$column, intersection$column))

c(a = "a", b = "b") %>%
table_id %>%
setNames(table_id) %>%
map(slice_unmatched_impl, comparison = comparison, j = out_cols) %>%
ensure_ptype_compatible() %>%
bind_rows(.id = "table")
Expand Down
Loading