diff --git a/pkg-r/DESCRIPTION b/pkg-r/DESCRIPTION index 8c368369..54101403 100644 --- a/pkg-r/DESCRIPTION +++ b/pkg-r/DESCRIPTION @@ -38,6 +38,8 @@ Imports: whisker Suggests: bsicons, + dbplyr, + dplyr, DT, palmerpenguins, RSQLite, diff --git a/pkg-r/NAMESPACE b/pkg-r/NAMESPACE index 4765fced..2968103c 100644 --- a/pkg-r/NAMESPACE +++ b/pkg-r/NAMESPACE @@ -4,6 +4,7 @@ export(DBISource) export(DataFrameSource) export(DataSource) export(QueryChat) +export(TblLazySource) export(querychat) export(querychat_app) export(querychat_data_source) diff --git a/pkg-r/R/DataSource.R b/pkg-r/R/DataSource.R index dbade311..43eb212b 100644 --- a/pkg-r/R/DataSource.R +++ b/pkg-r/R/DataSource.R @@ -90,11 +90,9 @@ DataSource <- R6::R6Class( #' #' @description #' A DataSource implementation that wraps a data frame using DuckDB for SQL -#' query execution. -#' -#' @details -#' This class creates an in-memory DuckDB connection and registers the provided -#' data frame as a table. All SQL queries are executed against this DuckDB table. +#' query execution. This class creates an in-memory DuckDB connection and +#' registers the provided data frame as a table. All SQL queries are executed +#' against this DuckDB table. #' #' @export #' @examples @@ -222,11 +220,8 @@ DataFrameSource <- R6::R6Class( #' #' @description #' A DataSource implementation for DBI database connections (SQLite, PostgreSQL, -#' MySQL, etc.). -#' -#' @details -#' This class wraps a DBI connection and provides SQL query execution against -#' a specified table in the database. +#' MySQL, etc.). This class wraps a DBI connection and provides SQL query +#' execution against a single table in the database. #' #' @export #' @examples @@ -378,6 +373,182 @@ DBISource <- R6::R6Class( ) +#' Data Source: Lazy Tibble +#' +#' @description +#' A DataSource implementation for lazy tibbles connected to databases via +#' [dbplyr::tbl_sql()] or [dplyr::sql()]. 
+#'
+#' @examplesIf rlang::is_interactive() && rlang::is_installed("dbplyr") && rlang::is_installed("dplyr") && rlang::is_installed("duckdb")
+#' con <- DBI::dbConnect(duckdb::duckdb())
+#' DBI::dbWriteTable(con, "mtcars", mtcars)
+#'
+#' mtcars_source <- TblLazySource$new(dplyr::tbl(con, "mtcars"))
+#' mtcars_source$get_db_type() # "DuckDB"
+#'
+#' result <- mtcars_source$execute_query("SELECT * FROM mtcars WHERE cyl > 4")
+#'
+#' # Note, the result is not the *full* data frame, but a lazy SQL tibble
+#' result
+#'
+#' # You can chain this result into a dplyr pipeline
+#' dplyr::count(result, cyl, gear)
+#'
+#' # Or collect the entire data frame into local memory
+#' dplyr::collect(result)
+#'
+#' # Finally, clean up when done with the database (closes the DB connection)
+#' mtcars_source$cleanup()
+#'
+#' @export
+TblLazySource <- R6::R6Class(
+  "TblLazySource",
+  inherit = DBISource,
+  private = list(
+    tbl = NULL, # the lazy tibble passed to `$new()`, returned by `$get_data()`
+    tbl_cte = NULL # SQL of `tbl`, set only when queries must go through a CTE
+  ),
+  public = list(
+    #' @field table_name Name of the table to be used in SQL queries
+    table_name = NULL,
+
+    #' @description
+    #' Create a new TblLazySource
+    #'
+    #' @param tbl A [dbplyr::tbl_sql()] (or lazy tibble via [dplyr::tbl()]).
+    #' @param table_name Name of the table in the database. Can be a character
+    #'   string, or will be inferred from the `tbl` argument, if possible.
+    #' @return A new TblLazySource object
+    #' @examplesIf rlang::is_interactive() && rlang::is_installed("dbplyr") && rlang::is_installed("dplyr") && rlang::is_installed("RSQLite")
+    #' conn <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+    #' DBI::dbWriteTable(conn, "mtcars", mtcars)
+    #' source <- TblLazySource$new(dplyr::tbl(conn, "mtcars"))
+    initialize = function(tbl, table_name = missing_arg()) {
+      check_installed("dbplyr")
+      check_installed("dplyr")
+
+      if (!inherits(tbl, "tbl_sql")) {
+        cli::cli_abort(
+          "{.arg tbl} must be a lazy tibble connected to a database, not {.obj_type_friendly {tbl}}"
+        )
+      }
+
+      private$conn <- dbplyr::remote_con(tbl)
+      private$tbl <- tbl
+
+      # Expression the caller passed as `tbl`; a table-name candidate of last resort
+      obj_name <- deparse1(substitute(tbl))
+
+      # Get the exact table name, if tbl directly references a single table
+      remote_name <- dbplyr::remote_name(private$tbl)
+
+      use_cte <- FALSE
+
+      if (!is_missing(table_name)) {
+        check_sql_table_name(table_name)
+        self$table_name <- table_name
+        # An alias that differs from the remote table only resolves via a CTE
+        use_cte <- !identical(table_name, remote_name)
+      } else if (!is.null(remote_name)) {
+        # tbl points directly at a table: query it by its real name, no CTE
+        self$table_name <- remote_name
+      } else if (is_valid_sql_table_name(obj_name)) {
+        self$table_name <- obj_name
+        use_cte <- TRUE
+      } else {
+        id <- as.integer(runif(1) * 1e6)
+        self$table_name <- sprintf("querychat_cte_%d", id)
+        use_cte <- TRUE
+      }
+
+      if (use_cte) {
+        # `table_name` is not a real table here, so keep the tbl's SQL for a CTE
+        private$tbl_cte <- dbplyr::remote_query(private$tbl)
+      }
+    },
+
+    #' @description
+    #' Get the database type
+    #'
+    #' @return A string describing the database type (e.g., "DuckDB", "SQLite")
+    get_db_type = function() {
+      super$get_db_type()
+    },
+
+    #' @description
+    #' Get schema information about the table
+    #'
+    #' @param categorical_threshold Maximum number of unique values for a text
+    #'   column to be considered categorical
+
#' @return A string containing schema information formatted for LLM prompts
+    get_schema = function(categorical_threshold = 20) {
+      get_schema_impl(
+        private$conn,
+        self$table_name,
+        categorical_threshold,
+        columns = colnames(private$tbl),
+        prep_query = self$prep_query
+      )
+    },
+
+    #' @description
+    #' Execute a SQL query and return the results as a lazy SQL tibble
+    #'
+    #' @param query SQL query string to execute
+    #' @return A lazy [dplyr::tbl()] of the results; [dplyr::collect()] it for a data frame
+    execute_query = function(query) {
+      sql_query <- self$prep_query(query)
+      dplyr::tbl(private$conn, dplyr::sql(sql_query))
+    },
+
+    #' @description
+    #' Test a SQL query by fetching only one row
+    #'
+    #' @param query SQL query string to test
+    #' @return A data frame containing one row of results (or empty if no matches)
+    test_query = function(query) {
+      super$test_query(self$prep_query(query))
+    },
+
+    #' @description
+    #' Prepare a generic `SELECT * FROM ____` query to work with the SQL tibble
+    #'
+    #' @param query SQL query as a string
+    #' @return A complete SQL query string
+    prep_query = function(query) {
+      check_string(query)
+      # No CTE was captured at construction: return the query unchanged
+      if (is.null(private$tbl_cte)) {
+        return(query)
+      }
+      # Otherwise define `table_name` as a CTE over the tibble's remote query
+      sprintf(
+        "WITH %s AS (\n%s\n)\n%s",
+        DBI::dbQuoteIdentifier(private$conn, self$table_name),
+        private$tbl_cte,
+        query
+      )
+    },
+
+    #' @description
+    #' Get the unfiltered data as a SQL tibble
+    #'
+    #' @return A [dbplyr::tbl_sql()] containing the original, unfiltered data
+    get_data = function() {
+      private$tbl
+    },
+
+    #' @description
+    #' Clean up resources (close connections, etc.)
+ #' + #' @return NULL (invisibly) + cleanup = function() { + super$cleanup() + } + ) +) + + # Helper Functions ------------------------------------------------------------- #' Check if object is a DataSource @@ -390,9 +561,17 @@ is_data_source <- function(x) { } -get_schema_impl <- function(conn, table_name, categorical_threshold = 20) { +get_schema_impl <- function( + conn, + table_name, + categorical_threshold = 20, + columns = NULL, + prep_query = identity +) { + check_function(prep_query) + # Get column information - columns <- DBI::dbListFields(conn, table_name) + columns <- columns %||% DBI::dbListFields(conn, table_name) schema_lines <- c( paste("Table:", DBI::dbQuoteIdentifier(conn, table_name)), @@ -410,7 +589,7 @@ get_schema_impl <- function(conn, table_name, categorical_threshold = 20) { DBI::dbQuoteIdentifier(conn, table_name), " LIMIT 1" ) - sample_data <- DBI::dbGetQuery(conn, sample_query) + sample_data <- DBI::dbGetQuery(conn, prep_query(sample_query)) for (col in columns) { col_class <- class(sample_data[[col]])[1] @@ -460,7 +639,7 @@ get_schema_impl <- function(conn, table_name, categorical_threshold = 20) { " FROM ", DBI::dbQuoteIdentifier(conn, table_name) ) - result <- DBI::dbGetQuery(conn, stats_query) + result <- DBI::dbGetQuery(conn, prep_query(stats_query)) if (nrow(result) > 0) { column_stats <- as.list(result[1, ]) } @@ -505,7 +684,7 @@ get_schema_impl <- function(conn, table_name, categorical_threshold = 20) { " IS NOT NULL ORDER BY ", DBI::dbQuoteIdentifier(conn, col_name) ) - result <- DBI::dbGetQuery(conn, cat_query) + result <- DBI::dbGetQuery(conn, prep_query(cat_query)) if (nrow(result) > 0) { categorical_values[[col_name]] <- result[[1]] } diff --git a/pkg-r/R/QueryChat.R b/pkg-r/R/QueryChat.R index 6ed12a8d..25713636 100644 --- a/pkg-r/R/QueryChat.R +++ b/pkg-r/R/QueryChat.R @@ -160,8 +160,10 @@ QueryChat <- R6::R6Class( check_string(prompt_template, allow_null = TRUE) check_bool(cleanup, allow_na = TRUE) - if 
(is_missing(table_name) && is.data.frame(data_source)) { - table_name <- deparse1(substitute(data_source)) + if (is_missing(table_name)) { + if (is.data.frame(data_source) || inherits(data_source, "tbl_sql")) { + table_name <- deparse1(substitute(data_source)) + } } private$.data_source <- normalize_data_source(data_source, table_name) @@ -338,8 +340,15 @@ QueryChat <- R6::R6Class( }) output$dt <- DT::renderDT({ + df <- qc_vals$df() + if (inherits(df, "tbl_sql")) { + # Materialize the query to get a data frame, {dplyr} guaranteed by + # TblLazySource interface + df <- dplyr::collect(df) + } + DT::datatable( - qc_vals$df(), + df, fillContainer = TRUE, options = list(pageLength = 25, scrollX = TRUE) ) @@ -631,8 +640,10 @@ querychat <- function( prompt_template = NULL, cleanup = NA ) { - if (is_missing(table_name) && is.data.frame(data_source)) { - table_name <- deparse1(substitute(data_source)) + if (is_missing(table_name)) { + if (is.data.frame(data_source) || inherits(data_source, "tbl_sql")) { + table_name <- deparse1(substitute(data_source)) + } } QueryChat$new( @@ -701,6 +712,10 @@ normalize_data_source <- function(data_source, table_name) { return(DataFrameSource$new(data_source, table_name)) } + if (inherits(data_source, "tbl_lazy")) { + return(TblLazySource$new(data_source, table_name)) + } + if (inherits(data_source, "DBIConnection")) { return(DBISource$new(data_source, table_name)) } diff --git a/pkg-r/R/utils-check.R b/pkg-r/R/utils-check.R index 862b61ed..7da059ef 100644 --- a/pkg-r/R/utils-check.R +++ b/pkg-r/R/utils-check.R @@ -32,7 +32,7 @@ check_sql_table_name <- function( check_string(x, allow_null = allow_null, arg = arg, call = call) # Then validate SQL table name pattern - if (!grepl("^[a-zA-Z][a-zA-Z0-9_]*$", x)) { + if (!is_valid_sql_table_name(x)) { cli::cli_abort( c( "{.arg {arg}} must be a valid SQL table name", @@ -45,3 +45,7 @@ check_sql_table_name <- function( invisible(NULL) } + +is_valid_sql_table_name <- function(x) { + 
grepl("^[a-zA-Z][a-zA-Z0-9_]*$", x) +} diff --git a/pkg-r/man/DBISource.Rd b/pkg-r/man/DBISource.Rd index d94adae6..eb38c209 100644 --- a/pkg-r/man/DBISource.Rd +++ b/pkg-r/man/DBISource.Rd @@ -5,11 +5,8 @@ \title{DBI Source} \description{ A DataSource implementation for DBI database connections (SQLite, PostgreSQL, -MySQL, etc.). -} -\details{ -This class wraps a DBI connection and provides SQL query execution against -a specified table in the database. +MySQL, etc.). This class wraps a DBI connection and provides SQL query +execution against a single table in the database. } \examples{ \dontrun{ diff --git a/pkg-r/man/DataFrameSource.Rd b/pkg-r/man/DataFrameSource.Rd index ab2632c1..6a38f210 100644 --- a/pkg-r/man/DataFrameSource.Rd +++ b/pkg-r/man/DataFrameSource.Rd @@ -5,11 +5,9 @@ \title{Data Frame Source} \description{ A DataSource implementation that wraps a data frame using DuckDB for SQL -query execution. -} -\details{ -This class creates an in-memory DuckDB connection and registers the provided -data frame as a table. All SQL queries are executed against this DuckDB table. +query execution. This class creates an in-memory DuckDB connection and +registers the provided data frame as a table. All SQL queries are executed +against this DuckDB table. } \examples{ \dontrun{ diff --git a/pkg-r/man/TblLazySource.Rd b/pkg-r/man/TblLazySource.Rd new file mode 100644 index 00000000..dbf51cc6 --- /dev/null +++ b/pkg-r/man/TblLazySource.Rd @@ -0,0 +1,222 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataSource.R +\name{TblLazySource} +\alias{TblLazySource} +\title{Data Source: Lazy Tibble} +\description{ +A DataSource implementation for lazy tibbles connected to databases via +\code{\link[dbplyr:tbl_sql]{dbplyr::tbl_sql()}} or \code{\link[dplyr:sql]{dplyr::sql()}}. 
+} +\examples{ +\dontshow{if (rlang::is_interactive() && rlang::is_installed("dbplyr") && rlang::is_installed("dplyr") && rlang::is_installed("duckdb")) withAutoprint(\{ # examplesIf} +con <- DBI::dbConnect(duckdb::duckdb()) +DBI::dbWriteTable(con, "mtcars", mtcars) + +mtcars_source <- TblLazySource$new(dplyr::tbl(con, "mtcars")) +mtcars_source$get_db_type() # "DuckDB" + +result <- mtcars_source$execute_query("SELECT * FROM mtcars WHERE cyl > 4") + +# Note, the result is not the *full* data frame, but a lazy SQL tibble +result + +# You can chain this result into a dplyr pipeline +dplyr::count(result, cyl, gear) + +# Or collect the entire data frame into local memory +dplyr::collect(result) + +# Finally, clean up when done with the database (closes the DB connection) +mtcars_source$cleanup() +\dontshow{\}) # examplesIf} +\dontshow{if (rlang::is_interactive() && rlang::is_installed("dbplyr") && rlang::is_installed("dplyr") && rlang::is_installed("RSQLite")) withAutoprint(\{ # examplesIf} +conn <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +DBI::dbWriteTable(conn, "mtcars", mtcars) +source <- TblLazySource$new(dplyr::tbl(conn, "mtcars")) +\dontshow{\}) # examplesIf} +} +\section{Super classes}{ +\code{\link[querychat:DataSource]{querychat::DataSource}} -> \code{\link[querychat:DBISource]{querychat::DBISource}} -> \code{TblLazySource} +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Name of the table to be used in SQL queries} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-TblLazySource-new}{\code{TblLazySource$new()}} +\item \href{#method-TblLazySource-get_db_type}{\code{TblLazySource$get_db_type()}} +\item \href{#method-TblLazySource-get_schema}{\code{TblLazySource$get_schema()}} +\item \href{#method-TblLazySource-execute_query}{\code{TblLazySource$execute_query()}} +\item \href{#method-TblLazySource-test_query}{\code{TblLazySource$test_query()}} +\item \href{#method-TblLazySource-prep_query}{\code{TblLazySource$prep_query()}} +\item \href{#method-TblLazySource-get_data}{\code{TblLazySource$get_data()}} +\item \href{#method-TblLazySource-cleanup}{\code{TblLazySource$cleanup()}} +\item \href{#method-TblLazySource-clone}{\code{TblLazySource$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-new}{}}} +\subsection{Method \code{new()}}{ +Create a new TblLazySource +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$new(tbl, table_name = missing_arg())}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{tbl}}{A \code{\link[dbplyr:tbl_sql]{dbplyr::tbl_sql()}} (or lazy tibble via \code{\link[dplyr:tbl]{dplyr::tbl()}}).} + +\item{\code{table_name}}{Name of the table in the database. Can be a character +string, or will be inferred from the \code{tbl} argument, if possible.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A new TblLazySource object +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-get_db_type}{}}} +\subsection{Method \code{get_db_type()}}{ +Get the database type +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$get_db_type()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +A string describing the database type (e.g., "DuckDB", "SQLite") +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-get_schema}{}}} +\subsection{Method \code{get_schema()}}{ +Get schema information about the table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$get_schema(categorical_threshold = 20)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{categorical_threshold}}{Maximum number of unique values for a text +column to be considered categorical} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A string containing schema information formatted for LLM prompts +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-execute_query}{}}} +\subsection{Method \code{execute_query()}}{ +Execute a SQL query and return results +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$execute_query(query)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{query}}{SQL query string to execute} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A lazy SQL tibble of the query results; use \code{dplyr::collect()} to materialize it as a data frame +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-test_query}{}}} +\subsection{Method \code{test_query()}}{ +Test a SQL query by fetching only one row +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$test_query(query)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{query}}{SQL query string to test} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data frame containing one row of results (or empty if no matches) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-prep_query}{}}} +\subsection{Method \code{prep_query()}}{ +Prepare a generic \verb{SELECT * FROM ____} query to work with the SQL tibble +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$prep_query(query)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{query}}{SQL query as a string} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A complete SQL query string +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-get_data}{}}} +\subsection{Method \code{get_data()}}{ +Get the unfiltered data as a SQL tibble +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$get_data()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +A \code{\link[dbplyr:tbl_sql]{dbplyr::tbl_sql()}} containing the original, unfiltered data +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-cleanup}{}}} +\subsection{Method \code{cleanup()}}{ +Clean up resources (close connections, etc.) +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$cleanup()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +NULL (invisibly) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TblLazySource-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{TblLazySource$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/pkg-r/man/querychat-package.Rd b/pkg-r/man/querychat-package.Rd index 63598eda..7c10e48a 100644 --- a/pkg-r/man/querychat-package.Rd +++ b/pkg-r/man/querychat-package.Rd @@ -75,11 +75,19 @@ Useful links: \itemize{ \item \url{https://posit-dev.github.io/querychat/pkg-r} \item \url{https://posit-dev.github.io/querychat} + \item \url{https://github.com/posit-dev/querychat} + \item Report bugs at \url{https://github.com/posit-dev/querychat/issues} } } \author{ -\strong{Maintainer}: Joe Cheng \email{joe@posit.co} +\strong{Maintainer}: Garrick Aden-Buie \email{garrick@posit.co} (\href{https://orcid.org/0000-0002-7111-0077}{ORCID}) + +Authors: +\itemize{ + \item Joe Cheng \email{joe@posit.co} [conceptor] + \item Carson Sievert \email{carson@posit.co} (\href{https://orcid.org/0000-0002-4958-2844}{ORCID}) +} Other contributors: \itemize{