From 16a1de2c4d065183716bb1cf36dc8286461a7bc7 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 02:21:18 -0400 Subject: [PATCH 01/11] add birthday post --- .../index.qmd | 409 ++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd new file mode 100644 index 0000000..6525d70 --- /dev/null +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -0,0 +1,409 @@ +--- +title: "Happy birthday, data.table" +author: "Toby Hocking" +date: "2026-04-15" +categories: [tips, tutorials, developer] +draft: false +image: "copy_on_modify.png" +--- + +```{r Ropts, echo=FALSE} +knitr::opts_chunk$set( + dpi=100, + fig.width=9, + fig.height=4) +``` + +Happy birthday, `data.table`! + +# New analysis of authors and contributors + +Since this is the 20th anniversary of Matt’s original CRAN submission, I wanted to do some analysis of contributors over time, to emphasize the great community that has been working to improve `data.table` in recent years. +To do that, we first download data on all releases, using code from [my previous post](https://tdhock.github.io/blog/2022/release-history/) about the release history of `data.table`. 
+ +## Download Archive web page + +We can download the Archive web page for `data.table` via the code below, + +```{r} +Archive <- "https://cloud.r-project.org/src/contrib/Archive/" +get_Archive <- function(Package, releases.dir="~/releases"){ + dir.create(releases.dir, showWarnings = FALSE) + pkg.html <- file.path(releases.dir, paste0(Package, ".html")) + if(!file.exists(pkg.html)){ + u <- paste0(Archive, Package) + download.file(u, pkg.html) + } + readLines(pkg.html) +} +(Archive.data.table <- get_Archive("data.table")) +``` + +The output above shows that the Archive web page has a regular structure, which we can convert into a data table using the regular expression pattern below. + +```{r} +file.pattern <- list( + '(?<=>)', + package=".*?", + "_", + version="[0-9.-]+", + "[.]tar[.]gz") +``` + +The code above specifies a regular expression: + +* `'(?<=>)'` is a lookbehind assertion. It means to start by looking for a greater than sign, but not including that character in the match. +* `package=".*?"` means to match zero or more of anything except newline (non-greedy, as few as possible), and output the match in the `package` column, +* `"_"` means to match an underscore, +* `version="[0-9.-]+"` means to match one or more digits/dots/dashes, and + output them in the `version` column, +* `"[.]tar[.]gz"` means to match the `.tar.gz` file name suffix. 
+ +Below we use that pattern to convert the web page into a data table with two columns, + +```{r} +nc::capture_all_str(Archive.data.table, file.pattern) +``` + +Next, we add to the pattern to match the release date, + +```{r} +Archive.pattern <- list( + file=file.pattern, + "", + "\\s+", + date.str=".*?", + "\\s") +``` + +The code above has + +* `file=file.pattern` which means to apply the previous regex, and put the matching text in the `file` column, +* `""` which matches the closing `` tag +* `"\\s+"` which matches one or more white space characters, +* `date.str=".*?"` which matches zero or more characters (non-greedy, + as few as possible), and output them in the `date.str` column, +* `"\\s"` means to match one white space character. + +The end result is a table with one row for each matched package version, and one column for each of the named arguments: + +```{r} +(Archive.dt <- nc::capture_all_str(Archive.data.table, Archive.pattern)) +``` + +Above the table shows all matches, in the same order as the original Archive web page. +Below we key the table by date, which sorts and enables fast joins. + +```{r} +Archive.dt[, IDate := as.IDate(date.str)] +library(data.table) +setkey(Archive.dt, IDate) +Archive.dt +``` + +Next, we define a grid of dates corresponding to every ten years. + +```{r} +(grid.dt <- setkey(data.table( + grid.IDate=c( + as.IDate("2006-04-14"), + as.IDate("2011-04-14"), + seq( + as.IDate("2016-04-14"), + as.IDate("2026-04-14"), + length.out=11))))) +``` + +Next, we do a rolling join to find which releases are nearest the 10 year grid. + +```{r} +(nearest.dt <- Archive.dt[grid.dt, .( + file, version, package, + release=x.IDate, + grid=i.grid.IDate +), roll="nearest"]) +``` + +Next, we download the old package sources from the Archive, and extract the Author field of DESCRIPTION. 
+ +```{r} +desc.dt <- nearest.dt[, { + cache.dir <- "~/Archive" + dir.create(cache.dir, showWarnings = FALSE) + dt.tar.gz <- file.path(cache.dir, file) + if(!file.exists(dt.tar.gz)) + download.file(paste0(Archive, package, "/", file), dt.tar.gz) + conn <- gzfile(dt.tar.gz, "b") + DESCRIPTION <- file.path(package, "DESCRIPTION") + untar(conn, files=DESCRIPTION) + close(conn) + .(Author=read.dcf(DESCRIPTION)[,"Author"]) +}, by=.(version, release)] +``` + +The output above seems to have extra newlines, which we remove below: + +```{r} +desc.dt[, no.newlines := gsub("\n", " ", Author)] +cat(paste(desc.dt$no.newlines,collapse="\n")) +``` + +The output above has one line of comma-separated authors per ten year release. +We would like to convert it to a table with one year per author. +A simple approach would be + +```{r} +head(sapply(strsplit(desc.dt$no.newlines, ", "), head)) +``` + +It is clear that the result above does not quite work (Matt’s info is broken into the first two entries). +Instead we can use + +```{r} +author.pattern <- list( + name=".+?", + nc::quantifier( + " \\[", + roles=".+?", + "\\]", + "?"), + nc::quantifier( + " \\(", + paren=".+?", + "\\)", + "?"), + ## each author ends with a comma, or the end of the string (\z). + nc::alternatives(" with (?:many )?contributions from ", ", ", "\\z")) +(author.dt <- desc.dt[, nc::capture_all_str( + no.newlines, author.pattern +), by=.(version, release)]) +``` + +The table above has one row for each person who appears in the Author field. +We will analyze the roles. + +```{r} +author.dt[roles==""] +``` + +We see some old etries above with missing roles, which we fill in below. + +```{r} +author.dt[grepl("Dowle|Srinivasan", name), roles := "aut"] +author.dt[roles=="", roles := "ctb"] +author.dt[roles=="aut, cre", roles := "aut"] +(count.dt <- author.dt[, .(people=.N), by=.(release, version, roles)]) +``` + +How has this evolved in the past ten years? 
+ +```{r} +library(ggplot2) +pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) +space.cm <- 0.2 +gg <- ggplot(count.dt, aes( + release, people, color=roles))+ + geom_line()+ + geom_point()+ + scale_x_date(breaks="year")+ + scale_y_log10(limits=c(0.2,500))+ + directlabels::geom_dl(aes( + label=sprintf("%s\n%s", version, pp(people))), + data=count.dt[roles=="ctb"], + method=list(directlabels::dl.trans(y=y+space.cm), "top.polygons"))+ + directlabels::geom_dl(aes( + label=sprintf("%s\n%s", pp(people), version)), + data=count.dt[roles=="aut"], + method=list(directlabels::dl.trans(y=y-space.cm), "bottom.polygons")) +directlabels::direct.label(gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons")) +``` + +The figure above shows that the number of authors and contributors has greatly expanded in the second decade of `data.table`. +I’m looking forward to the third decade! + +# Update of the previous blog + +The rest of this post is copied from [my previous post](https://tdhock.github.io/blog/2022/release-history/), with an update based on recent data. + +## Analyze several packages for comparison + +The code below defines a set of six packages for which we would like +to analyze the release history (tidyverse and deprecated packages for comparison). + +```{r} +compare.pkg.dt <- rbind( + data.table(project="tidyverse", Package=c("readr","tidyr","dplyr")), + data.table(project="deprecated", Package=c("reshape2", "plyr")), + data.table(project="data.table", Package="data.table")) +``` + +In the code below, we do the same thing for each package, + +```{r} +(release.dt <- compare.pkg.dt[, { + Archive.pkg <- get_Archive(Package) + nc::capture_all_str(Archive.pkg, Archive.pattern) +}, by=names(compare.pkg.dt)]) +``` + +The result above is a data table with one row for each package +version. Note that the code sets `by` to all column names, so that the +code is run for each row/package. 
+ +## Add columns for plotting + +For plotting we add a few more columns, + +```{r} +release.dt[, `:=`( + IDate = as.IDate(date.str), + year = as.integer(sub("-.*", "", date.str)), + package = factor(Package, compare.pkg.dt$Package), + Project = paste0('\n', project))] +setkey(release.dt, Project, Package, IDate) +release.dt +``` + +To explain the new columns above, + +* `IDate` is for the date to display on the X axis, +* `year` is for labeling the first released version each year, +* `package` is for displaying the Y axis in a particular order + (defined by the factor levels), +* `Project` is for the facet/panel titles (newline so that minimal + vertical space is used). + +## Basic plot + +The code below creates a basic version history plot, + +```{r points} +(gg.points <- ggplot()+ + theme( + axis.text.x=element_text(hjust=1, angle=40))+ + facet_grid(Project ~ ., labeller=label_both, scales="free")+ + geom_point(aes( + IDate, package), + shape=1, + data=release.dt)+ + scale_x_date("Date", breaks="year")) +``` + +The plot above shows a point for every release to CRAN, so you can see +the distribution of releases over time. + +## Add direct labels + +Before plotting we make a new table which contains only the first +release of `data.table` in each year (for direct labels), + +```{r} +(labeled.releases <- release.dt[Package=="data.table", .SD[1], by=year]) +``` + +```{r points-labels} +gg.points+ + directlabels::geom_dl(aes( + IDate, package, label=paste0(year, "\n", version)), + method=list( + cex=0.7, + directlabels::polygon.method( + "top", offset.cm=0.2, custom.colors=list( + colour="white", + box.color="black", + text.color="black"))), + data=labeled.releases) +``` + +The plot above shows a label for the first version released each year. 
+ +## Releases per year + +One way to compute releases per year would be to add up the total +number of releases, then divide by the number of years, + +```{r release-history} +(overall.stats <- dcast( + release.dt, + project + Package ~ ., + list(min,max,length), + value.var="year" +)[, releases.per.year := year_length/(year_max-year_min+1)][]) +``` + +Another way to do it would be to compute the number of releases in +each year since the release of the package. To do that we first +compute, for each package, a set of years for which we want to count +releases. + +```{r} +(max.year <- max(release.dt$year)) +(years.since.release <- release.dt[, .( + year=seq(min(year), max.year) +), by=.(Project, project, Package, package)]) +``` + +Then we can do a join and summarize to count the number of releases in +each year, for each package, + +```{r} +(releases.per.year <- release.dt[years.since.release, .( + N=as.numeric(.N) +), on=.NATURAL, by=.EACHI]) +``` + +Note that `on=.NATURAL` above means to join on the common columns +between the two tables, and `by=.EACHI` means to compute a summary for +each value specified in `i` (the first argument in the square bracket). +We can plot these data as a heat map via + +```{r heatMap} +ggplot()+ + theme_bw()+ + theme(panel.spacing=grid::unit(0, "lines"))+ + geom_tile(aes( + year, package, fill=log(N+1)), + data=releases.per.year)+ + geom_text(aes( + year, package, label=N), + data=releases.per.year)+ + facet_grid(Project ~ ., labeller=label_both, scales="free", space="free")+ + scale_fill_gradient("releases\n(log scale)", low="white", high="red")+ + scale_x_continuous(breaks=seq(2006, 2022, by=2))+ + coord_cartesian(expand=FALSE) +``` + +The heat map above shows a summarized display of the release data we +saw earlier in the dot plot. 
+ +Next, we can apply a list of summary functions over all of the yearly +counts, for each package, via + +```{r} +(per.year.stats <- dcast( + releases.per.year, + project + Package ~ ., + list(min, max, mean, sd, length), + value.var = "N")) +``` + +Finally, the code below creates a table to compare the two different ways of +computing the number of releases per year, + +```{r} +per.year.stats[overall.stats, .( + Package, + mean.per.year=N_mean, + overall.mean=releases.per.year +), on="Package"] +``` + +The table above shows similar numbers for the two methods of computing +the number of releases per year. + +## Conclusion + +We have shown how to download CRAN package release data, how to parse +the web pages using the `nc` package and regular expressions, how to +summarize/analyze using `data.table`, and how to visualize using +`ggplot2`. From 20c5d28bd0f9b578936351c389fa824e8848f681 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 02:29:33 -0400 Subject: [PATCH 02/11] move up library --- posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index 6525d70..ae9b91c 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -95,8 +95,8 @@ Above the table shows all matches, in the same order as the original Archive web page. Below we key the table by date, which sorts and enables fast joins. 
```{r} -Archive.dt[, IDate := as.IDate(date.str)] library(data.table) +Archive.dt[, IDate := as.IDate(date.str)] setkey(Archive.dt, IDate) Archive.dt ``` From 75a6c7c2713b34b14bccf1f31487c1e69b225d0d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 02:41:19 -0400 Subject: [PATCH 03/11] angle,w=12 --- posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index ae9b91c..a55fa0e 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -10,7 +10,7 @@ image: "copy_on_modify.png" ```{r Ropts, echo=FALSE} knitr::opts_chunk$set( dpi=100, - fig.width=9, + fig.width=12, fig.height=4) ``` @@ -203,6 +203,8 @@ pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) space.cm <- 0.2 gg <- ggplot(count.dt, aes( release, people, color=roles))+ + theme( + axis.text.x=element_text(hjust=1, angle=40))+ geom_line()+ geom_point()+ scale_x_date(breaks="year")+ From 2fe1826b09acc5b23ae162ed673abf66017bf34f Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 12:34:46 -0400 Subject: [PATCH 04/11] rm date.str --- .../index.qmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index a55fa0e..2ca8870 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -12,6 +12,8 @@ knitr::opts_chunk$set( dpi=100, fig.width=12, fig.height=4) +options( + datatable.print.nrows=20) ``` Happy birthday, `data.table`! 
@@ -68,11 +70,12 @@ nc::capture_all_str(Archive.data.table, file.pattern) Next, we add to the pattern to match the release date, ```{r} +library(data.table) Archive.pattern <- list( file=file.pattern, "", "\\s+", - date.str=".*?", + IDate=".*?", as.IDate, "\\s") ``` @@ -81,8 +84,8 @@ The code above has * `file=file.pattern` which means to apply the previous regex, and put the matching text in the `file` column, * `""` which matches the closing `` tag * `"\\s+"` which matches one or more white space characters, -* `date.str=".*?"` which matches zero or more characters (non-greedy, - as few as possible), and output them in the `date.str` column, +* `IDate=".*?", as.IDate,` which matches zero or more characters (non-greedy, + as few as possible), then use `as.IDate` to convert the text to efficient integer date, saved in the `IDate` column, * `"\\s"` means to match one white space character. The end result is a table with one row for each matched package version, and one column for each of the named arguments: @@ -95,8 +98,6 @@ Above the table shows all matches, in the same order as the original Archive web Below we key the table by date, which sorts and enables fast joins. 
```{r} -library(data.table) -Archive.dt[, IDate := as.IDate(date.str)] setkey(Archive.dt, IDate) Archive.dt ``` @@ -258,8 +259,7 @@ For plotting we add a few more columns, ```{r} release.dt[, `:=`( - IDate = as.IDate(date.str), - year = as.integer(sub("-.*", "", date.str)), + year = as.integer(sub("-.*", "", IDate)), package = factor(Package, compare.pkg.dt$Package), Project = paste0('\n', project))] setkey(release.dt, Project, Package, IDate) From 5bec46b70ce55494fc9671fd228b81fb0833ae33 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 13:58:17 -0400 Subject: [PATCH 05/11] comments --- .../index.qmd | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index 2ca8870..37f768b 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -95,60 +95,69 @@ The end result is a table with one row for each matched package version, and one ``` Above the table shows all matches, in the same order as the original Archive web page. -Below we key the table by date, which sorts and enables fast joins. +Below we key the table by date, which sorts the data in place (without allocating any new memory), and enables fast joins. ```{r} setkey(Archive.dt, IDate) Archive.dt ``` -Next, we define a grid of dates corresponding to every ten years. +We see the table above has been sorted by release date. +Next, we define a grid of dates which we will search for the nearest release. ```{r} +every.year.since.2016 <- seq( + as.IDate("2016-04-14"), + Sys.time(), + by="year") (grid.dt <- setkey(data.table( grid.IDate=c( - as.IDate("2006-04-14"), - as.IDate("2011-04-14"), - seq( - as.IDate("2016-04-14"), - as.IDate("2026-04-14"), - length.out=11))))) + as.IDate("2006-04-14"), # first release. + as.IDate("2011-04-14"), # fifth anniversary. 
+ every.year.since.2016)))) ``` -Next, we do a rolling join to find which releases are nearest the 10 year grid. +The code above sets the key of the grid, which sorts and enables fast joins. +No variables were specified to `setkey()`; the default is to use all columns, in this case just one. +Note that `setkey()` sets the key by reference, then returns the table. + +Next, we do a rolling join to find which releases are nearest to each date in the grid. ```{r} -(nearest.dt <- Archive.dt[grid.dt, .( - file, version, package, - release=x.IDate, - grid=i.grid.IDate -), roll="nearest"]) +(nearest.dt <- unique(Archive.dt[grid.dt, .( + file, version, package, release=x.IDate +), roll="nearest"])) ``` -Next, we download the old package sources from the Archive, and extract the Author field of DESCRIPTION. +The output above shows one row per release we will analyze. +For each release, we download the package sources from the Archive, and extract the Author field of DESCRIPTION. ```{r} desc.dt <- nearest.dt[, { cache.dir <- "~/Archive" dir.create(cache.dir, showWarnings = FALSE) dt.tar.gz <- file.path(cache.dir, file) - if(!file.exists(dt.tar.gz)) - download.file(paste0(Archive, package, "/", file), dt.tar.gz) + if(!file.exists(dt.tar.gz)){ + url.tar.gz <- paste0(Archive, package, "/", file) + download.file(url.tar.gz, dt.tar.gz) + } conn <- gzfile(dt.tar.gz, "b") DESCRIPTION <- file.path(package, "DESCRIPTION") untar(conn, files=DESCRIPTION) close(conn) - .(Author=read.dcf(DESCRIPTION)[,"Author"]) + as.data.table(read.dcf(DESCRIPTION)[,"Author",drop=FALSE]) }, by=.(version, release)] +cat_head <- function(x)cat(head(x),sep="\n-----------\n") +cat_head(desc.dt$Author) ``` -The output above seems to have extra newlines, which we remove below: +We see above that the `Author` field can contain newlines, which we remove below, to make later parsing easier: ```{r} -desc.dt[, no.newlines := gsub("\n", " ", Author)] -cat(paste(desc.dt$no.newlines,collapse="\n")) +desc.dt[, 
no.newlines := gsub("\n", " ", Author)][, cat_head(no.newlines)] ``` +We see above that the new column has no newlines. The output above has one line of comma-separated authors per ten year release. We would like to convert it to a table with one year per author. A simple approach would be @@ -173,14 +182,14 @@ author.pattern <- list( paren=".+?", "\\)", "?"), - ## each author ends with a comma, or the end of the string (\z). + ## each author ends with one of these (\z means end of string). nc::alternatives(" with (?:many )?contributions from ", ", ", "\\z")) (author.dt <- desc.dt[, nc::capture_all_str( no.newlines, author.pattern ), by=.(version, release)]) ``` -The table above has one row for each person who appears in the Author field. +The table above has one row for each time a person appears in the Author field of one of the releases. We will analyze the roles. ```{r} From 951204ed7fd13eef8d29259baf597fb7b0ac59fb Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 14:43:35 -0400 Subject: [PATCH 06/11] fcase --- .../index.qmd | 75 +++++++++++++------ 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index 37f768b..c4ac0f5 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -196,38 +196,63 @@ We will analyze the roles. author.dt[roles==""] ``` -We see some old etries above with missing roles, which we fill in below. +We see some old entries above with missing roles, which we fill in below. 
```{r} -author.dt[grepl("Dowle|Srinivasan", name), roles := "aut"] -author.dt[roles=="", roles := "ctb"] -author.dt[roles=="aut, cre", roles := "aut"] -(count.dt <- author.dt[, .(people=.N), by=.(release, version, roles)]) +linewidth.values <- c( + ctb=2, + aut=1) +author.dt[ +, Role := factor(fcase( + roles=="aut, cre" | grepl("Dowle|Srinivasan", name), "aut", + roles=="", "ctb", + default=roles), names(linewidth.values)) +][ +, table(roles, Role, useNA="always") +] +``` + +Above we use `fcase()` to create a new `Role` column, with factor levels in a non-default order (to control legend entry display order below). +Then we chain square brackets to display a table which shows how `roles` values are mapped to `Role`. +The counts look reasonable, so the next step is to count how many people with each role in each release: + +```{r} +(count.dt <- author.dt[, .(people=.N), by=.(release, version, Role)]) ``` How has this evolved in the past ten years? ```{r} library(ggplot2) -pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) -space.cm <- 0.2 gg <- ggplot(count.dt, aes( - release, people, color=roles))+ + release, people, color=Role))+ theme( axis.text.x=element_text(hjust=1, angle=40))+ - geom_line()+ - geom_point()+ - scale_x_date(breaks="year")+ - scale_y_log10(limits=c(0.2,500))+ + geom_line(aes(linewidth=Role))+ + geom_point(shape=21, fill="white")+ + scale_x_date(breaks="year", )+ + scale_linewidth_manual(values=linewidth.values)+ + scale_y_log10(limits=c(0.2, 500)) +gg +``` + +Above we see a time series showing the increasing authors and contributors over time. 
+To emphasize the values at each release, we add direct labels below: + +```{r} +pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) +space.cm <- 0.2 +cex <- 0.7 +directlabels::direct.label( + gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+ directlabels::geom_dl(aes( label=sprintf("%s\n%s", version, pp(people))), - data=count.dt[roles=="ctb"], - method=list(directlabels::dl.trans(y=y+space.cm), "top.polygons"))+ + data=count.dt[Role=="ctb"], + method=list(directlabels::dl.trans(cex=cex, y=y+space.cm), "top.polygons"))+ directlabels::geom_dl(aes( label=sprintf("%s\n%s", pp(people), version)), - data=count.dt[roles=="aut"], - method=list(directlabels::dl.trans(y=y-space.cm), "bottom.polygons")) -directlabels::direct.label(gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons")) + data=count.dt[Role=="aut"], + method=list(directlabels::dl.trans(cex=cex, y=y-space.cm), "bottom.polygons")) ``` The figure above shows that the number of authors and contributors has greatly expanded in the second decade of `data.table`. @@ -369,18 +394,26 @@ each value specified in `i` (the first argument in the square bracket). 
We can plot these data as a heat map via ```{r heatMap} +this.year <- as.integer(strftime(Sys.time(), "%Y")) ggplot()+ theme_bw()+ - theme(panel.spacing=grid::unit(0, "lines"))+ + theme( + panel.spacing=grid::unit(0, "lines"), + axis.text.x=element_text(hjust=1, angle=40))+ geom_tile(aes( - year, package, fill=log(N+1)), + year, package, fill=N), data=releases.per.year)+ geom_text(aes( year, package, label=N), data=releases.per.year)+ facet_grid(Project ~ ., labeller=label_both, scales="free", space="free")+ - scale_fill_gradient("releases\n(log scale)", low="white", high="red")+ - scale_x_continuous(breaks=seq(2006, 2022, by=2))+ + scale_fill_gradient( + "releases", + low="white", + high="red", + breaks=c(0, 2^seq(0, 4)), + transform=scales::transform_log1p())+ + scale_x_continuous(breaks=seq(2006, this.year))+ coord_cartesian(expand=FALSE) ``` From f4f5416998598b4204139f2c7da2191328c238d7 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 14:57:19 -0400 Subject: [PATCH 07/11] prop --- posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index c4ac0f5..f77fd4d 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -227,10 +227,11 @@ library(ggplot2) gg <- ggplot(count.dt, aes( release, people, color=Role))+ theme( + panel.grid.minor=element_blank(), axis.text.x=element_text(hjust=1, angle=40))+ geom_line(aes(linewidth=Role))+ geom_point(shape=21, fill="white")+ - scale_x_date(breaks="year", )+ + scale_x_date(breaks="year")+ scale_linewidth_manual(values=linewidth.values)+ scale_y_log10(limits=c(0.2, 500)) gg @@ -241,10 +242,17 @@ To emphasize the values at each release, we add direct labels below: ```{r} pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) 
+prop <- 0.1 space.cm <- 0.2 cex <- 0.7 directlabels::direct.label( gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+ + scale_x_date( + breaks=grid.dt$grid.IDate, + limits=grid.dt[, { + i <- as.integer(grid.IDate) + as.IDate(c(min(i), (1+prop)*max(i)-prop*min(i))) + }])+ directlabels::geom_dl(aes( label=sprintf("%s\n%s", version, pp(people))), data=count.dt[Role=="ctb"], From 77f6c897c2171789691b378331ce96b0df53f534 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 15:00:38 -0400 Subject: [PATCH 08/11] comments --- posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index f77fd4d..df212de 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -242,9 +242,12 @@ To emphasize the values at each release, we add direct labels below: ```{r} pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) +## To define upper limit of X scale, we use prop. +## prop=0 means no extra space. +## prop=0.1 means 10% more space, etc. prop <- 0.1 -space.cm <- 0.2 -cex <- 0.7 +space.cm <- 0.2 # offset of direct labels from data. +cex <- 0.7 # text size of direct labels. 
directlabels::direct.label( gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+ scale_x_date( From 56e3cd44144032f95b81b76d445d17911dd48a9f Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 15:17:39 -0400 Subject: [PATCH 09/11] poly.method --- .../index.qmd | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index df212de..36ca56f 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -246,8 +246,14 @@ pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people")) ## prop=0 means no extra space. ## prop=0.1 means 10% more space, etc. prop <- 0.1 -space.cm <- 0.2 # offset of direct labels from data. -cex <- 0.7 # text size of direct labels. +space.cm <- 0.2 # space between polygon point and data point. +poly.method <- function(position, direction)substitute(list( + directlabels::dl.trans( + cex=0.7, # text size of direct labels. + y=y+YSPACE), + directlabels::polygon.method( + POSITION, offset.cm=0.5)), #space between polygon point and text. 
+ list(YSPACE=direction*space.cm, POSITION=position)) directlabels::direct.label( gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+ scale_x_date( @@ -259,11 +265,11 @@ directlabels::direct.label( directlabels::geom_dl(aes( label=sprintf("%s\n%s", version, pp(people))), data=count.dt[Role=="ctb"], - method=list(directlabels::dl.trans(cex=cex, y=y+space.cm), "top.polygons"))+ + method=poly.method("top", 1))+ directlabels::geom_dl(aes( label=sprintf("%s\n%s", pp(people), version)), data=count.dt[Role=="aut"], - method=list(directlabels::dl.trans(cex=cex, y=y-space.cm), "bottom.polygons")) + method=poly.method("bottom", -1)) ``` The figure above shows that the number of authors and contributors has greatly expanded in the second decade of `data.table`. From 83c1669acc28c672527b9f05abf4bc08a31bdcad Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 17 Apr 2026 15:43:21 -0400 Subject: [PATCH 10/11] ggtitle --- posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index 36ca56f..ae1fb43 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -157,16 +157,15 @@ We see above that the `Author` field can contain newlines, which we remove below desc.dt[, no.newlines := gsub("\n", " ", Author)][, cat_head(no.newlines)] ``` -We see above that the new column has no newlines. -The output above has one line of comma-separated authors per ten year release. -We would like to convert it to a table with one year per author. +The output above has one line of comma-separated authors per release. +We would like to convert these data to a table with one year per author. 
A simple approach would be ```{r} head(sapply(strsplit(desc.dt$no.newlines, ", "), head)) ``` -It is clear that the result above does not quite work (Matt’s info is broken into the first two entries). +It is clear that the result above does not quite work (Matt’s `aut, cre` role contains a comma so is broken into two entries). Instead we can use ```{r} @@ -213,7 +212,7 @@ author.dt[ ``` Above we use `fcase()` to create a new `Role` column, with factor levels in a non-default order (to control legend entry display order below). -Then we chain square brackets to display a table which shows how `roles` values are mapped to `Role`. +Then we chain square brackets to display a table which shows how `roles` are mapped to `Role`. The counts look reasonable, so the next step is to count how many people with each role in each release: ```{r} @@ -226,6 +225,7 @@ How has this evolved in the past ten years? library(ggplot2) gg <- ggplot(count.dt, aes( release, people, color=Role))+ + ggtitle("data.table contributor and author counts for selected releases")+ theme( panel.grid.minor=element_blank(), axis.text.x=element_text(hjust=1, angle=40))+ From 7ba5b0a3021cec1590aa86f82e40916800a73703 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 20 Apr 2026 22:32:58 -0400 Subject: [PATCH 11/11] options --- .../index.qmd | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd index ae1fb43..0edbb71 100644 --- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd +++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd @@ -12,8 +12,6 @@ knitr::opts_chunk$set( dpi=100, fig.width=12, fig.height=4) -options( - datatable.print.nrows=20) ``` Happy birthday, `data.table`! 
@@ -64,6 +62,7 @@ The code above specifies a regular expression: Below we use that pattern to convert the web page into a data table with two columns, ```{r} +options(datatable.print.nrows=20) # instead of default 100. nc::capture_all_str(Archive.data.table, file.pattern) ``` @@ -147,17 +146,24 @@ desc.dt <- nearest.dt[, { close(conn) as.data.table(read.dcf(DESCRIPTION)[,"Author",drop=FALSE]) }, by=.(version, release)] -cat_head <- function(x)cat(head(x),sep="\n-----------\n") -cat_head(desc.dt$Author) ``` -We see above that the `Author` field can contain newlines, which we remove below, to make later parsing easier: +To avoid printing the full Author column (a long string), we can set an option: + +```{r} +options( + datatable.prettyprint.char=30, # print ... after this many characters. + width=100) # max characters before wrapping columns to next line. +desc.dt +``` + +We see above that the `Author` field can contain newlines (after the comma), which we remove below, to make later parsing easier: ```{r} -desc.dt[, no.newlines := gsub("\n", " ", Author)][, cat_head(no.newlines)] +desc.dt[, no.newlines := gsub("\n", " ", Author)][] ``` -The output above has one line of comma-separated authors per release. +The output above has a new column of comma-separated authors per release (with no newlines). We would like to convert these data to a table with one year per author. A simple approach would be