From 16a1de2c4d065183716bb1cf36dc8286461a7bc7 Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.hocking@r-project.org>
Date: Fri, 17 Apr 2026 02:21:18 -0400
Subject: [PATCH 01/11] add birthday post

---
 .../index.qmd                                 | 409 ++++++++++++++++++
 1 file changed, 409 insertions(+)
 create mode 100644 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
new file mode 100644
index 0000000..6525d70
--- /dev/null
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -0,0 +1,409 @@
+---
+title: "Happy birthday, data.table"
+author: "Toby Hocking"
+date: "2026-04-15"
+categories: [tips, tutorials, developer]
+draft: false
+image: "copy_on_modify.png"
+---
+
+```{r Ropts, echo=FALSE}
+knitr::opts_chunk$set(
+  dpi=100,
+  fig.width=9,
+  fig.height=4)
+```
+
+Happy birthday, `data.table`!
+
+# New analysis of authors and contributors
+
+Since this is the 20th anniversary of Matt’s original CRAN submission, I wanted to do some analysis of contributors over time, to emphasize the great community that has been working to improve `data.table` in recent years.
+To do that, we first download data on all releases, using code from [my previous post](https://tdhock.github.io/blog/2022/release-history/) about the release history of `data.table`.
+
+## Download Archive web page
+
+We can download the Archive web page for `data.table` via the code below,
+
+```{r}
+Archive <- "https://cloud.r-project.org/src/contrib/Archive/"
+get_Archive <- function(Package, releases.dir="~/releases"){
+  dir.create(releases.dir, showWarnings = FALSE)
+  pkg.html <- file.path(releases.dir, paste0(Package, ".html"))
+  if(!file.exists(pkg.html)){
+    u <- paste0(Archive, Package)
+    download.file(u, pkg.html)
+  }
+  readLines(pkg.html)
+}
+(Archive.data.table <- get_Archive("data.table"))
+```
+
+The output above shows that the Archive web page has a regular structure, which we can convert into a data table using the regular expression pattern below.
+
+```{r}
+file.pattern <- list(
+  '(?<=>)',
+  package=".*?",
+  "_",
+  version="[0-9.-]+",
+  "[.]tar[.]gz")
+```
+
+The code above specifies a regular expression:
+
+* `'(?<=>)'` is a lookbehind assertion. It means to start by looking for a greater than sign, but not including that character in the match.
+* `package=".*?"` means to match zero or more of anything except newline (non-greedy, as few as possible), and output the match in the `package` column,
+* `"_"` means to match an underscore (which separates the package name from the version),
+* `version="[0-9.-]+"` means to match one or more digits/dots/dashes, and
+  output them in the `version` column,
+* `"[.]tar[.]gz"` means to match the `.tar.gz` file name suffix.
+
+Below we use that pattern to convert the web page into a data table with two columns,
+
+```{r}
+nc::capture_all_str(Archive.data.table, file.pattern)
+```
+
+Next, we add to the pattern to match the release date,
+
+```{r}
+Archive.pattern <- list(
+  file=file.pattern,
+  "</a>",
+  "\\s+",
+  date.str=".*?",
+  "\\s")
+```
+
+The code above has
+
+* `file=file.pattern` which means to apply the previous regex, and put the matching text in the `file` column,
+* `"</a>"` which matches the closing `</a>` tag
+* `"\\s+"` which matches one or more white space characters,
+* `date.str=".*?"` which matches zero or more characters (non-greedy,
+  as few as possible), and output them in the `date.str` column,
+* `"\\s"` means to match one white space character.
+
+The end result is a table with one row for each matched package version, and one column for each of the named arguments:
+
+```{r}
+(Archive.dt <- nc::capture_all_str(Archive.data.table, Archive.pattern))
+```
+
+Above the table shows all matches, in the same order as the original Archive web page.
+Below we key the table by date, which sorts and enables fast joins.
+
+```{r}
+Archive.dt[, IDate := as.IDate(date.str)]
+library(data.table)
+setkey(Archive.dt, IDate)
+Archive.dt
+```
+
+Next, we define a grid of dates corresponding to every ten years.
+
+```{r}
+(grid.dt <- setkey(data.table(
+  grid.IDate=c(
+    as.IDate("2006-04-14"),
+    as.IDate("2011-04-14"),
+    seq(
+      as.IDate("2016-04-14"),
+      as.IDate("2026-04-14"),
+      length.out=11)))))
+```
+
+Next, we do a rolling join to find which releases are nearest the 10 year grid.
+
+```{r}
+(nearest.dt <- Archive.dt[grid.dt, .(
+  file, version, package,
+  release=x.IDate,
+  grid=i.grid.IDate
+), roll="nearest"])
+```
+
+Next, we download the old package sources from the Archive, and extract the Author field of DESCRIPTION.
+
+```{r}
+desc.dt <- nearest.dt[, {
+  cache.dir <- "~/Archive"
+  dir.create(cache.dir, showWarnings = FALSE)
+  dt.tar.gz <- file.path(cache.dir, file)
+  if(!file.exists(dt.tar.gz))
+    download.file(paste0(Archive, package, "/", file), dt.tar.gz)
+  conn <- gzfile(dt.tar.gz, "b")
+  DESCRIPTION <- file.path(package, "DESCRIPTION")
+  untar(conn, files=DESCRIPTION)
+  close(conn)
+  .(Author=read.dcf(DESCRIPTION)[,"Author"])
+}, by=.(version, release)]
+```
+
+The output above seems to have extra newlines, which we remove below:
+
+```{r}
+desc.dt[, no.newlines := gsub("\n", " ", Author)]
+cat(paste(desc.dt$no.newlines,collapse="\n"))
+```
+
+The output above has one line of comma-separated authors per ten year release.
+We would like to convert it to a table with one year per author.
+A simple approach would be
+
+```{r}
+head(sapply(strsplit(desc.dt$no.newlines, ", "), head))
+```
+
+It is clear that the result above does not quite work (Matt’s info is broken into the first two entries).
+Instead we can use
+
+```{r}
+author.pattern <- list(
+  name=".+?",
+  nc::quantifier(
+    " \\[",
+    roles=".+?",
+    "\\]",
+    "?"),
+  nc::quantifier(
+    " \\(", 
+    paren=".+?",
+    "\\)",
+    "?"),
+  ## each author ends with a comma, or the end of the string (\z).
+  nc::alternatives(" with (?:many )?contributions from ", ", ", "\\z"))
+(author.dt <- desc.dt[, nc::capture_all_str(
+  no.newlines, author.pattern
+), by=.(version, release)])
+```
+
+The table above has one row for each person who appears in the Author field.
+We will analyze the roles.
+
+```{r}
+author.dt[roles==""]
+```
+
+We see some old etries above with missing roles, which we fill in below.
+
+```{r}
+author.dt[grepl("Dowle|Srinivasan", name), roles := "aut"]
+author.dt[roles=="", roles := "ctb"]
+author.dt[roles=="aut, cre", roles := "aut"]
+(count.dt <- author.dt[, .(people=.N), by=.(release, version, roles)])
+```
+
+How has this evolved in the past ten years?
+
+```{r}
+library(ggplot2)
+pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
+space.cm <- 0.2
+gg <- ggplot(count.dt, aes(
+  release, people, color=roles))+
+  geom_line()+
+  geom_point()+
+  scale_x_date(breaks="year")+
+  scale_y_log10(limits=c(0.2,500))+
+  directlabels::geom_dl(aes(
+    label=sprintf("%s\n%s", version, pp(people))),
+    data=count.dt[roles=="ctb"],
+    method=list(directlabels::dl.trans(y=y+space.cm), "top.polygons"))+
+  directlabels::geom_dl(aes(
+    label=sprintf("%s\n%s", pp(people), version)),
+    data=count.dt[roles=="aut"],
+    method=list(directlabels::dl.trans(y=y-space.cm), "bottom.polygons"))
+directlabels::direct.label(gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))
+```
+
+The figure above shows that the number of authors and contributors has greatly expanded in the second decade of `data.table`.
+I’m looking forward to the third decade!
+
+# Update of the previous blog
+
+The rest of this post is copied from [my previous post](https://tdhock.github.io/blog/2022/release-history/), with an update based on recent data.
+
+## Analyze several packages for comparison
+
+The code below defines a set of six packages for which we would like
+to analyze the release history (tidyverse and deprecated packages for comparison).
+
+```{r}
+compare.pkg.dt <- rbind(
+  data.table(project="tidyverse", Package=c("readr","tidyr","dplyr")),
+  data.table(project="deprecated", Package=c("reshape2", "plyr")),
+  data.table(project="data.table", Package="data.table"))
+```
+
+In the code below, we do the same thing for each package, 
+
+```{r}
+(release.dt <- compare.pkg.dt[, {
+  Archive.pkg <- get_Archive(Package)
+  nc::capture_all_str(Archive.pkg, Archive.pattern)
+}, by=names(compare.pkg.dt)])
+```
+
+The result above is a data table with one row for each package
+version. Note that the code set `by` to all column names, so that the
+code is run for each row/package.
+
+## Add columns for plotting
+
+For plotting we add a few more columns,
+
+```{r}
+release.dt[, `:=`(
+  IDate = as.IDate(date.str),
+  year = as.integer(sub("-.*", "", date.str)),
+  package = factor(Package, compare.pkg.dt$Package),
+  Project = paste0('\n', project))]
+setkey(release.dt, Project, Package, IDate)
+release.dt
+```
+
+To explain the new columns above,
+
+* `IDate` is for the date to display on the X axis,
+* `year` is for labeling the first released version each year,
+* `package` is for displaying the Y axis in a particular order
+  (defined by the factor levels),
+* `Project` is for the facet/panel titles (newline so that minimal
+  vertical space is used).
+  
+## Basic plot
+
+The code below creates a basic version history plot,
+
+```{r points}
+(gg.points <- ggplot()+
+  theme(
+    axis.text.x=element_text(hjust=1, angle=40))+
+  facet_grid(Project ~ ., labeller=label_both, scales="free")+
+  geom_point(aes(
+    IDate, package),
+    shape=1,
+    data=release.dt)+
+  scale_x_date("Date", breaks="year"))
+```
+
+The plot above shows a point for every release to CRAN, so you can see
+the distribution of releases over time.
+
+## Add direct labels
+
+Before plotting we make a new table which contains only the first
+release of `data.table` in each year (for direct labels),
+
+```{r}
+(labeled.releases <- release.dt[Package=="data.table", .SD[1], by=year])
+```
+
+```{r points-labels}
+gg.points+
+  directlabels::geom_dl(aes(
+    IDate, package, label=paste0(year, "\n", version)),
+    method=list(
+      cex=0.7,
+      directlabels::polygon.method(
+        "top", offset.cm=0.2, custom.colors=list(
+          colour="white",
+          box.color="black",
+          text.color="black"))),
+    data=labeled.releases)
+```
+  
+The plot above shows a label for the first version released each year.
+
+## Releases per year
+
+One way to compute releases per year would be to add up the total
+number of releases, then divide by the number of years,
+
+```{r release-history}
+(overall.stats <- dcast(
+  release.dt, 
+  project + Package ~ ., 
+  list(min,max,length), 
+  value.var="year"
+)[, releases.per.year := year_length/(year_max-year_min+1)][])
+```
+
+Another way to do it would be to compute the number of releases in
+each year since the release of the package. To do that we first
+compute, for each package, a set of years for which we want to count
+releases.
+
+```{r}
+(max.year <- max(release.dt$year))
+(years.since.release <- release.dt[, .(
+  year=seq(min(year), max.year)
+), by=.(Project, project, Package, package)])
+```
+
+Then we can do a join and summarize to count the number of releases in
+each year, for each package,
+
+```{r}
+(releases.per.year <- release.dt[years.since.release, .(
+  N=as.numeric(.N)
+), on=.NATURAL, by=.EACHI])
+```
+
+Note that `on=.NATURAL` above means to join on the common columns
+between the two tables, and `by=.EACHI` means to compute a summary for
+each value specified in `i` (the first argument in the square bracket).
+We can plot these data as a heat map via
+
+```{r heatMap}
+ggplot()+
+  theme_bw()+
+  theme(panel.spacing=grid::unit(0, "lines"))+
+  geom_tile(aes(
+    year, package, fill=log(N+1)),
+    data=releases.per.year)+
+  geom_text(aes(
+    year, package, label=N),
+    data=releases.per.year)+
+  facet_grid(Project ~ ., labeller=label_both, scales="free", space="free")+
+  scale_fill_gradient("releases\n(log scale)", low="white", high="red")+
+  scale_x_continuous(breaks=seq(2006, 2022, by=2))+
+  coord_cartesian(expand=FALSE)
+```
+
+The heat map above shows a summarized display of the release data we
+saw earlier in the dot plot.
+
+Next, we can apply a list of summary functions over all of the yearly
+counts, for each package, via
+
+```{r}
+(per.year.stats <- dcast(
+  releases.per.year,
+  project + Package ~ .,
+  list(min, max, mean, sd, length),
+  value.var = "N"))
+```
+
+Finally, the code below creates a table to compare the two different ways of
+computing the number of releases per year,
+
+```{r}
+per.year.stats[overall.stats, .( # join the two summaries; one row per Package.
+  Package, 
+  overall.mean=releases.per.year, # total releases / years spanned (from overall.stats).
+  mean.per.year=N_mean # mean of the yearly release counts (from per.year.stats).
+), on="Package"]
+```
+
+The table above shows similar numbers for the two methods of computing
+the number of releases per year.
+
+## Conclusion
+
+We have shown how to download CRAN package release data, how to parse
+the web pages using the `nc` package and regular expressions, how to
+summarize/analyze using `data.table`, and how to visualize using
+`ggplot2`.

From 20c5d28bd0f9b578936351c389fa824e8848f681 Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.hocking@r-project.org>
Date: Fri, 17 Apr 2026 02:29:33 -0400
Subject: [PATCH 02/11] move up library

---
 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index 6525d70..ae9b91c 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -95,8 +95,8 @@ Above the table shows all matches, in the same order as the original Archive web
 Below we key the table by date, which sorts and enables fast joins.
 
 ```{r}
-Archive.dt[, IDate := as.IDate(date.str)]
 library(data.table)
+Archive.dt[, IDate := as.IDate(date.str)]
 setkey(Archive.dt, IDate)
 Archive.dt
 ```

From 75a6c7c2713b34b14bccf1f31487c1e69b225d0d Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.hocking@r-project.org>
Date: Fri, 17 Apr 2026 02:41:19 -0400
Subject: [PATCH 03/11] angle,w=12

---
 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index ae9b91c..a55fa0e 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -10,7 +10,7 @@ image: "copy_on_modify.png"
 ```{r Ropts, echo=FALSE}
 knitr::opts_chunk$set(
   dpi=100,
-  fig.width=9,
+  fig.width=12,
   fig.height=4)
 ```
 
@@ -203,6 +203,8 @@ pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
 space.cm <- 0.2
 gg <- ggplot(count.dt, aes(
   release, people, color=roles))+
+  theme(
+    axis.text.x=element_text(hjust=1, angle=40))+
   geom_line()+
   geom_point()+
   scale_x_date(breaks="year")+

From 2fe1826b09acc5b23ae162ed673abf66017bf34f Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 12:34:46 -0400
Subject: [PATCH 04/11] rm date.str

---
 .../index.qmd                                      | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index a55fa0e..2ca8870 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -12,6 +12,8 @@ knitr::opts_chunk$set(
   dpi=100,
   fig.width=12,
   fig.height=4)
+options(
+  datatable.print.nrows=20)
 ```
 
 Happy birthday, `data.table`!
@@ -68,11 +70,12 @@ nc::capture_all_str(Archive.data.table, file.pattern)
 Next, we add to the pattern to match the release date,
 
 ```{r}
+library(data.table)
 Archive.pattern <- list(
   file=file.pattern,
   "</a>",
   "\\s+",
-  date.str=".*?",
+  IDate=".*?", as.IDate,
   "\\s")
 ```
 
@@ -81,8 +84,8 @@ The code above has
 * `file=file.pattern` which means to apply the previous regex, and put the matching text in the `file` column,
 * `"</a>"` which matches the closing `</a>` tag
 * `"\\s+"` which matches one or more white space characters,
-* `date.str=".*?"` which matches zero or more characters (non-greedy,
-  as few as possible), and output them in the `date.str` column,
+* `IDate=".*?", as.IDate,` which matches zero or more characters (non-greedy,
+  as few as possible), then uses `as.IDate` to convert the text to an efficient integer date, saved in the `IDate` column,
 * `"\\s"` means to match one white space character.
 
 The end result is a table with one row for each matched package version, and one column for each of the named arguments:
@@ -95,8 +98,6 @@ Above the table shows all matches, in the same order as the original Archive web
 Below we key the table by date, which sorts and enables fast joins.
 
 ```{r}
-library(data.table)
-Archive.dt[, IDate := as.IDate(date.str)]
 setkey(Archive.dt, IDate)
 Archive.dt
 ```
@@ -258,8 +259,7 @@ For plotting we add a few more columns,
 
 ```{r}
 release.dt[, `:=`(
-  IDate = as.IDate(date.str),
-  year = as.integer(sub("-.*", "", date.str)),
+  year = as.integer(sub("-.*", "", IDate)),
   package = factor(Package, compare.pkg.dt$Package),
   Project = paste0('\n', project))]
 setkey(release.dt, Project, Package, IDate)

From 5bec46b70ce55494fc9671fd228b81fb0833ae33 Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 13:58:17 -0400
Subject: [PATCH 05/11] comments

---
 .../index.qmd                                 | 55 +++++++++++--------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index 2ca8870..37f768b 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -95,60 +95,69 @@ The end result is a table with one row for each matched package version, and one
 ```
 
 Above the table shows all matches, in the same order as the original Archive web page.
-Below we key the table by date, which sorts and enables fast joins.
+Below we key the table by date, which sorts the data in place (without allocating any new memory), and enables fast joins.
 
 ```{r}
 setkey(Archive.dt, IDate)
 Archive.dt
 ```
 
-Next, we define a grid of dates corresponding to every ten years.
+We see the table above has been sorted by release date.
+Next, we define a grid of dates which we will search for the nearest release.
 
 ```{r}
+every.year.since.2016 <- seq(
+  as.IDate("2016-04-14"),
+  Sys.time(),
+  by="year")
 (grid.dt <- setkey(data.table(
   grid.IDate=c(
-    as.IDate("2006-04-14"),
-    as.IDate("2011-04-14"),
-    seq(
-      as.IDate("2016-04-14"),
-      as.IDate("2026-04-14"),
-      length.out=11)))))
+    as.IDate("2006-04-14"), # first release.
+    as.IDate("2011-04-14"), # fifth anniversary.
+    every.year.since.2016))))
 ```
 
-Next, we do a rolling join to find which releases are nearest the 10 year grid.
+The code above sets the key of the grid, which sorts and enables fast joins.
+No variables were specified to `setkey()`; the default is to use all columns, in this case just one.
+Note that `setkey()` sets the key by reference, then returns the table.
+
+Next, we do a rolling join to find which releases are nearest to each date in the grid.
 
 ```{r}
-(nearest.dt <- Archive.dt[grid.dt, .(
-  file, version, package,
-  release=x.IDate,
-  grid=i.grid.IDate
-), roll="nearest"])
+(nearest.dt <- unique(Archive.dt[grid.dt, .(
+  file, version, package, release=x.IDate
+), roll="nearest"]))
 ```
 
-Next, we download the old package sources from the Archive, and extract the Author field of DESCRIPTION.
+The output above shows one row per release we will analyze.
+For each release, we download the package sources from the Archive, and extract the Author field of DESCRIPTION.
 
 ```{r}
 desc.dt <- nearest.dt[, {
   cache.dir <- "~/Archive"
   dir.create(cache.dir, showWarnings = FALSE)
   dt.tar.gz <- file.path(cache.dir, file)
-  if(!file.exists(dt.tar.gz))
-    download.file(paste0(Archive, package, "/", file), dt.tar.gz)
+  if(!file.exists(dt.tar.gz)){
+    url.tar.gz <- paste0(Archive, package, "/", file)
+    download.file(url.tar.gz, dt.tar.gz)
+  }
   conn <- gzfile(dt.tar.gz, "b")
   DESCRIPTION <- file.path(package, "DESCRIPTION")
   untar(conn, files=DESCRIPTION)
   close(conn)
-  .(Author=read.dcf(DESCRIPTION)[,"Author"])
+  as.data.table(read.dcf(DESCRIPTION)[,"Author",drop=FALSE])
 }, by=.(version, release)]
+cat_head <- function(x)cat(head(x),sep="\n-----------\n")
+cat_head(desc.dt$Author)
 ```
 
-The output above seems to have extra newlines, which we remove below:
+We see above that the `Author` field can contain newlines, which we remove below, to make later parsing easier:
 
 ```{r}
-desc.dt[, no.newlines := gsub("\n", " ", Author)]
-cat(paste(desc.dt$no.newlines,collapse="\n"))
+desc.dt[, no.newlines := gsub("\n", " ", Author)][, cat_head(no.newlines)]
 ```
 
+We see above that the new column has no newlines.
 The output above has one line of comma-separated authors per ten year release.
 We would like to convert it to a table with one year per author.
 A simple approach would be
@@ -173,14 +182,14 @@ author.pattern <- list(
     paren=".+?",
     "\\)",
     "?"),
-  ## each author ends with a comma, or the end of the string (\z).
+  ## each author ends with one of these (\z means end of string).
   nc::alternatives(" with (?:many )?contributions from ", ", ", "\\z"))
 (author.dt <- desc.dt[, nc::capture_all_str(
   no.newlines, author.pattern
 ), by=.(version, release)])
 ```
 
-The table above has one row for each person who appears in the Author field.
+The table above has one row for each time a person appears in the Author field of one of the releases.
 We will analyze the roles.
 
 ```{r}

From 951204ed7fd13eef8d29259baf597fb7b0ac59fb Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 14:43:35 -0400
Subject: [PATCH 06/11] fcase

---
 .../index.qmd                                 | 75 +++++++++++++------
 1 file changed, 54 insertions(+), 21 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index 37f768b..c4ac0f5 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -196,38 +196,63 @@ We will analyze the roles.
 author.dt[roles==""]
 ```
 
-We see some old etries above with missing roles, which we fill in below.
+We see some old entries above with missing roles, which we fill in below.
 
 ```{r}
-author.dt[grepl("Dowle|Srinivasan", name), roles := "aut"]
-author.dt[roles=="", roles := "ctb"]
-author.dt[roles=="aut, cre", roles := "aut"]
-(count.dt <- author.dt[, .(people=.N), by=.(release, version, roles)])
+linewidth.values <- c(
+  ctb=2,
+  aut=1)
+author.dt[
+, Role := factor(fcase(
+  roles=="aut, cre" | grepl("Dowle|Srinivasan", name), "aut",
+  roles=="", "ctb",
+  default=roles), names(linewidth.values))
+][
+, table(roles, Role, useNA="always")
+]
+```
+
+Above we use `fcase()` to create a new `Role` column, with factor levels in a non-default order (to control legend entry display order below).
+Then we chain square brackets to display a table which shows how `roles` values are mapped to `Role`.
+The counts look reasonable, so the next step is to count how many people with each role in each release:
+
+```{r}
+(count.dt <- author.dt[, .(people=.N), by=.(release, version, Role)])
 ```
 
 How has this evolved in the past ten years?
 
 ```{r}
 library(ggplot2)
-pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
-space.cm <- 0.2
 gg <- ggplot(count.dt, aes(
-  release, people, color=roles))+
+  release, people, color=Role))+
   theme(
     axis.text.x=element_text(hjust=1, angle=40))+
-  geom_line()+
-  geom_point()+
-  scale_x_date(breaks="year")+
-  scale_y_log10(limits=c(0.2,500))+
+  geom_line(aes(linewidth=Role))+
+  geom_point(shape=21, fill="white")+
+  scale_x_date(breaks="year", )+
+  scale_linewidth_manual(values=linewidth.values)+
+  scale_y_log10(limits=c(0.2, 500))
+gg
+```
+
+Above we see a time series showing the increasing numbers of authors and contributors over time.
+To emphasize the values at each release, we add direct labels below:
+
+```{r}
+pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
+space.cm <- 0.2
+cex <- 0.7
+directlabels::direct.label(
+  gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+
   directlabels::geom_dl(aes(
     label=sprintf("%s\n%s", version, pp(people))),
-    data=count.dt[roles=="ctb"],
-    method=list(directlabels::dl.trans(y=y+space.cm), "top.polygons"))+
+    data=count.dt[Role=="ctb"],
+    method=list(directlabels::dl.trans(cex=cex, y=y+space.cm), "top.polygons"))+
   directlabels::geom_dl(aes(
     label=sprintf("%s\n%s", pp(people), version)),
-    data=count.dt[roles=="aut"],
-    method=list(directlabels::dl.trans(y=y-space.cm), "bottom.polygons"))
-directlabels::direct.label(gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))
+    data=count.dt[Role=="aut"],
+    method=list(directlabels::dl.trans(cex=cex, y=y-space.cm), "bottom.polygons"))
 ```
 
 The figure above shows that the number of authors and contributors has greatly expanded in the second decade of `data.table`.
@@ -369,18 +394,26 @@ each value specified in `i` (the first argument in the square bracket).
 We can plot these data as a heat map via
 
 ```{r heatMap}
+this.year <- as.integer(strftime(Sys.time(), "%Y"))
 ggplot()+
   theme_bw()+
-  theme(panel.spacing=grid::unit(0, "lines"))+
+  theme(
+    panel.spacing=grid::unit(0, "lines"),
+    axis.text.x=element_text(hjust=1, angle=40))+
   geom_tile(aes(
-    year, package, fill=log(N+1)),
+    year, package, fill=N),
     data=releases.per.year)+
   geom_text(aes(
     year, package, label=N),
     data=releases.per.year)+
   facet_grid(Project ~ ., labeller=label_both, scales="free", space="free")+
-  scale_fill_gradient("releases\n(log scale)", low="white", high="red")+
-  scale_x_continuous(breaks=seq(2006, 2022, by=2))+
+  scale_fill_gradient(
+    "releases",
+    low="white",
+    high="red",
+    breaks=c(0, 2^seq(0, 4)),
+    transform=scales::transform_log1p())+
+  scale_x_continuous(breaks=seq(2006, this.year))+
   coord_cartesian(expand=FALSE)
 ```
 

From f4f5416998598b4204139f2c7da2191328c238d7 Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 14:57:19 -0400
Subject: [PATCH 07/11] prop

---
 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index c4ac0f5..f77fd4d 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -227,10 +227,11 @@ library(ggplot2)
 gg <- ggplot(count.dt, aes(
   release, people, color=Role))+
   theme(
+    panel.grid.minor=element_blank(),
     axis.text.x=element_text(hjust=1, angle=40))+
   geom_line(aes(linewidth=Role))+
   geom_point(shape=21, fill="white")+
-  scale_x_date(breaks="year", )+
+  scale_x_date(breaks="year")+
   scale_linewidth_manual(values=linewidth.values)+
   scale_y_log10(limits=c(0.2, 500))
 gg
@@ -241,10 +242,17 @@ To emphasize the values at each release, we add direct labels below:
 
 ```{r}
 pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
+prop <- 0.1
 space.cm <- 0.2
 cex <- 0.7
 directlabels::direct.label(
   gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+
+  scale_x_date(
+    breaks=grid.dt$grid.IDate,
+    limits=grid.dt[, {
+      i <- as.integer(grid.IDate)
+      as.IDate(c(min(i), (1+prop)*max(i)-prop*min(i)))
+    }])+
   directlabels::geom_dl(aes(
     label=sprintf("%s\n%s", version, pp(people))),
     data=count.dt[Role=="ctb"],

From 77f6c897c2171789691b378331ce96b0df53f534 Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 15:00:38 -0400
Subject: [PATCH 08/11] comments

---
 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index f77fd4d..df212de 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -242,9 +242,12 @@ To emphasize the values at each release, we add direct labels below:
 
 ```{r}
 pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
+## To define upper limit of X scale, we use prop.
+## prop=0 means no extra space.
+## prop=0.1 means 10% more space, etc.
 prop <- 0.1
-space.cm <- 0.2
-cex <- 0.7
+space.cm <- 0.2 # offset of direct labels from data.
+cex <- 0.7 # text size of direct labels.
 directlabels::direct.label(
   gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+
   scale_x_date(

From 56e3cd44144032f95b81b76d445d17911dd48a9f Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 15:17:39 -0400
Subject: [PATCH 09/11] poly.method

---
 .../index.qmd                                      | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index df212de..36ca56f 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -246,8 +246,14 @@ pp <- function(num)sprintf("%d %s", num, ifelse(num==1, "person", "people"))
 ## prop=0 means no extra space.
 ## prop=0.1 means 10% more space, etc.
 prop <- 0.1
-space.cm <- 0.2 # offset of direct labels from data.
-cex <- 0.7 # text size of direct labels.
+space.cm <- 0.2 # space between polygon point and data point.
+poly.method <- function(position, direction)substitute(list(
+  directlabels::dl.trans(
+    cex=0.7, # text size of direct labels.
+    y=y+YSPACE),
+  directlabels::polygon.method(
+    POSITION, offset.cm=0.5)), #space between polygon point and text.
+  list(YSPACE=direction*space.cm, POSITION=position))
 directlabels::direct.label(
   gg, list(directlabels::dl.trans(x=x+space.cm), "right.polygons"))+
   scale_x_date(
@@ -259,11 +265,11 @@ directlabels::direct.label(
   directlabels::geom_dl(aes(
     label=sprintf("%s\n%s", version, pp(people))),
     data=count.dt[Role=="ctb"],
-    method=list(directlabels::dl.trans(cex=cex, y=y+space.cm), "top.polygons"))+
+    method=poly.method("top", 1))+
   directlabels::geom_dl(aes(
     label=sprintf("%s\n%s", pp(people), version)),
     data=count.dt[Role=="aut"],
-    method=list(directlabels::dl.trans(cex=cex, y=y-space.cm), "bottom.polygons"))
+    method=poly.method("bottom", -1))
 ```
 
 The figure above shows that the number of authors and contributors has greatly expanded in the second decade of `data.table`.

From 83c1669acc28c672527b9f05abf4bc08a31bdcad Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.dylan.hocking@usherbrooke.ca>
Date: Fri, 17 Apr 2026 15:43:21 -0400
Subject: [PATCH 10/11] ggtitle

---
 posts/2026-04-15-happy_birthday-toby_hocking/index.qmd | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index 36ca56f..ae1fb43 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -157,16 +157,15 @@ We see above that the `Author` field can contain newlines, which we remove below
 desc.dt[, no.newlines := gsub("\n", " ", Author)][, cat_head(no.newlines)]
 ```
 
-We see above that the new column has no newlines.
-The output above has one line of comma-separated authors per ten year release.
-We would like to convert it to a table with one year per author.
+The output above has one line of comma-separated authors per release.
+We would like to convert these data to a table with one year per author.
 A simple approach would be
 
 ```{r}
 head(sapply(strsplit(desc.dt$no.newlines, ", "), head))
 ```
 
-It is clear that the result above does not quite work (Matt’s info is broken into the first two entries).
+It is clear that the result above does not quite work (Matt’s `aut, cre` role contains a comma, so it is split into two entries).
 Instead we can use
 
 ```{r}
@@ -213,7 +212,7 @@ author.dt[
 ```
 
 Above we use `fcase()` to create a new `Role` column, with factor levels in a non-default order (to control legend entry display order below).
-Then we chain square brackets to display a table which shows how `roles` values are mapped to `Role`.
+Then we chain square brackets to display a table which shows how `roles` are mapped to `Role`.
 The counts look reasonable, so the next step is to count how many people with each role in each release:
 
 ```{r}
@@ -226,6 +225,7 @@ How has this evolved in the past ten years?
 library(ggplot2)
 gg <- ggplot(count.dt, aes(
   release, people, color=Role))+
+  ggtitle("data.table contributor and author counts for selected releases")+
   theme(
     panel.grid.minor=element_blank(),
     axis.text.x=element_text(hjust=1, angle=40))+

From 7ba5b0a3021cec1590aa86f82e40916800a73703 Mon Sep 17 00:00:00 2001
From: Toby Dylan Hocking <toby.hocking@r-project.org>
Date: Mon, 20 Apr 2026 22:32:58 -0400
Subject: [PATCH 11/11] options

---
 .../index.qmd                                 | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
index ae1fb43..0edbb71 100644
--- a/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
+++ b/posts/2026-04-15-happy_birthday-toby_hocking/index.qmd
@@ -12,8 +12,6 @@ knitr::opts_chunk$set(
   dpi=100,
   fig.width=12,
   fig.height=4)
-options(
-  datatable.print.nrows=20)
 ```
 
 Happy birthday, `data.table`!
@@ -64,6 +62,7 @@ The code above specifies a regular expression:
 Below we use that pattern to convert the web page into a data table with two columns,
 
 ```{r}
+options(datatable.print.nrows=20) # instead of default 100.
 nc::capture_all_str(Archive.data.table, file.pattern)
 ```
 
@@ -147,17 +146,24 @@ desc.dt <- nearest.dt[, {
   close(conn)
   as.data.table(read.dcf(DESCRIPTION)[,"Author",drop=FALSE])
 }, by=.(version, release)]
-cat_head <- function(x)cat(head(x),sep="\n-----------\n")
-cat_head(desc.dt$Author)
 ```
 
-We see above that the `Author` field can contain newlines, which we remove below, to make later parsing easier:
+To avoid printing the full `Author` column (a long string), we can set the `datatable.prettyprint.char` option (and `width`, to fit more columns on each line):
+
+```{r}
+options(
+  datatable.prettyprint.char=30, # truncate printed character entries: print ... after this many characters.
+  width=100) # max characters per output line before wrapping columns to next line.
+desc.dt
+```
+
+We see above that the `Author` field can contain newlines (after the comma), which we remove below, to make later parsing easier:
 
 ```{r}
-desc.dt[, no.newlines := gsub("\n", " ", Author)][, cat_head(no.newlines)]
+desc.dt[, no.newlines := gsub("\n", " ", Author)][]
 ```
 
-The output above has one line of comma-separated authors per release.
+The output above has a new column, `no.newlines`, which contains the comma-separated authors of each release (with no newlines).
 We would like to convert these data to a table with one year per author.
 A simple approach would be