diff --git a/NEWS.md b/NEWS.md
index c91e72b81c..4c9647cdfb 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -133,6 +133,8 @@
     # 2:     3
     ```
 
+24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR.
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.
diff --git a/R/data.table.R b/R/data.table.R
index 96e9547fc7..4dfa9c276a 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -809,8 +809,8 @@ replace_dot_alias = function(e) {
           # when the 'by' expression includes get/mget/eval, all.vars cannot be trusted to infer all used columns, #4981
           allbyvars = NULL
         else
-          allbyvars = intersect(all.vars(bysub), names_x)  
-        
+          allbyvars = intersect(all.vars(bysub), names_x)
+
         orderedirows = .Call(CisOrderedSubset, irows, nrow(x))  # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__)
         bysameorder = byindex = FALSE
         if (!bysub %iscall% ":" && ##Fix #4285
@@ -1740,13 +1740,13 @@ replace_dot_alias = function(e) {
           # is.symbol() is for #1369, #1974 and #2949
           if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE)
           if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE)  # 875
-          if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE)
+          if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE)
           #                       ^^ base::startWith errors on NULL unfortunately
           #        head-tail uses default value n=6 which as of now should not go gforce ... ^^
           # otherwise there must be three arguments, and only in two cases:
           #   1) head/tail(x, 1) or 2) x[n], n>0
           length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) &&
-            ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) )
+            ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) )
         }
         if (jsub[[1L]]=="list") {
           GForce = TRUE
@@ -1762,6 +1762,8 @@ replace_dot_alias = function(e) {
               if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame())  # tests 1187.2 & 1187.4
             }
           else {
+            # adding argument to ghead/gtail if none is supplied to g-optimized head/tail
+            if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L
             jsub[[1L]] = as.name(paste0("g", jsub[[1L]]))
             if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame())   # tests 1187.3 & 1187.5
           }
@@ -1841,6 +1843,25 @@ replace_dot_alias = function(e) {
     ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971.
     gi = if (length(o__)) o__[f__] else f__
     g = lapply(grpcols, function(i) groups[[i]][gi])
+
+    # adding ghead/gtail(n) support for n > 1 #5060 #523
+    q3 = 0
+    if (!is.symbol(jsub)) {
+      headTail_arg = function(q) {
+        if (length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) &&
+         (q1 <- q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3
+        else 0
+      }
+      if (jsub[[1L]] == "list"){
+        q3 = max(sapply(jsub, headTail_arg))
+      } else if (length(jsub)==3L) {
+        q3 = headTail_arg(jsub)
+      }
+    }
+    if (q3 > 0) {
+      grplens = pmin.int(q3, len__)
+      g = lapply(g, rep.int, times=grplens)
+    }
     ans = c(g, ans)
   } else {
     ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 4d782e4e92..32b16e471f 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -8116,21 +8116,36 @@ test(1579.18, dt[, tail(.SD,1L), by=x],    dt[, utils::tail(.SD,1L), by=x])
 test(1579.19, dt[, tail(.SD,1L), by=x],    dt[, utils::tail(.SD,1L), by=x])
 test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])
 test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])
-# GForce _doesn't_ work when n > 1
-test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE')
+# 1579.22 tested gtail with n>1; now 1579.4+ below
 
 mysub <- function(x, n) x[n]
-test(1579.23, dt[, .SD[2],  by=x],    dt[, mysub(.SD,2),  by=x])
-test(1579.24, dt[, .SD[2],  by=x],    dt[, mysub(.SD,2),  by=x])
-test(1579.25, dt[, .SD[2],  keyby=x], dt[, mysub(.SD,2),  keyby=x])
-test(1579.26, dt[, .SD[2],  keyby=x], dt[, mysub(.SD,2),  keyby=x])
-test(1579.27, dt[, .SD[2L], by=x],    dt[, mysub(.SD,2L), by=x])
-test(1579.28, dt[, .SD[2L], by=x],    dt[, mysub(.SD,2L), by=x])
-test(1579.29, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
-test(1579.30, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
-
-ans = capture.output(dt[, .SD[2], by=x, verbose=TRUE])
-test(1579.31, any(grepl("GForce optimized", ans)), TRUE)
+test(1579.23, dt[, .SD[2],  by=x, verbose=TRUE], dt[, mysub(.SD,2),  by=x], output="GForce optimized.*g[[]")
+test(1579.24, dt[, .SD[2],  keyby=x], dt[, mysub(.SD,2),  keyby=x])
+test(1579.25, dt[, .SD[2L], by=x],    dt[, mysub(.SD,2L), by=x])
+test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
+test(1579.27, dt[, .SD[15], by=x],    dt[, mysub(.SD,15), by=x])   # tests 15 > grpsize and that NA is correct including for integer64
+test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x])
+
+# gforce head/tail for n>1, #5060
+set.seed(99)
+DT = data.table(x = sample(letters[1:5], 20, TRUE),
+                y = rep.int(1:2, 10),  # to test 2 grouping columns get rep'd properly
+                i = sample(c(-2L,0L,3L,NA), 20, TRUE),
+                d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE),
+                s = sample(c("foo","bar",NA), 20, TRUE),
+                l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE))
+if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)]
+options(datatable.optimize=2L)
+test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3))   # the smallest group is 2, so n=5 tests n constrained to grpsize
+test(1579.402, DT[, head(.SD,2),    by=x, verbose=TRUE], DT[, utils::head(.SD,2),    by=x], output="optimized.*ghead")
+test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead")
+test(1579.404, DT[, head(.SD,5),    by=x, verbose=TRUE], DT[, utils::head(.SD,5),    by=x], output="optimized.*ghead")
+test(1579.405, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead")
+test(1579.406, DT[, tail(.SD,2),    by=x, verbose=TRUE], DT[, utils::tail(.SD,2),    by=x], output="optimized.*gtail")
+test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail")
+test(1579.408, DT[, tail(.SD,5),    by=x, verbose=TRUE], DT[, utils::tail(.SD,5),    by=x], output="optimized.*gtail")
+test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail")
+test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail")
 
 options(datatable.optimize = Inf)
 
@@ -14695,11 +14710,11 @@ DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1)
 test(2018.1, DT[, head(.SD), a, verbose=TRUE],
              data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)),
              output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'",
-                      "GForce is on, left j unchanged"))
+                      "GForce optimized j to 'list(ghead(b, n = 6L), ghead(d, n = 6L))'"))
 test(2018.2, DT[, head(b), a, verbose=TRUE],
              data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)),
              output=c("lapply optimization is on, j unchanged as 'head(b)'",
-                      "GForce is on, left j unchanged"))
+                      "GForce optimized j to 'ghead(b, n = 6L)'"))
 test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1)))
 test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12)))
 # gforce tests coverage
diff --git a/src/gsumm.c b/src/gsumm.c
index f806b1e3c8..7470f9f527 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -900,81 +900,72 @@ SEXP gmedian(SEXP x, SEXP narmArg) {
   return ans;
 }
 
-static SEXP gfirstlast(SEXP x, const bool first, const int w) {
+static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) {
   // w: which item (1 other than for gnthvalue when could be >1)
+  // headw: select 1:w of each group when first=true, and (n-w+1):n when first=false (i.e. tail)
   const bool nosubset = irowslen == -1;
+  const bool issorted = !isunsorted; // make a const-bool for use inside loops
   const int n = nosubset ? length(x) : irowslen;
-  SEXP ans;
   if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, first?"gfirst":"glast");
-  const bool gnth = w>1;  // const bool to avoid fetching grpsize[i] when not needed
-  switch(TYPEOF(x)) {
-  case LGLSXP: {
-    const int *ix = LOGICAL(x);
-    ans = PROTECT(allocVector(LGLSXP, ngrp));
-    int *ians = LOGICAL(ans);
-    for (int i=0; i<ngrp; ++i) {
-      if (gnth && w>grpsize[i]) { ians[i]=NA_LOGICAL; continue; }
-      int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1;
-      if (isunsorted) k = oo[k]-1;
-      ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_LOGICAL : ix[irows[k]-1]);
-    }
-  }
-    break;
-  case INTSXP: {
-    const int *ix = INTEGER(x);
-    ans = PROTECT(allocVector(INTSXP, ngrp));
-    int *ians = INTEGER(ans);
+  if (w==1 && headw) error(_("Internal error: gfirstlast headw should only be true when w>1"));
+  int anslen = ngrp;
+  if (headw) {
+    anslen = 0;
     for (int i=0; i<ngrp; ++i) {
-      if (gnth && w>grpsize[i]) { ians[i]=NA_INTEGER; continue; }
-      int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1;
-      if (isunsorted) k = oo[k]-1;
-      ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_INTEGER : ix[irows[k]-1]);
+      anslen += MIN(w, grpsize[i]);
     }
   }
-    break;
-  case REALSXP: {
-    const double *dx = REAL(x);
-    ans = PROTECT(allocVector(REALSXP, ngrp));
-    double *dans = REAL(ans);
-    for (int i=0; i<ngrp; ++i) {
-      if (gnth && w>grpsize[i]) { dans[i]=NA_REAL; continue; }
-      int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1;
-      if (isunsorted) k = oo[k]-1;
-      dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_REAL : dx[irows[k]-1]);
-    }
+  SEXP ans = PROTECT(allocVector(TYPEOF(x), anslen));
+  int ansi = 0;
+  #define DO(CTYPE, RTYPE, RNA, ASSIGN) {                                                          \
+    const CTYPE *xd = (const CTYPE *)RTYPE(x);                                                     \
+    if (headw) {                                                                                   \
+      /* returning more than 1 per group; w>1 */                                                   \
+      for (int i=0; i<ngrp; ++i) {                                                                 \
+        const int grpn = grpsize[i];                                                               \
+        const int thisn = MIN(w, grpn);                                                            \
+        const int jstart = ff[i]-1+ (!first)*(grpn-thisn);                                         \
+        const int jend = jstart+thisn;                                                             \
+        for (int j=jstart; j<jend; ++j) {                                                          \
+          const int k = issorted ? j : oo[j]-1;                                                    \
+          /* ternary on const-bool assumed to be branch-predicted and ok inside loops */           \
+          const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]);      \
+          ASSIGN;                                                                                  \
+        }                                                                                          \
+      }                                                                                            \
+    } else if (w==1) {                                                                             \
+      for (int i=0; i<ngrp; ++i) {                                                                 \
+        const int j = ff[i]-1 + (first ? 0 : grpsize[i]-1);                                        \
+        const int k = issorted ? j : oo[j]-1;                                                      \
+        const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]);        \
+        ASSIGN;                                                                                    \
+      }                                                                                            \
+    } else if (w>1 && first) {                                                                     \
+      /* gnthvalue */                                                                              \
+      for (int i=0; i<ngrp; ++i) {                                                                 \
+        const int grpn = grpsize[i];                                                               \
+        if (w>grpn) { const CTYPE val=RNA; ASSIGN; continue; }                                     \
+        const int j = ff[i]-1+w-1;                                                                 \
+        const int k = issorted ? j : oo[j]-1;                                                      \
+        const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]);        \
+        ASSIGN;                                                                                    \
+      }                                                                                            \
+    } else {                                                                                       \
+      /* w>1 && !first not supported because -i in R means everything-but-i and gnthvalue */       \
+      /* currently takes n>0 only. However, we could still support n'th from the end, somehow */   \
+      error(_("Internal error: unanticipated case in gfirstlast first=%d w=%d headw=%d"),          \
+              first, w, headw);                                                                    \
+    }                                                                                              \
   }
-    break;
-  case CPLXSXP: {
-    const Rcomplex *dx = COMPLEX(x);
-    ans = PROTECT(allocVector(CPLXSXP, ngrp));
-    Rcomplex *dans = COMPLEX(ans);
-    for (int i=0; i<ngrp; ++i) {
-      if (gnth && w>grpsize[i]) { dans[i]=NA_CPLX; continue; }
-      int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1;
-      if (isunsorted) k = oo[k]-1;
-      dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_CPLX : dx[irows[k]-1]);
-    }
-  } break;
-  case STRSXP: {
-    const SEXP *sx = STRING_PTR(x);
-    ans = PROTECT(allocVector(STRSXP, ngrp));
-    for (int i=0; i<ngrp; ++i) {
-      if (gnth && w>grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; }
-      int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1;
-      if (isunsorted) k = oo[k]-1;
-      SET_STRING_ELT(ans, i, nosubset ? sx[k] : (irows[k]==NA_INTEGER ? NA_STRING : sx[irows[k]-1]));
-    }
-  } break;
-  case VECSXP: {
-    const SEXP *vx = SEXPPTR_RO(x);
-    ans = PROTECT(allocVector(VECSXP, ngrp));
-    for (int i=0; i<ngrp; ++i) {
-      if (gnth && w>grpsize[i]) { SET_VECTOR_ELT(ans, i, ScalarLogical(NA_LOGICAL)); continue; }
-      int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1;
-      if (isunsorted) k = oo[k]-1;
-      SET_VECTOR_ELT(ans, i, nosubset ? vx[k] : (irows[k]==NA_INTEGER ? ScalarLogical(NA_LOGICAL) : vx[irows[k]-1]));
-    }
-  } break;
+  switch(TYPEOF(x)) {
+  case LGLSXP:  { int      *ansd=LOGICAL(ans); DO(int,      LOGICAL, NA_LOGICAL,   ansd[ansi++]=val) } break;
+  case INTSXP:  { int      *ansd=INTEGER(ans); DO(int,      INTEGER, NA_INTEGER,   ansd[ansi++]=val) } break;
+  case REALSXP: if (INHERITS(x, char_integer64)) {
+           int64_t *ansd=(int64_t *)REAL(ans); DO(int64_t,  REAL,    NA_INTEGER64, ansd[ansi++]=val) }
+           else { double      *ansd=REAL(ans); DO(double,   REAL,    NA_REAL,      ansd[ansi++]=val) } break;
+  case CPLXSXP: { Rcomplex *ansd=COMPLEX(ans); DO(Rcomplex, COMPLEX, NA_CPLX,      ansd[ansi++]=val) } break;
+  case STRSXP:  DO(SEXP, STRING_PTR, NA_STRING,                 SET_STRING_ELT(ans,ansi++,val))        break;
+  case VECSXP:  DO(SEXP, SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(ans,ansi++,val))        break;
   default:
     error(_("Type '%s' not supported by GForce head/tail/first/last/`[`. Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)));
   }
@@ -984,26 +975,28 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w) {
 }
 
 SEXP glast(SEXP x) {
-  return gfirstlast(x, false, 1);
+  return gfirstlast(x, false, 1, false);
 }
 
 SEXP gfirst(SEXP x) {
-  return gfirstlast(x, true, 1);
+  return gfirstlast(x, true, 1, false);
 }
 
-SEXP gtail(SEXP x, SEXP valArg) {
-  if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, gtail is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov
-  return gfirstlast(x, false, 1);
+SEXP gtail(SEXP x, SEXP nArg) {
+  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov
+  const int n=INTEGER(nArg)[0];
+  return n==1 ? glast(x) : gfirstlast(x, false, n, true);
 }
 
-SEXP ghead(SEXP x, SEXP valArg) {
-  if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, ghead is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov
-  return gfirstlast(x, true, 1);
+SEXP ghead(SEXP x, SEXP nArg) {
+  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov
+  const int n=INTEGER(nArg)[0];
+  return n==1 ? gfirst(x) : gfirstlast(x, true, n, true);
 }
 
-SEXP gnthvalue(SEXP x, SEXP valArg) {
-  if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<=0) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov
-  return gfirstlast(x, true, INTEGER(valArg)[0]);
+SEXP gnthvalue(SEXP x, SEXP nArg) {
+  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov
+  return gfirstlast(x, true, INTEGER(nArg)[0], false);
 }
 
 // TODO: gwhich.min, gwhich.max