-
Notifications
You must be signed in to change notification settings - Fork 1k
Expand file tree
/
Copy pathdata.table.R
More file actions
3751 lines (3563 loc) · 189 KB
/
data.table.R
File metadata and controls
3751 lines (3563 loc) · 189 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
dim.data.table = function(x)
{
.Call(Cdim, x)
}
.global = new.env() # thanks to: http://stackoverflow.com/a/12605694/403310
methods::setPackageName("data.table",.global)
.global$print = ""
# NB: if adding to/editing this list, be sure to do the following:
# (1) add to man/special-symbols.Rd
# (2) export() in NAMESPACE
# (3) add to vignettes/datatable-importing.Rmd#globals section
.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL
# These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
# But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
# And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
# utils::globalVariables(c(".SD",".N")) was tried as well, but exporting seems better.
# So even though .BY doesn't appear in this file, it should still be NULL here and exported because it's
# defined in SDenv and can be used by users.
is.data.table = function(x) inherits(x, "data.table")
is.ff = function(x) inherits(x, "ff") # define this in data.table so that we don't have to require(ff), but if user is using ff we'd like it to work
#NCOL = function(x) {
# # copied from base, but additionally covers data.table via is.list()
# # because NCOL in base explicitly tests using is.data.frame()
# if (is.list(x) && !is.ff(x)) return(length(x))
# if (is.array(x) && length(dim(x)) > 1L) ncol(x) else as.integer(1L)
#}
#NROW = function(x) {
# if (is.data.frame(x) || is.data.table(x)) return(nrow(x))
# if (is.list(x) && !is.ff(x)) stopf("List is not a data.frame or data.table. Convert first before using NROW") # list may have different length elements, which data.table and data.frame's resolve.
# if (is.array(x)) nrow(x) else length(x)
#}
null.data.table = function() {
ans = list()
setattr(ans,"class",c("data.table","data.frame"))
setattr(ans,"row.names",.set_row_names(0L))
setalloccol(ans)
}
data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
{
# NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l
# first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking.
x = list(...) # list() doesn't copy named inputs as from R >= 3.1.0 (a very welcome change)
nd = name_dots(...)
names(x) = nd$vnames
if (length(x)==0L) return( null.data.table() )
if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #48
ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names, .named=nd$.named) # see comments inside as.data.table.list re copies
if (!is.null(key)) {
if (!is.character(key)) stopf("key argument of data.table() must be character")
if (length(key)==1L) key = cols_from_csv(key)
setkeyv(ans,key)
} else {
# retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table().
# If DT inputs with keys have been recycled then can't retain key
ckey = NULL
for (i in seq_along(x)) {
xi = x[[i]]
if (is.data.table(xi) && haskey(xi) && nrow(xi)==nrow(ans)) ckey=c(ckey, key(xi))
}
if (length(ckey) &&
!anyDuplicated(ckey) &&
identical(is.na(chmatchdup(c(ckey,ckey), names(ans))), rep(c(FALSE,TRUE),each=length(ckey)))) {
setattr(ans, "sorted", ckey)
}
}
if (isTRUE(stringsAsFactors)) {
for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
# as_factor is internal function in fread.R currently
}
setalloccol(ans) # returns a NAMED==0 object, unlike data.frame()
}
replace_dot_alias = function(e) {
# we don't just simply alias .=list because i) list is a primitive (faster to iterate) and ii) we test for use
# of "list" in several places so it saves having to remember to write "." || "list" in those places
if (is.call(e) && !is.function(e[[1L]])) {
# . alias also used within bquote, #1912
if (e[[1L]] == 'bquote') return(e)
if (e[[1L]] == ".") e[[1L]] = quote(list)
for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_dot_alias(e[[i]])
}
e
}
.massagei = function(x) {
# J alias for list as well in i, just if the first symbol
# if x = substitute(base::order) then as.character(x[[1L]]) == c("::", "base", "order")
if (x %iscall% c("J","."))
x[[1L]] = quote(list)
x
}
.checkTypos = function(err, ref) {
err_str <- conditionMessage(err)
# a slightly wonky workaround so that this still works in non-English sessions, #4989
# generate this at run time (as opposed to e.g. onAttach) since session language is
# technically OK to update (though this should be rare), and since it's low-cost
# to do so here because we're about to error anyway.
missing_obj_regex = gsub(
"'____missing_datatable_variable____'",
"'(?<obj_name>[^']+)'",
# expression() to avoid "no visible binding for global variable" note from R CMD check
conditionMessage(tryCatch(eval(quote(`____missing_datatable_variable____`)), error=identity)),
fixed=TRUE
)
idx = regexpr(missing_obj_regex, err_str, perl=TRUE)
if (idx == -1L)
stop(err) # Pass 'err' to retain call site data (#7444); beware also #6588
start = attr(idx, "capture.start", exact=TRUE)[ , "obj_name"]
used = substr(
err_str,
start,
start + attr(idx, "capture.length", exact=TRUE)[ , "obj_name"] - 1L
)
found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE)
if (length(found)) {
stopf("Object '%s' not found. Perhaps you intended %s", used, brackify(found))
} else {
stopf("Object '%s' not found amongst %s", used, brackify(ref))
}
}
.reassign_extracted_table = function(name, value, env = parent.frame(2L)) {
k = eval(name[[2L]], env, env)
if (is.list(k)) {
origj = j = if (name %iscall% "$") as.character(name[[3L]]) else eval(name[[3L]], env, env)
if (length(j) != 1L) {
stopf("Invalid set* operation on a recursive index L[[i]] where i has length %d. Chain [[ instead.", length(j))
}
if (is.character(j)) {
j = match(j, names(k))
if (is.na(j)) {
stopf("Item '%s' not found in names of input list", origj)
}
}
.Call(Csetlistelt, k, as.integer(j), value)
} else if (is.environment(k) && exists(as.character(name[[3L]]), k, inherits = FALSE)) {
assign(as.character(name[[3L]]), value, k, inherits = FALSE)
} else if (isS4(k)) {
.Call(CsetS4elt, k, as.character(name[[3L]]), value)
}
}
# Transform lapply(.SD, fun) or Map(fun, .SD) into list(fun(col1), fun(col2), ...)
#
# It may seem inefficient to construct a potentially long expression. But, consider calling
# lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it
# is called, involving small memory allocations.
# The R level lapply calls as.list which needs a shallow copy.
# lapply also does a setAttib of names (duplicating the same names over and over again
# for each group) which is terrible for our needs. We replace all that with a
# (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol
# lookup), and the eval() inside dogroups hardly has to do anything. All this results in
# overhead minimised. We don't need to worry about the env passed to the eval in a possible
# lapply replacement, or how to pass ... efficiently to it.
# Plus we optimize lapply first, so that mean() can be optimized too as well, next.
.massageSD = function(jsub, sdvars, SDenv, funi) {
txt = as.list(jsub)[-1L]
if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110
# support Map instead of lapply #5336
fun = if (jsub %iscall% "Map") txt[[1L]] else txt[[2L]]
if (fun %iscall% "function") { # NB: '\(x)' only exists pre-parser, so it's also covered
# Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT
# http://stackoverflow.com/questions/13441868/data-table-and-stratified-means
# adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD)
# replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!)
thisfun = paste0("..LAPPLY_FUN", funi) # Fix for #985
assign(thisfun, eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD
lockBinding(thisfun, SDenv)
txt[[1L]] = as.name(thisfun)
} else {
if (is.character(fun)) fun = as.name(fun)
txt[[1L]] = fun
}
ans = vector("list", length(sdvars)+1L)
ans[[1L]] = as.name("list")
for (ii in seq_along(sdvars)) {
txt[[2L]] = as.name(sdvars[ii])
ans[[ii+1L]] = as.call(txt)
}
jsub = as.call(ans) # important no names here
jvnames = sdvars # but here instead
list(jsub=jsub, jvnames=jvnames, funi=funi+1L)
}
# Optimize .SD subsetting patterns like .SD[1], head(.SD), first(.SD)
# return NULL for no optimization possible
.optimize_sd_subset = function(jsub, sdvars, SDenv, envir) {
if (!is.call(jsub) || length(jsub) < 2L || !is.name(jsub[[2L]]) || jsub[[2L]] != ".SD") return(NULL)
# g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612
subopt = length(jsub) == 3L &&
(jsub %iscall% "[" ||
(jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), SDenv$.SDall, envir))) &&
(is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
headopt = jsub %iscall% c("head", "tail")
firstopt = jsub %iscall% c("first", "last") # fix for #2030
if (subopt || headopt || firstopt) {
if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
jsub_new = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub })))
return(list(jsub=jsub_new, jvnames=sdvars))
}
NULL
}
# Optimize c(...) expressions
.optimize_c_expr = function(jsub, jvnames, sdvars, SDenv, funi, envir) {
if (!jsub %iscall% "c" || length(jsub) <= 1L) {
return(list(jsub=jsub, jvnames=jvnames, funi=funi, optimized=FALSE))
}
# FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here.
# FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains
# (1) lapply(.SD, ...)
# (2) simply .SD or .SD[..]
# (3) .N
# (4) list(...)
# (5) functions that normally return a single value*
# On (5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always*
# return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT.
# One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output
# Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations.
# For now, we optimise all functions mentioned in 'optfuns' below.
optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var")
any_optimized = FALSE
jsubl = as.list.default(jsub)
oldjvnames = jvnames
jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward.
for (i_ in 2L:length(jsubl)) {
this = jsub[[i_]]
# Case 1: Plain name (.SD or .N)
if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names
if (this == ".SD") { # optimise '.SD' alone
any_optimized = TRUE
jsubl[[i_]] = lapply(sdvars, as.name)
jvnames = c(jvnames, sdvars)
} else if (this == ".N") {
# don't optimise .I in c(.SD, .I), its length can be > 1
# only c(.SD, list(.I)) should be optimised!! .N is always length 1.
jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this))
} else {
# jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_])
return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE))
}
}
# Case 2: Call expression
else if (is.call(this)) {
# Case 2a: lapply(.SD, ...) or Map(fun, .SD)
is_lapply = this[[1L]] == "lapply" && length(this) >= 2L && this[[2L]] == ".SD"
is_map = this[[1L]] == "Map" && length(this) >= 3L && this[[3L]] == ".SD"
if ((is_lapply || is_map) && length(sdvars)) {
any_optimized = TRUE
massage_result = .massageSD(this, sdvars, SDenv, funi)
funi = massage_result$funi
jsubl[[i_]] = as.list(massage_result$jsub[-1L]) # just keep the '.' from list(.)
jn__ = massage_result$jvnames
if (isTRUE(nzchar(names(jsubl)[i_]))) {
# Fix for #2311, prepend named arguments of c() to column names of .SD
# e.g. c(mean=lapply(.SD, mean)) or c(mean=Map(mean, .SD))
jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1))
}
jvnames = c(jvnames, jn__)
}
# Case 2b: list(...)
else if (this[[1L]] == "list") {
# also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen
if (length(this) == 1L) {
jsubl[[i_]] = list() # empty list gets dropped by unlist later
next
}
jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.)
# Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.*
jl__names = names(jl__) %||% rep("", length(jl__))
pname = names(jsubl)[i_]
if (isTRUE(nzchar(pname))) {
jl__hasname = nzchar(jl__names)
jn__ = if (length(jl__) > 1L) paste0(pname, seq_along(jl__)) else pname
jn__[jl__hasname] = paste(pname, jl__names[jl__hasname], sep=".")
} else {
jn__ = jl__names
}
idx = vapply_1b(jl__, identical, quote(.I))
if (any(idx))
jn__[idx & !nzchar(jn__)] = "I" # this & is correct not &&
jvnames = c(jvnames, jn__)
jsubl[[i_]] = jl__
any_optimized = TRUE
}
# Case 2c: Single-value functions like mean, sum, etc.
else if (this %iscall% optfuns && length(this)>1L) {
jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_])
}
# Case 2d: .SD[1] or similar subsetting
else if (length(this) == 3L
&& (this[[1L]] == "[" || this[[1L]] == "head")
&& this[[2L]] == ".SD"
&& (is.numeric(this[[3L]]) || this[[3L]] == ".N")) {
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
any_optimized = TRUE
jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this })
jvnames = c(jvnames, sdvars)
}
# Case 2e: Complex .SD usage - can't optimize
# else if (any(all.vars(this) == ".SD")) {
# TODO, TO DO: revisit complex cases (as illustrated below)
# complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp]
# hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation.
# return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE))
# }
# Case 2f: Other cases - skip optimization
else {
# TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any?
return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE))
}
}
# Case 3: Other types - can't optimize
else {
return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE))
}
}
# Return result
if (!any_optimized) {
# Can't optimize - return original
return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE))
}
# Optimization successful
setattr(jsubl, 'names', NULL)
jsub_new = as.call(unlist(jsubl, use.names=FALSE))
jsub_new[[1L]] = quote(list)
list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE)
}
# Optimize lapply(.SD, ...) expressions
# This function transforms lapply(.SD, fun) into list(fun(col1), fun(col2), ...)
# Returns: list(jsub=call/name, jvnames=character)
.optimize_lapply = function(jsub, jvnames, sdvars, SDenv, verbose, envir) {
oldjsub = jsub
funi = 1L # Fix for #985
# Try different optimization patterns in order
# Pattern 1: Plain .SD -> list(col1, col2, ...)
if (is.name(jsub) && jsub == ".SD") {
jsub = as.call(c(quote(list), lapply(sdvars, as.name)))
jvnames = sdvars
}
# Pattern 2: .SD subsetting like .SD[1], head(.SD), first(.SD)
else if (!is.null(result <- .optimize_sd_subset(jsub, sdvars, SDenv, envir))) {
jsub = result$jsub
jvnames = result$jvnames
}
# Pattern 3a: lapply(.SD, fun)
else if (is.call(jsub) && jsub %iscall% "lapply" && length(jsub) >= 2L && jsub[[2L]] == ".SD" && length(sdvars)) {
massage_result = .massageSD(jsub, sdvars, SDenv, funi)
jsub = massage_result$jsub
jvnames = massage_result$jvnames
funi = massage_result$funi
}
# Pattern 3a2: lapply(list(col1, col2, ...), fun)
else if (is.call(jsub) && jsub %iscall% "lapply" && length(jsub) >= 2L &&
jsub[[2L]] %iscall% "list" && length(jsub[[2L]]) > 1L) {
cnames = as.list(jsub[[2L]])[-1L]
if (all(vapply_1b(cnames, is.name))) {
cnames = vapply_1c(cnames, as.character)
massage_result = .massageSD(jsub, cnames, SDenv, funi)
jsub = massage_result$jsub
jvnames = NULL # consistent with datatable.optimize=0L behavior
funi = massage_result$funi
}
}
# Pattern 3b: Map(fun, .SD)
# Only optimize if .SD appears exactly once to avoid cases like Map(rep, .SD, .SD)
else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars) &&
sum(vapply_1b(as.list(jsub), identical, quote(.SD))) == 1L) {
massage_result = .massageSD(jsub, sdvars, SDenv, funi)
jsub = massage_result$jsub
jvnames = massage_result$jvnames
funi = massage_result$funi
}
# Pattern 4: c(...) with .SD components
else if (is.call(jsub)) {
c_result = .optimize_c_expr(jsub, jvnames, sdvars, SDenv, funi, envir)
if (c_result$optimized) {
jsub = c_result$jsub
jvnames = c_result$jvnames
funi = c_result$funi
}
}
# Verbose output
if (verbose) {
if (!identical(oldjsub, jsub))
catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L))
else
catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L))
}
list(jsub=jsub, jvnames=jvnames)
}
# Optimize expressions using GForce (C-level optimizations)
# This function replaces functions like mean() with gmean() for fast C implementations
.optimize_gforce = function(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) {
GForce = FALSE
# TODO: FR #971, make GForce work with joins. joins could work with
# nomatch=NULL even now.. but not switching it on yet, will deal it separately.
if (getOption("datatable.optimize") < 2L || is.data.table(i) || byjoin || !length(f__))
return(list(GForce=FALSE, jsub=jsub))
if (!length(ansvars) && !use.I) {
if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) {
if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L))
return(list(GForce=TRUE, jsub=jsub))
}
return(list(GForce=FALSE, jsub=jsub))
}
# turn off GForce for the combination of := and .N
if (length(lhs) && is.symbol(jsub))
return(list(GForce=FALSE, jsub=jsub))
# Apply GForce
if (jsub %iscall% "list") {
GForce = TRUE
for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) {
if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break}
}
} else
GForce = .gforce_ok(jsub, SDenv$.SDall, envir)
if (!GForce) {
if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n")
return(list(GForce=FALSE, jsub=jsub))
}
if (jsub %iscall% "list") {
for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) {
if (is.N(jsub[[ii]])) next; # For #334
jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir)
}
} else {
# adding argument to ghead/gtail if none is supplied to g-optimized head/tail
if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L
jsub = .gforce_jsub(jsub, names_x, envir)
}
if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L))
list(GForce=GForce, jsub=jsub)
}
# Old mean() optimization fallback when GForce is not used
.optimize_mean = function(jsub, SDenv, verbose, GForce) {
if (GForce || is.name(jsub)) return(jsub)
# Still do the old speedup for mean, for now
nomeanopt=FALSE # to be set by .optmean() using <<- inside it
oldjsub = jsub
if (jsub %iscall% "list") {
# Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if()
# jsub[[1]]=="list" so the first item of todo will always be FALSE
todo = sapply(jsub, `%iscall%`, 'mean')
if (any(todo)) {
w = which(todo)
jsub[w] = lapply(jsub[w], .optmean)
}
} else if (jsub %iscall% "mean") {
jsub = .optmean(jsub)
}
if (nomeanopt) {
warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE)
}
if (verbose) {
if (!identical(oldjsub, jsub))
catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L))
else
catf("Old mean optimization is on, left j unchanged.\n")
}
assign("Cfastmean", Cfastmean, SDenv)
# Old comments still here for now ...
# Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow.
# Maybe change to :
# assign("mean", fastmean, SDenv) # neater than the hard work above, but slower
# when fastmean can do trim.
jsub
}
# attempts to optimize j expressions using lapply, GForce, and mean optimizations
.attempt_optimize = function(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) {
if (getOption("datatable.optimize") < 1L) {
if (verbose) catf("All optimizations are turned off\n")
return(list(GForce=FALSE, jsub=jsub, jvnames=jvnames))
}
if (!(is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N")))) {
if (verbose) catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L))
return(list(GForce=FALSE, jsub=jsub, jvnames=jvnames))
}
# Step 1: Apply lapply(.SD) optimization
lapply_result = .optimize_lapply(jsub, jvnames, sdvars, SDenv, verbose, envir)
jsub = lapply_result$jsub
jvnames = lapply_result$jvnames
# Step 2: Apply GForce optimization
gforce_result = .optimize_gforce(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir)
GForce = gforce_result$GForce
jsub = gforce_result$jsub
# Step 3: Apply old mean optimization (fallback when GForce is not used)
jsub = .optimize_mean(jsub, SDenv, verbose, GForce)
list(GForce=GForce, jsub=jsub, jvnames=jvnames)
}
"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0.0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive()))
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
# test explicitly if the caller is [.data.table (even stronger test. TO DO.)
# the drop=NULL is to sink drop argument when dispatching to [.data.frame; using '...' stops test 147
if (!cedta()) {
# Fix for #500 (to do)
if (substitute(j) %iscall% c(":=", "let")) {
# Throw a specific error message
stopf("[ was called on a data.table in an environment that is not data.table-aware (i.e. cedta()), but '%s' was used, implying the owner of this call really intended for data.table methods to be called. See vignette('datatable-importing') for details on properly importing data.table.", as.character(substitute(j)[[1L]]))
}
Nargs = nargs() - (!missing(drop))
ans = if (Nargs<3L) { `[.data.frame`(x,i) } # drop ignored anyway by DF[i]
else if (missing(drop)) `[.data.frame`(x,i,j)
else `[.data.frame`(x,i,j,drop)
# added is.data.table(ans) check to fix bug #81
if (!missing(i) && is.data.table(ans)) setkey(ans, NULL) # drops index too; tested by plyr::arrange test in other.Rraw
return(ans)
}
if (!missing(verbose)) {
if (!is.integer(verbose) && !is.logical(verbose)) stopf("verbose must be logical or integer")
if (length(verbose)!=1L || anyNA(verbose)) stopf("verbose must be length 1 non-NA")
# set the global verbose option because that is fetched from C code without having to pass it through
oldverbose = options(datatable.verbose=verbose)
on.exit(options(oldverbose))
}
.global$print=""
missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector
if (missingby || missing(j)) {
if (!missingby) warningf("Ignoring by/keyby because 'j' is not supplied")
by = bysub = NULL
keyby = FALSE
} else {
if (missing(by)) {
by = bysub = if (is.null(env)) substitute(keyby)
else eval(substitute(substitute2(.keyby, env), list(.keyby = substitute(keyby))))
keyby = TRUE
} else {
by = bysub = if (is.null(env)) substitute(by)
else eval(substitute(substitute2(.by, env), list(.by = substitute(by))))
if (missing(keyby))
keyby = FALSE
else if (!isTRUEorFALSE(keyby))
stopf("When by and keyby are both provided, keyby must be TRUE or FALSE")
}
if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304
else if (verbose && !is.null(env)) catf("Argument '%s' after substitute: %s\n", "by", paste(deparse(bysub, width.cutoff=500L), collapse="\n"))
}
bynull = !missingby && is.null(by) #3530
byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI"
naturaljoin = FALSE
names_x = names(x)
if (missing(i) && !missing(on)) {
on_tmp = eval.parent(.massagei(substitute(on)))
if (!is.list(on_tmp) || !length(names(on_tmp))) {
warningf("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join (i.e. join on common names) is invoked. Ignoring on= which is '%s'.", class1(on_tmp))
on = NULL
} else {
i = on_tmp
naturaljoin = TRUE
}
}
if (missing(i) && missing(j)) {
tt_isub = substitute(i)
tt_jsub = substitute(j)
if (!is.null(names(sys.call())) && # not relying on nargs() as it considers DT[,] to have 3 arguments, #3163
tryCatch(!is.symbol(tt_isub), error=function(e)TRUE) && # a symbol that inherits missingness from caller isn't missing for our purpose; test 1974
tryCatch(!is.symbol(tt_jsub), error=function(e)TRUE)) {
warningf("i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future.")
}
return(x)
}
if (!mult %chin% c("first", "last", "all", "error")) stopf("mult argument can only be 'first', 'last', 'all' or 'error'")
missingroll = missing(roll)
if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
if (is.character(roll)) {
if (roll!="nearest") stopf("roll is '%s' (type character). Only valid character value is 'nearest'.", roll)
} else {
roll = if (isTRUE(roll)) +Inf else as.double(roll)
}
force(rollends)
if (!is.logical(rollends)) stopf("rollends must be a logical vector")
if (length(rollends)>2L) stopf("rollends must be length 1 or 2")
if (length(rollends)==1L) rollends=rep.int(rollends,2L)
# TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one.
.unsafe.opt() #3585
missingnomatch = missing(nomatch)
nomatch0 = identical(nomatch,0.0) || identical(nomatch,0L) || identical(nomatch, FALSE) # for warning with row-numbers in i; #4353
if (nomatch0) nomatch=NULL # retain nomatch=0|FALSE backwards compatibility, #857 #5214
if (!is.null(nomatch)) {
if (!(length(nomatch)==1L && is.na(nomatch))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)")
nomatch=NA # convert NA_character_ to NA-logical, PR#5216
}
if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which)
if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.")
if (!with && missing(j)) stopf("j must be provided when with=FALSE")
if (!missing(by) && !(isTRUEorFALSE(showProgress) || (is.numeric(showProgress) && length(showProgress)==1L && showProgress >= 0))) stopf("showProgress must be TRUE, FALSE, or a single non-negative number") # nocov
irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
notjoin = FALSE
rightcols = leftcols = integer()
optimizedSubset = FALSE ## flag: tells whether a normal query was optimized into a join.
..syms = NULL
av = NULL
jsub = NULL
if (!missing(j)) {
if (is.null(env)) jsub = substitute(j) else {
jsub = eval(substitute(
substitute2(.j, env),
list(.j = substitute(j))
))
if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose && !is.null(env)) catf("Argument '%s' after substitute: %s\n", "j", paste(deparse(jsub, width.cutoff=500L), collapse="\n"))
}
}
if (!missing(j)) {
jsub = replace_dot_alias(jsub)
root = root_name(jsub)
# exclude '..1' etc. for #5460
av = grepv("^[.][.](?:[.]|[0-9]+)$", all.vars(jsub), invert=TRUE)
all..names = FALSE
if ((.is_withFALSE_range(jsub, x, root, av)) ||
(root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || ## x[, !(V8:V10)]
( (!length(av) || (all..names <- all(startsWith(av, "..")))) && ## x[, "V1"]; x[, ..v]
root %chin% c("","c","paste","paste0","-","!") && ## x[, c("V1","V2")]; x[, paste("V",1:2,sep="")]; x[, paste0("V",1:2)]
missingby )) { # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed)
# When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find.
# If variable names do occur, but they are all prefixed with .., then that means look up in calling scope.
# Automatically set with=FALSE in this case so that DT[,1], DT[,2:3], DT[,"someCol"] and DT[,c("colB","colD")]
# work as expected. As before, a vector will never be returned, but a single column data.table
# for type consistency with >1 cases. To return a single vector use DT[["someCol"]] or DT[[3]].
# The root==":" is to allow DT[,colC:colH] even though that contains two variable names.
# root == "-" or "!" is for tests 1504.11 and 1504.13 (a : with a ! or - modifier root)
# We don't want to evaluate j at all in making this decision because i) evaluating could itself
# increment some variable and not intended to be evaluated a 2nd time later on and ii) we don't
# want decisions like this to depend on the data or vector lengths since that can introduce
# inconsistency reminiscent of drop=TRUE in [.data.frame that we seek to avoid.
with=FALSE
if (all..names) {
for (..name in av) {
name = substr(..name, 3L, nchar(..name))
if (!nzchar(name)) stopf("The symbol .. is invalid. The .. prefix must be followed by at least one character.")
parent_has_..name = exists(..name, where=parent.frame())
if (!exists(name, where=parent.frame())) {
suggested = if (parent_has_..name)
gettextf(" Variable '..%s' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.", name)
# We have recommended 'manual' .. prefix in the past, so try to be helpful
else
""
stopf("Variable '%s' is not found in calling scope. Looking in calling scope because you used the .. prefix.%s", name, suggested)
} else if (parent_has_..name) {
warningf("Both '%1$s' and '..%1$s' exist in calling scope. Please remove the '..%1$s' variable in calling scope for clarity.", name)
}
}
..syms = av
}
} else if (is.name(jsub)) {
if (startsWith(as.character(jsub), "..")) internal_error("DT[, ..var] should be dealt with by the branch above now.") # nocov
if (!with && !exists(as.character(jsub), where=parent.frame()))
stopf("Variable '%s' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.", as.character(jsub))
}
if (root=="{") {
if (length(jsub) == 2L) {
jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376]
root = root_name(jsub)
} else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") {
#2142 -- j can be {} and have length 1
stopf("Invalid use of `:=` inside `{}`. `:=` must be the only expression inside `{}` when used in `j`. Instead of: DT[{tmp1 <- ...; tmp2 <- ...; someCol := tmp1 * tmp2}], Use: DT[, someCol := {tmp1 <- ...; tmp2 <- ...; tmp1 * tmp2}]")
}
}
if (root=="eval" && !any(all.vars(jsub[[2L]]) %chin% names_x)) {
# TODO: this && !any depends on data. Can we remove it?
# Grab the dynamic expression from calling scope now to give the optimizer a chance to optimize it
# Only when top level is eval call. Not nested like x:=eval(...) or `:=`(x=eval(...), y=eval(...))
jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression
if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call
# Note that the dynamic expression could now be := (new in v1.9.7)
jsub = replace_dot_alias(jsub)
root = root_name(jsub)
}
if (root == ":=" || root == "let") { # let(...) as alias for :=(...) (#3795)
if (root == "let")
jsub[[1L]] = as.symbol(":=")
allow.cartesian=TRUE # (see #800)
if (!missing(i) && keyby)
stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i")
if (!missingnomatch) {
warningf("nomatch isn't relevant together with :=, ignoring nomatch")
nomatch=NULL
}
}
}
# setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead.
dupdiff = function(x, y) x[!x %chin% y]
isub = NULL
if (!missing(i)) {
if (is.null(env)) isub = substitute(i) else {
isub = eval(substitute(
substitute2(.i, env),
list(.i = substitute(i))
))
if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose && !is.null(env)) catf("Argument '%s' after substitute: %s\n", "i", paste(deparse(isub, width.cutoff=500L), collapse="\n"))
}
}
if (!missing(i)) {
xo = NULL
if (identical(isub, NA)) {
# only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA]
# replace NA in this case with NA_integer_ as that's almost surely what user intended to
# return a single row with NA in all columns. (DT[0] returns an empty table, with correct types.)
# Any expression (including length 1 vectors) that evaluates to a single NA logical will
# however be left as NA logical since that's important for consistency to return empty in that
# case; e.g. DT[Col==3] where DT is 1 row and Col contains NA.
# Replacing the NA symbol makes DT[NA] and DT[c(1,NA)] consistent and provides
# an easy way to achieve a single row of NA as users expect rather than requiring them
# to know and change to DT[NA_integer_].
isub=NA_integer_
}
isnull_inames = FALSE
# Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires
# the "eval" to be checked before `as.name("!")`. Therefore interchanged.
restore.N = remove.N = FALSE
old.N = get0(".N", envir=parent.frame(), inherits=FALSE)
if (!is.null(old.N)) {
locked.N = bindingIsLocked(".N", parent.frame())
if (locked.N) eval(call("unlockBinding", ".N", parent.frame())) # eval call to pass R CMD check NOTE until we find cleaner way
assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
restore.N = TRUE
# the comment below is invalid hereafter (due to fix for #1145)
# binding locked when .SD[.N] but that's ok as that's the .N we want anyway
# TO DO: change isub at C level s/.N/nrow(x); changing a symbol to a constant should be ok
} else {
assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
remove.N = TRUE
}
if (isub %iscall% "eval") { # TO DO: or ..()
isub = eval(.massagei(isub[[2L]]), parent.frame(), parent.frame())
if (is.expression(isub)) isub=isub[[1L]]
}
if (isub %iscall% "!") {
notjoin = TRUE
if (!missingnomatch) stopf("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch.");
nomatch = NULL
isub = isub[[2L]]
# #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!"
if (isub %iscall% "(" && !is.name(isub[[2L]]))
isub = isub[[2L]]
}
if (is.null(isub)) return( null.data.table() )
if (length(o <- .prepareFastSubset(isub = isub, x = x,
enclos = parent.frame(),
notjoin = notjoin, verbose = verbose))){
## redirect to the is.data.table(x) == TRUE branch.
## Additional flag to adapt things after bmerge:
optimizedSubset = TRUE
notjoin = o$notjoin
i = o$i
on = o$on
## the following two are ignored if i is not a data.table.
## Since we are converting i to data.table, it is important to set them properly.
nomatch = NULL
mult = "all"
}
else if (!is.name(isub)) {
ienv = new.env(parent=parent.frame())
if (getOption("datatable.optimize") >= 1L) assign("order", forder, ienv)
i = tryCatch(eval(.massagei(isub), x, ienv),
dt_invalid_let_error = function(e) stopf("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number."),
error = function(e) .checkTypos(e, names_x)
)
} else {
# isub is a single symbol name such as B in DT[B]
i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE)
if (inherits(i,"try-error") || is.function(i)) {
# must be "not found" since isub is a mere symbol
col = try(eval(isub, x), silent=TRUE) # is it a column name?
msg = if (inherits(col, "try-error")) gettextf(
"'%s' is not found in calling scope and it is not a column name either",
as.character(isub)
) else gettextf(
"'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized",
as.character(isub), typeof(col)
)
stopf("%s. When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.", msg)
}
}
if (restore.N) {
assign(".N", old.N, envir=parent.frame())
if (locked.N) lockBinding(".N", parent.frame())
}
if (remove.N) rm(list=".N", envir=parent.frame())
if (is.matrix(i)) {
if (is.numeric(i) && ncol(i)==1L) { # #826 - subset DT on single integer vector stored as matrix
i = as.integer(i)
} else {
stopf("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please report to data.table issue tracker if you'd like this, or add your comments to FR #657.")
}
}
if (is.logical(i)) {
if (notjoin) {
notjoin = FALSE
i = !i
}
}
if (is.null(i)) return( null.data.table() )
if (is.character(i) || is.factor(i)) {
isnull_inames = TRUE
i = data.table(V1=i) # for user convenience; e.g. DT["foo"] without needing DT[.("foo")]
} else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) { i = as.data.table(i[[1L]]) }
else if (identical(class(i),"data.frame")) { i = as.data.table(i) } # TO DO: avoid these as.data.table() and use a flag instead
else if (identical(class(i),"list")) {
isnull_inames = is.null(names(i))
i = as.data.table(i)
}
if (is.data.frame(i)) {
if (missing(on)) {
if (!haskey(x)) {
stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
}
} else if (identical(substitute(on), as.name(".NATURAL"))) {
naturaljoin = TRUE
}
if (naturaljoin) { # natural join #629
common_names = intersect(names_x, names(i))
len_common_names = length(common_names)
if (!len_common_names) stopf("Attempting to do natural join but no common columns in provided tables")
if (verbose) {
which_cols_msg = if (len_common_names == length(x)) {
catf("Joining but 'x' has no key, natural join using all 'x' columns\n")
} else {
catf("Joining but 'x' has no key, natural join using: %s\n", brackify(common_names))
}
}
on = common_names
}
if (!missing(on)) {
# on = .() is now possible, #1257
on_ops = .parse_on(substitute(on), isnull_inames)
on = on_ops[[1L]]
ops = on_ops[[2L]]
if (any(ops > 1L)) { ## fix for #4489; ops = c("==", "<=", "<", ">=", ">", "!=")
allow.cartesian = TRUE
}
# TODO: collect all '==' ops first to speeden up Cnestedid
rightcols = colnamesInt(x, names(on), check_dups=FALSE)
leftcols = colnamesInt(i, unname(on), check_dups=FALSE)
} else {
## missing on
rightcols = chmatch(key(x), names_x) # NAs here (i.e. invalid data.table) checked in bmerge()
leftcols = if (haskey(i))
chmatch(head(key(i), length(rightcols)), names(i))
else
seq_len(min(length(i),length(rightcols)))
rightcols = head(rightcols,length(leftcols))
ops = rep(1L, length(leftcols))
}
# Implementation for not-join along with by=.EACHI, #604
if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571
notjoin = FALSE
if (verbose) {last.started.at=proc.time();catf("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()}
orignames = copy(names(i))
i = setdiff_(x, i, rightcols, leftcols) # part of #547
if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()}
setnames(i, orignames[leftcols])
setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
}
i = .shallow(i, retain.key = TRUE)
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
xo = ans$xo ## to make it available for further use.
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
# 'setorder', as there's another 'setorder' in generating 'irows' below...
if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices)
allLen1 = ans$allLen1
f__ = ans$starts
len__ = ans$lens
allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time.
indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix
# When no match, len__ is 0 for nomatch=NULL and 1 for nomatch=NA, so len__ isn't .N
# If using secondary key of x, f__ will refer to xo
if (is.na(which)) {
w = if (notjoin) f__!=0L else is.na(f__)
return( if (length(xo)) fsort(xo[w], internal=TRUE) else which(w) )
}
if (mult=="all") {
# is by=.EACHI along with non-equi join?
nqbyjoin = byjoin && length(ans$indices) && !allGrp1
if (!byjoin || nqbyjoin) {
# Really, `anyDuplicated` in base is AWESOME!
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
if (allLen1) {
irows = f__
} else {
join.many = isTRUE(getOption("datatable.join.many")) # #914, default TRUE for backward compatibility
anyDups = !notjoin &&
(
# #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
(join.many && !allow.cartesian) ||
# special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
(!join.many && (length(f__) != 1L || len__ != nrow(x)))
) &&
anyDuplicated(f__, incomparables = c(0L, NA_integer_)) > 0L
limit = if (anyDups) { # #742. If 'i' has no duplicates, ignore
if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
if (allow.cartesian) internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
as.double(nrow(x)+nrow(i)) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
}
irows = vecseq(f__, len__, limit)
}
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate
# Fix for #1092 and #1074
# TODO: implement better version of "any"/"all"/"which" to avoid
# unnecessary construction of logical vectors
if (is.null(nomatch) && allLen1) irows = irows[irows != 0L]
} else {
if (length(xo) && missing(on))
internal_error("Cannot by=.EACHI when joining to an index, yet") # nocov
# since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too.
if (length(irows))
internal_error("irows has length in by=.EACHI") # nocov
}
if (nqbyjoin) {
irows = if (length(xo)) xo[irows] else irows
xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix
f__ = ans[[1L]]; len__ = ans[[2L]]
allLen1 = FALSE # TODO; should this always be FALSE?
irows = NULL # important to reset
if (any_na(as_list(xo))) xo = xo[!is.na(xo)]
}
} else {
if (!byjoin) { #1287 and #1271
irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic
if (is.null(nomatch)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows
}
# TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"?
# TODO: how about when nomatch=NULL, can we avoid allocating then as well?
}
if (length(xo) && length(irows)) {
irows = xo[irows] # TO DO: fsort here?
if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit.
if (verbose) {last.started.at=proc.time();catf("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()}
irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
}
}
if (optimizedSubset){
## special treatment for calls like DT[x == 3] that are transformed into DT[J(x=3), on = "x==x"]
if(!.Call(CisOrderedSubset, irows, nrow(x))){
## restore original order. This is a very expensive operation.
## benchmarks have shown that starting with 1e6 irows, a tweak can significantly reduce time
## (see #2366)
if (verbose) {last.started.at=proc.time();catf("Reordering %d rows after bmerge done in ... ", length(irows));flush.console()}
if(length(irows) < 1e6L){
irows = fsort(irows, internal=TRUE) ## internally, fsort on integer falls back to forderv
} else {
irows = as.integer(fsort(as.numeric(irows))) ## nocov; parallelized for numeric, but overhead of type conversion
}
if (verbose) {cat(timetaken(last.started.at), "\n");flush.console()} # notranslate
}
## make sure, all columns are taken from x and not from i.
## This is done by simply telling data.table to continue as if there was a simple subset
leftcols = integer(0L)
rightcols = integer(0L)
i = irows ## important to make i not a data.table because otherwise Gforce doesn't kick in
}
}
else {
if (!is.null(on)) {
stopf("logical error. i is not a data.table, but 'on' argument is provided.")
}