I have hundreds of tab-delimited txt tables like this:
https://gist.github.com/renqian/d81cb48c686cf5376800
However, while read.table reads the content correctly with sep="\t", data.table::fread produces a data.table with wrongly typed columns: many of them are coerced to character because their values are padded with trailing white-space characters (0x20), even though they are truly numeric or integer once that white-space is trimmed.
> df <- read.table("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T,sep = "\t")
> colnames(df)
[1] "IFLxID" "IFLxName" "Ifcd" "Tdate" "Ttime" "UpdateMillisec" "Cp"
[8] "Chg" "ChgPct" "Cq" "Cm" "Oc" "S5" "S4"
[15] "S3" "S2" "S1" "B1" "B2" "B3" "B4"
[22] "B5" "Sv5" "Sv4" "Sv3" "Sv2" "Sv1" "Bv1"
[29] "Bv2" "Bv3" "Bv4" "Bv5" "BS" "Bsratio" "PreClosePrc"
[36] "OpenPrc" "Hp" "Lp" "ClosePrc" "UpperLmtPrc" "LowerLmtPrc" "Tq"
[43] "Tm" "PreOpnIntrst" "OpnIntrst" "PreStlmtPrc" "StlmtPrc" "PreDelta" "Delta"
[50] "SettleGroupID" "SettleID"
> sapply(df,class)
IFLxID IFLxName Ifcd Tdate Ttime UpdateMillisec Cp Chg
"factor" "factor" "factor" "integer" "integer" "numeric" "numeric" "numeric"
ChgPct Cq Cm Oc S5 S4 S3 S2
"numeric" "numeric" "numeric" "factor" "numeric" "numeric" "numeric" "numeric"
S1 B1 B2 B3 B4 B5 Sv5 Sv4
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
Sv3 Sv2 Sv1 Bv1 Bv2 Bv3 Bv4 Bv5
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
BS Bsratio PreClosePrc OpenPrc Hp Lp ClosePrc UpperLmtPrc
"factor" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
LowerLmtPrc Tq Tm PreOpnIntrst OpnIntrst PreStlmtPrc StlmtPrc PreDelta
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
Delta SettleGroupID SettleID
"numeric" "logical" "numeric"
> df$UpdateMillisec[1]
[1] 500
The results produced by data.table::fread are as follows:
> dt <- data.table::fread("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T)
Read 100000 rows and 51 (of 51) columns from 0.041 GB file in 00:00:10
> colnames(dt)
[1] "IFLxID" "IFLxName" "Ifcd " "Tdate " "Ttime " "UpdateMillisec"
[7] "Cp " "Chg " "ChgPct" "Cq " "Cm " "Oc "
[13] "S5 " "S4 " "S3 " "S2 " "S1 " "B1 "
[19] "B2 " "B3 " "B4 " "B5 " "Sv5 " "Sv4 "
[25] "Sv3 " "Sv2 " "Sv1 " "Bv1 " "Bv2 " "Bv3 "
[31] "Bv4 " "Bv5 " "BS" "Bsratio" "PreClosePrc" "OpenPrc"
[37] "Hp " "Lp " "ClosePrc" "UpperLmtPrc" "LowerLmtPrc" "Tq "
[43] "Tm " "PreOpnIntrst" "OpnIntrst" "PreStlmtPrc" "StlmtPrc" "PreDelta"
[49] "Delta" "SettleGroupID" "SettleID"
> sapply(dt,class)
IFLxID IFLxName Ifcd Tdate Ttime UpdateMillisec Cp
"character" "character" "character" "integer" "integer" "character" "numeric"
Chg ChgPct Cq Cm Oc S5 S4
"character" "character" "character" "character" "character" "character" "character"
S3 S2 S1 B1 B2 B3 B4
"character" "character" "character" "character" "character" "character" "character"
B5 Sv5 Sv4 Sv3 Sv2 Sv1 Bv1
"character" "character" "character" "character" "character" "character" "character"
Bv2 Bv3 Bv4 Bv5 BS Bsratio PreClosePrc
"character" "character" "character" "character" "character" "character" "character"
OpenPrc Hp Lp ClosePrc UpperLmtPrc LowerLmtPrc Tq
"character" "character" "character" "character" "character" "character" "character"
Tm PreOpnIntrst OpnIntrst PreStlmtPrc StlmtPrc PreDelta Delta
"character" "character" "character" "character" "character" "character" "numeric"
SettleGroupID SettleID
"integer" "character"
> dt$UpdateMillisec[1]
[1] "500 "
Even when sep="\t" is explicitly specified, things remain the same:
> dt <- data.table::fread("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T,sep = "\t")
Read 100000 rows and 51 (of 51) columns from 0.041 GB file in 00:00:09
> colnames(dt)
[1] "IFLxID" "IFLxName" "Ifcd " "Tdate " "Ttime " "UpdateMillisec"
[7] "Cp " "Chg " "ChgPct" "Cq " "Cm " "Oc "
[13] "S5 " "S4 " "S3 " "S2 " "S1 " "B1 "
[19] "B2 " "B3 " "B4 " "B5 " "Sv5 " "Sv4 "
[25] "Sv3 " "Sv2 " "Sv1 " "Bv1 " "Bv2 " "Bv3 "
[31] "Bv4 " "Bv5 " "BS" "Bsratio" "PreClosePrc" "OpenPrc"
[37] "Hp " "Lp " "ClosePrc" "UpperLmtPrc" "LowerLmtPrc" "Tq "
[43] "Tm " "PreOpnIntrst" "OpnIntrst" "PreStlmtPrc" "StlmtPrc" "PreDelta"
[49] "Delta" "SettleGroupID" "SettleID"
> sapply(dt,class)
IFLxID IFLxName Ifcd Tdate Ttime UpdateMillisec Cp
"character" "character" "character" "integer" "integer" "character" "numeric"
Chg ChgPct Cq Cm Oc S5 S4
"character" "character" "character" "character" "character" "character" "character"
S3 S2 S1 B1 B2 B3 B4
"character" "character" "character" "character" "character" "character" "character"
B5 Sv5 Sv4 Sv3 Sv2 Sv1 Bv1
"character" "character" "character" "character" "character" "character" "character"
Bv2 Bv3 Bv4 Bv5 BS Bsratio PreClosePrc
"character" "character" "character" "character" "character" "character" "character"
OpenPrc Hp Lp ClosePrc UpperLmtPrc LowerLmtPrc Tq
"character" "character" "character" "character" "character" "character" "character"
Tm PreOpnIntrst OpnIntrst PreStlmtPrc StlmtPrc PreDelta Delta
"character" "character" "character" "character" "character" "character" "numeric"
SettleGroupID SettleID
"integer" "character"
> dt$UpdateMillisec[1]
[1] "500 "
The trailing white-space characters are plain spaces (byte 0x20), as the raw representation shows:
> x <- dt$UpdateMillisec[1]
> x
[1] "500 "
> charToRaw(x)
[1] 35 30 30 20 20 20 20 20 20 20 20 20 20 20
I have hundreds of `txt` delim tables like this: https://gist.github.com/renqian/d81cb48c686cf5376800
However, while `read.table` properly handles reading the content with `sep="\t"`, `data.table::fread` produces a `data.table` with wrongly-typed columns, some of them coerced to `character` due to white-space characters `0x20` while they truly are `numeric` or `integer` when such white-spaces are properly trimmed.
The results produced by `data.table::fread` are as follows:
Even when `sep="\t"` is explicitly specified, things remain the same:
The white-spaces are represented by `20` in hex raw: