Support variable length whitespace delimiter in fread, as read.table does

I have hundreds of tab-delimited `txt` tables like this:

https://gist.github.com/renqian/d81cb48c686cf5376800

However, while `read.table` reads the content correctly with `sep="\t"`, `data.table::fread` produces a `data.table` with wrongly typed columns: some of them are coerced to `character` because the fields are padded with trailing space characters (`0x20`), even though they are really `numeric` or `integer` once that whitespace is trimmed.

``` rconsole
> df <- read.table("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T,sep = "\t")
> colnames(df)
 [1] "IFLxID"         "IFLxName"       "Ifcd"           "Tdate"          "Ttime"          "UpdateMillisec" "Cp"            
 [8] "Chg"            "ChgPct"         "Cq"             "Cm"             "Oc"             "S5"             "S4"            
[15] "S3"             "S2"             "S1"             "B1"             "B2"             "B3"             "B4"            
[22] "B5"             "Sv5"            "Sv4"            "Sv3"            "Sv2"            "Sv1"            "Bv1"           
[29] "Bv2"            "Bv3"            "Bv4"            "Bv5"            "BS"             "Bsratio"        "PreClosePrc"   
[36] "OpenPrc"        "Hp"             "Lp"             "ClosePrc"       "UpperLmtPrc"    "LowerLmtPrc"    "Tq"            
[43] "Tm"             "PreOpnIntrst"   "OpnIntrst"      "PreStlmtPrc"    "StlmtPrc"       "PreDelta"       "Delta"         
[50] "SettleGroupID"  "SettleID"      
> sapply(df,class)
        IFLxID       IFLxName           Ifcd          Tdate          Ttime UpdateMillisec             Cp            Chg 
      "factor"       "factor"       "factor"      "integer"      "integer"      "numeric"      "numeric"      "numeric" 
        ChgPct             Cq             Cm             Oc             S5             S4             S3             S2 
     "numeric"      "numeric"      "numeric"       "factor"      "numeric"      "numeric"      "numeric"      "numeric" 
            S1             B1             B2             B3             B4             B5            Sv5            Sv4 
     "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
           Sv3            Sv2            Sv1            Bv1            Bv2            Bv3            Bv4            Bv5 
     "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
            BS        Bsratio    PreClosePrc        OpenPrc             Hp             Lp       ClosePrc    UpperLmtPrc 
      "factor"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
   LowerLmtPrc             Tq             Tm   PreOpnIntrst      OpnIntrst    PreStlmtPrc       StlmtPrc       PreDelta 
     "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
         Delta  SettleGroupID       SettleID 
     "numeric"      "logical"      "numeric" 
> df$UpdateMillisec[1]
[1] 500
```

The result produced by `data.table::fread` is as follows:

``` rconsole
> dt <- data.table::fread("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T)
Read 100000 rows and 51 (of 51) columns from 0.041 GB file in 00:00:10
> colnames(dt)
 [1] "IFLxID"          "IFLxName"        "Ifcd  "          "Tdate   "        "Ttime "          "UpdateMillisec" 
 [7] "Cp     "         "Chg    "         "ChgPct"          "Cq "             "Cm           "   "Oc  "           
[13] "S5     "         "S4     "         "S3     "         "S2     "         "S1     "         "B1     "        
[19] "B2     "         "B3     "         "B4     "         "B5     "         "Sv5   "          "Sv4   "         
[25] "Sv3   "          "Sv2   "          "Sv1   "          "Bv1   "          "Bv2   "          "Bv3   "         
[31] "Bv4   "          "Bv5   "          "BS"              "Bsratio"         "PreClosePrc"     "OpenPrc"        
[37] "Hp     "         "Lp     "         "ClosePrc"        "UpperLmtPrc"     "LowerLmtPrc"     "Tq    "         
[43] "Tm             " "PreOpnIntrst"    "OpnIntrst"       "PreStlmtPrc"     "StlmtPrc"        "PreDelta"       
[49] "Delta"           "SettleGroupID"   "SettleID"       
> sapply(dt,class)
         IFLxID        IFLxName          Ifcd          Tdate             Ttime   UpdateMillisec         Cp      
    "character"     "character"     "character"       "integer"       "integer"     "character"       "numeric" 
        Chg              ChgPct             Cq    Cm                       Oc           S5              S4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        S3              S2              S1              B1              B2              B3              B4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        B5               Sv5             Sv4             Sv3             Sv2             Sv1             Bv1    
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
         Bv2             Bv3             Bv4             Bv5                 BS         Bsratio     PreClosePrc 
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        OpenPrc         Hp              Lp             ClosePrc     UpperLmtPrc     LowerLmtPrc          Tq     
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
Tm                 PreOpnIntrst       OpnIntrst     PreStlmtPrc        StlmtPrc        PreDelta           Delta 
    "character"     "character"     "character"     "character"     "character"     "character"       "numeric" 
  SettleGroupID        SettleID 
      "integer"     "character" 
> dt$UpdateMillisec[1]
[1] "500           "
```

Even when `sep="\t"` is explicitly specified, things remain the same:

``` rconsole
> dt <- data.table::fread("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T,sep = "\t")
Read 100000 rows and 51 (of 51) columns from 0.041 GB file in 00:00:09
> colnames(dt)
 [1] "IFLxID"          "IFLxName"        "Ifcd  "          "Tdate   "        "Ttime "          "UpdateMillisec" 
 [7] "Cp     "         "Chg    "         "ChgPct"          "Cq "             "Cm           "   "Oc  "           
[13] "S5     "         "S4     "         "S3     "         "S2     "         "S1     "         "B1     "        
[19] "B2     "         "B3     "         "B4     "         "B5     "         "Sv5   "          "Sv4   "         
[25] "Sv3   "          "Sv2   "          "Sv1   "          "Bv1   "          "Bv2   "          "Bv3   "         
[31] "Bv4   "          "Bv5   "          "BS"              "Bsratio"         "PreClosePrc"     "OpenPrc"        
[37] "Hp     "         "Lp     "         "ClosePrc"        "UpperLmtPrc"     "LowerLmtPrc"     "Tq    "         
[43] "Tm             " "PreOpnIntrst"    "OpnIntrst"       "PreStlmtPrc"     "StlmtPrc"        "PreDelta"       
[49] "Delta"           "SettleGroupID"   "SettleID"       
> sapply(dt,class)
         IFLxID        IFLxName          Ifcd          Tdate             Ttime   UpdateMillisec         Cp      
    "character"     "character"     "character"       "integer"       "integer"     "character"       "numeric" 
        Chg              ChgPct             Cq    Cm                       Oc           S5              S4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        S3              S2              S1              B1              B2              B3              B4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        B5               Sv5             Sv4             Sv3             Sv2             Sv1             Bv1    
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
         Bv2             Bv3             Bv4             Bv5                 BS         Bsratio     PreClosePrc 
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        OpenPrc         Hp              Lp             ClosePrc     UpperLmtPrc     LowerLmtPrc          Tq     
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
Tm                 PreOpnIntrst       OpnIntrst     PreStlmtPrc        StlmtPrc        PreDelta           Delta 
    "character"     "character"     "character"     "character"     "character"     "character"       "numeric" 
  SettleGroupID        SettleID 
      "integer"     "character" 
> dt$UpdateMillisec[1]
[1] "500           "
```

The whitespace characters are ordinary spaces, which show up as `20` in the raw hex bytes:

``` rconsole
> x <- dt$UpdateMillisec[1]
> x
[1] "500           "
> charToRaw(x)
 [1] 35 30 30 20 20 20 20 20 20 20 20 20 20 20
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Support variable length whitespace delimiter in fread, as read.table does #785

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Support variable length whitespace delimiter in fread, as read.table does #785

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions