Data.table: [Bug] fwrite in large environments (tables up to 100M rows)

Created on 21 Dec 2016 · 9 Comments · Source: Rdatatable/data.table

I have been using fwrite in a large AWS environment detailed below:

Instance: m4.4xlarge
RAM: 64gb
Threads: 16
Cost: $0.862 hourly

I notice that when writing out numeric values, the written output is incorrect. When the same data is coerced to the integer class, the output seems to be correct. This does not seem to be a problem with setDTthreads(1). However, errors start to creep in when setDTthreads(2), albeit fewer errors than setDTthreads(16). I have detailed the two scenarios below.

Writing numeric output with fwrite

# Set assumptions
library(data.table) # data.table 1.10.0 
# Also tested for 1.10.1 IN DEVELOPMENT built 
# 2016-12-09 03:17:34 UTC
N=1e8
set.seed(1)

# Make test data
check <- data.table(ID=1:N, Time=0:49)
check[, paste0("V",1:5) := lapply(rep(N, 5), rpois, .N)]
sapply(check, class)

# Modify type of test data
check[, names(check) := lapply(.SD, function(x) x+0.1)]
sapply(check, class)
check[, names(check) := lapply(.SD, function(x) x-0.1)]

# Write data
fwrite(check, "check.csv")
rm(check)
gc()

# Read written data
checkWrite <- fread("check.csv")
sapply(checkWrite, class)

# Tests
checkWrite[, .N, by=.(Time)]

#           Time       N
#    1:        0 2000000
#    2:        1 1986674
#    3:        2 1985449
#    4:        3 1984278
#    5:        4 1984690
#   ---                 
#44725: 99983607       1
#44726: 99980978       1
#44727: 99646303       1
#44728: 99649539       1
#44729: 99653656       1

checkWrite[V1 != round(V1,0)]

#             ID Time           V1        V2        V3        V4        V5
#    1:      449   48  97664.38965  99985524  99998767 100009754 100010494
#    2:     2509    8  97643.65527  99996565  99999968  99992995  99974462
#    3:     2751    0     23.84322  99996050 100007735 100008407 100005063
#    4:     6701    0     23.84024  99998435  99993597 100000237  99999963
#    5:     8101    0 781300.35938  99991390  99998825 100005920  99994596
#   ---                                                                   
#22599: 99968305    4     47.67970  99988021 100012376  99996538 100012572
#22600: 99970551    0     23.84325  99996674  99995685 100014894 100013937
#22601: 99973932   31     11.92025 100012133  99990838  99993337  99984441
#22602: 99975660    9     47.68194  99985576  99992718  99982681  99991041
#22603: 99977051    0     47.68710  99998134 100004288 100007002  99990199

Writing integer output with fwrite

# Set assumptions
library(data.table) # data.table 1.10.0
N=1e8
set.seed(1)

# Make test data
check <- data.table(ID=1:N, Time=0:49)
check[, paste0("V",1:5) := lapply(rep(N, 5), rpois, .N)]
sapply(check, class)

# Modify type of test data
check[, names(check) := lapply(.SD, function(x) x+0.1)]
sapply(check, class)
check[, names(check) := lapply(.SD, function(x) x-0.1)]

# Convert classes back to integer
check[, names(check) := lapply(.SD, as.integer)]
sapply(check, class)

# Write data
fwrite(check, "check.csv")
rm(check)
gc()

# Read written data
checkWrite <- fread("check.csv")
sapply(checkWrite, class)

# Tests
checkWrite[, .N, by=.(Time)]

#    Time       N
# 1:    0 2000000
# 2:    1 2000000
# 3:    2 2000000
# 4:    3 4000000
# 5:    5 2000000
#   ---     
# 46:   45 2000000
# 47:   46 2000000
# 48:   47 2000000
# 49:   48 2000000
# 50:   49 2000000

checkWrite[V1 != round(V1,0)]

# Empty data.table (0 rows) of 7 cols: ID,Time,V1,V2,V3,V4...
fwrite

Most helpful comment

I have the same problem, reproducible under conditions where multiple threads are used and for numeric values only (integers seem to work fine) as @mgahan wrote. It interestingly appears to be isolated to instances where there's > 1 column in the data table:

library(data.table) # data.table 1.10.0
set.seed(42)
n <- 250000 # number of rows
setDTthreads(4)

# With single variable alone ------------------------------

dt <- data.table(as.numeric(sample(0:1, n, replace = T)))
names(dt) <- "zero_one"

fwrite(dt, "fwrite.csv")
dt_fwrite <- fread("fwrite.csv")

write.csv(dt, "writecsv.csv", row.names = F)
dt_writecsv <- fread("writecsv.csv")

> unique(dt$zero_one)
[1] 1 0
> unique(dt_fwrite$zero_one)
[1] 1 0
> unique(dt_writecsv$zero_one)
[1] 1 0

# With additional numeric variable -----------------------

dt <- data.table(runif(n))
dt[, zero_one := as.numeric(sample(0:1, n, replace = T))]

fwrite(dt, "fwrite.csv")
dt_fwrite <- fread("fwrite.csv")

write.csv(dt, "writecsv.csv", row.names = F)
dt_writecsv <- fread("writecsv.csv")

> unique(dt$zero_one)
[1] 0 1
> unique(dt_fwrite$zero_one)
[1] 0.000000000 1.000000000 0.062500000 0.001953125 0.250000000 0.125000000
[7] 0.500000000
> unique(dt_writecsv$zero_one)
[1] 0 1

All 9 comments

I have the same problem, reproducible under conditions where multiple threads are used and for numeric values only (integers seem to work fine) as @mgahan wrote. It interestingly appears to be isolated to instances where there's > 1 column in the data table:

library(data.table) # data.table 1.10.0
set.seed(42)
n <- 250000 # number of rows
setDTthreads(4)

# With single variable alone ------------------------------

dt <- data.table(as.numeric(sample(0:1, n, replace = T)))
names(dt) <- "zero_one"

fwrite(dt, "fwrite.csv")
dt_fwrite <- fread("fwrite.csv")

write.csv(dt, "writecsv.csv", row.names = F)
dt_writecsv <- fread("writecsv.csv")

> unique(dt$zero_one)
[1] 1 0
> unique(dt_fwrite$zero_one)
[1] 1 0
> unique(dt_writecsv$zero_one)
[1] 1 0

# With additional numeric variable -----------------------

dt <- data.table(runif(n))
dt[, zero_one := as.numeric(sample(0:1, n, replace = T))]

fwrite(dt, "fwrite.csv")
dt_fwrite <- fread("fwrite.csv")

write.csv(dt, "writecsv.csv", row.names = F)
dt_writecsv <- fread("writecsv.csv")

> unique(dt$zero_one)
[1] 0 1
> unique(dt_fwrite$zero_one)
[1] 0.000000000 1.000000000 0.062500000 0.001953125 0.250000000 0.125000000
[7] 0.500000000
> unique(dt_writecsv$zero_one)
[1] 0 1

@mgahan @jmosser Great find - thanks for reporting. Would you mind testing it's ok now please.

@mattdowle I just tested it out and my tests check out. Thanks for all the hard work!

@mgahan Excellent - thanks!

@mattdowle @mgahan which version of the package has this bug fix?

@mattdowle I am not sure that this issue is resolved. Seeing it in DT v 1.10.4:

Browse[1]> nrow(dt[population<1])
[1] 0
Browse[1]> nrow(newdt[population<1])
[1] 53

Browse[1]> fwrite(dt, file.path(worker.dir, paste0(country, '.csv')))
Written 36.9% of 8140 rows in 3 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 5 secs. Written 74.7% of 8140 rows in 4 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 1 secs.

Browse[1]> newdt <- fread(file.path(worker.dir, paste0(country, '.csv')))
Read 8140 rows and 3022 (of 3022) columns from 0.401 GB file in 00:00:08

Browse[1]> nrow(newdt[population<1])
[1] 38

Browse[1]> fwrite(dt, file.path(worker.dir, paste0(country, '.csv')))
Written 11.4% of 8140 rows in 2 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 15 secs.Written 49.2% of 8140 rows in 3 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 3 secs.

Browse[1]> newdt <- fread(file.path(worker.dir, paste0(country, '.csv')))
Read 8140 rows and 3022 (of 3022) columns from 0.401 GB file in 00:00:08

Browse[1]> nrow(newdt[population<1])
[1] 32

Browse[1]> fwrite(dt, file.path(worker.dir, paste0(country, '.csv')))
Written 0.9% of 8140 rows in 2 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 209 secs.Written 38.8% of 8140 rows in 3 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 4 secs.

Browse[1]> newdt <- fread(file.path(worker.dir, paste0(country, '.csv')))
Read 8140 rows and 3022 (of 3022) columns from 0.401 GB file in 00:00:08

Browse[1]> nrow(newdt[population<1])
[1] 35

Browse[1]> fwrite(dt, file.path(worker.dir, paste0(country, '.csv')))
Written 0.9% of 8140 rows in 2 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 209 secs.Written 38.8% of 8140 rows in 3 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 4 secs.

Browse[1]> newdt <- fread(file.path(worker.dir, paste0(country, '.csv')))
Read 8140 rows and 3022 (of 3022) columns from 0.401 GB file in 00:00:08

Browse[1]> nrow(newdt[population<1])
[1] 26

Browse[1]> sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS release 6.8 (Final)

locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base

other attached packages:
[1] jsonlite_1.1 RMySQL_0.10.9 DBI_0.5-1 plyr_1.8.4
[5] dplyr_0.5.0 readxl_1.0.0 stringr_1.1.0 magrittr_1.5
[9] lme4_1.1-12 Matrix_1.2-7.1 ggplot2_2.2.0 gridExtra_2.2.1
[13] data.table_1.10.4

loaded via a namespace (and not attached):
[1] Rcpp_0.12.8 splines_3.3.2 MASS_7.3-45 munsell_0.4.3
[5] colorspace_1.3-1 lattice_0.20-34 R6_2.2.0 minqa_1.2.4
[9] tools_3.3.2 grid_3.3.2 gtable_0.2.0 nlme_3.1-128
[13] pacman_0.4.1 lazyeval_0.2.0 assertthat_0.1 tibble_1.2
[17] nloptr_1.0.4 stringi_1.1.2 cellranger_1.1.0 scales_0.4.1
Browse[1]>

Resolution is not on CRAN. Install the current development version (1.10.5)

@MichaelChirico I don't think so. Just installed dev version, same behavior:

Browse[1]> newdt <- fread(file.path(worker.dir, paste0(country, '.csv')))
Read 8140 rows and 3022 (of 3022) columns from 0.401 GB file in 00:00:08

Browse[1]> fwrite(dt, file.path(worker.dir, paste0(country, '.csv')))
Written 0.9% of 8140 rows in 2 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 209 secs.Written 76.6% of 8140 rows in 3 secs using 40 threads. anyBufferGrown=no; maxBuffUsed=48%. Finished in 0 secs.

Browse[1]> newdt <- fread(file.path(worker.dir, paste0(country, '.csv')))

Browse[1]> nrow(dt[population<1])
[1] 0

Browse[1]> nrow(newdt[population<1])
[1] 26

Browse[1]> sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS release 6.8 (Final)

locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base

other attached packages:
[1] jsonlite_1.1 RMySQL_0.10.9 DBI_0.5-1 plyr_1.8.4
[5] dplyr_0.5.0 readxl_1.0.0 stringr_1.1.0 magrittr_1.5
[9] lme4_1.1-12 Matrix_1.2-7.1 ggplot2_2.2.0 gridExtra_2.2.1
[13] data.table_1.10.5

loaded via a namespace (and not attached):
[1] Rcpp_0.12.8 splines_3.3.2 MASS_7.3-45 munsell_0.4.3
[5] colorspace_1.3-1 lattice_0.20-34 R6_2.2.0 minqa_1.2.4
[9] tools_3.3.2 grid_3.3.2 gtable_0.2.0 nlme_3.1-128
[13] pacman_0.4.1 lazyeval_0.2.0 assertthat_0.1 tibble_1.2
[17] nloptr_1.0.4 stringi_1.1.2 cellranger_1.1.0 scales_0.4.1

Potentially could be specific to our cluster environment but others at my institute are seeing this same issue. Seems to be related to columns being accidentally reordered inside a row.

I would recommend that people be very cautious using fwrite in production code at this stage, this bug seems to be pervasive and is really difficult to track down in large outputs.

Was this page helpful?
0 / 5 - 0 ratings