Chapter 15 Working with data.table
Last update: Thu Nov 19 14:24:08 2020 -0600 (ca4f8b4a0)
15.1 Load PyTorch libraries
15.2 Load dataset
15.3 Datasets without normalization
# Build the dataset from the image folder at `train_data_path`, applying
# torchvision's ToTensor() transform, then print its summary.
train_dataset <- torchvision$datasets$ImageFolder(
  root = train_data_path,
  transform = torchvision$transforms$ToTensor()
)
print(train_dataset)
#> Dataset ImageFolder
#> Number of datapoints: 60000
#> Root location: ./mnist_png_full/training/
#> StandardTransform
#> Transform: ToTensor()
15.4 Using data.table
library(data.table)
library(tictoc)
# Start the tictoc timer; the matching toc() call below reports the elapsed time.
tic()
# Method names to chain for each statistic, keyed by output column name.
# `numel` is a single method; every other statistic is followed by `item`.
fun_list <- c(
  list(numel = "numel"),
  lapply(
    c(
      sum = "sum", mean = "mean", std = "std",
      med = "median", max = "max", min = "min"
    ),
    function(method) c(method, "item")
  )
)
# Zero-based positions of the samples to summarise (0 through 599).
idx <- 0:599
# Fetch item `x` of `train_dataset` and return its element at position 0.
# NOTE(review): presumably the image tensor of an (image, label) pair -- confirm.
fun_get_tensor <- function(x) {
  dataset_item <- py_get_item(train_dataset, x)
  dataset_item[[0]]
}
# Apply a chain of methods to the tensor of every sample listed in `idx`.
#
# @param x       Unused. It is shadowed by the per-sample index below and is
#                kept only so existing calls such as stat_fun(1, y) still work.
#                The samples processed are always those in the global `idx`.
# @param str_fun Method chain without the final call parentheses, e.g.
#                "sum()$item"; "()" is appended before evaluation.
# @return One value per element of `idx`, simplified by sapply(): the result
#         itself when it is numeric, otherwise its as.character() form.
stat_fun <- function(x, str_fun) {
  # The expression text is identical for every sample, so parse it once.
  call_expr <- parse(text = paste0("fun_get_tensor(x)", "$", str_fun, "()"))
  sapply(idx, function(x) {
    # `x` is now the sample index that `call_expr` refers to. Evaluate exactly
    # once per sample: each evaluation fetches the sample again, and the
    # previous ifelse() form evaluated it twice.
    value <- eval(call_expr)
    picked <- if (is.numeric(value)) value else as.character(value)
    # ifelse() with a scalar test only ever returned the first element.
    picked[1L]
  })
}
# Assemble the summary table: a one-based row index plus one column per
# entry of `fun_list`. Intermediates live inside local() so that no extra
# global variables are created.
dt <- local({
  # Join each entry's method names with "()$", e.g. c("sum", "item") becomes
  # "sum()$item"; stat_fun() appends the final "()".
  method_chains <- vapply(fun_list, paste, character(1), collapse = "()$")
  # The first argument of stat_fun() is a placeholder; it iterates over `idx`.
  stat_columns <- lapply(method_chains, function(chain) stat_fun(1, chain))
  data.table(ridx = idx + 1, do.call(data.table, stat_columns))
})
Summary statistics:
#> ridx numel sum mean std med max min
#> 1: 1 2352 366 0.156 0.329 0 1 0
#> 2: 2 2352 284 0.121 0.297 0 1 0
#> 3: 3 2352 645 0.274 0.420 0 1 0
#> 4: 4 2352 410 0.174 0.355 0 1 0
#> 5: 5 2352 321 0.137 0.312 0 1 0
#> 6: 6 2352 654 0.278 0.421 0 1 0
Elapsed time by sample size:
# Stop the timer started by tic() and print the elapsed time.
toc()
# Elapsed times recorded for different sample sizes:
#    60 samples:    1.266 sec elapsed
#   600 samples:   11.798 sec elapsed
#  6000 samples:  119.256 sec elapsed
# 60000 samples: 1117.619 sec elapsed
#> 14.8 sec elapsed