Участник:Riabenko/R
Материал из MachineLearning.
Содержание |
Фильтрация data.table по нескольким полям
# Filter a data.table on several columns at once by joining it against a
# small lookup table that lists the wanted (origin, dest) combinations.
library(data.table)    # data.table(), as.data.table() and the `on =` join
library(nycflights13)  # supplies the `flights` data set

flights <- as.data.table(flights)

# One row per (origin, destination) pair to select.
subDT <- data.table(
  sub_orig = c("EWR", "LGA"),
  sub_dest = c("IAH", "ATL")
)

# In `on =`, names are columns of `flights` and values are columns of `subDT`.
# The join returns the rows of `flights` that match each row of `subDT`.
res <- flights[subDT, on = c(dest = "sub_dest", origin = "sub_orig")]
caret::train
Методы построения регрессии и допустимость пропусков в X (строки таблицы) и в Y (столбцы таблицы):
в X\в Y | Можно | Нельзя |
Можно | bayesglm, cubist, evtree, gam, gamboost, gamLoess, glm, kernelpls, lm, M5, M5Rules, nnet, pcr, pls, rlm, rpart, rpart2, simpls, treebag, widekernelpls | bstLs, bstTree, cforest, ctree, ctree2, gamboost, gamSpline, glmboost, rlm |
Нельзя | avNNet, blackboost, gam, gamLoess, gamSpline, glmStepAIC, kknn, lmStepAIC, partDSA, pcaNNet |
Способы взятия подмножеств
# Compare the speed of several ways to take a row subset of a data.frame.
library(microbenchmark)

x <- data.frame(x1 = sample(10, 1e6, TRUE), x2 = sample(10, 1e6, TRUE))

# NOTE(review): `!x1 > 4` parses as `!(x1 > 4)`, i.e. it selects the rows with
# x1 <= 4 -- the complement of what the other five expressions select -- so its
# timing is not directly comparable. It is kept as-is because the recorded
# output below was produced with exactly this expression.
# NOTE: `x[-which(cond), ]` returns zero rows when no row satisfies `cond`;
# that case cannot occur with this data but makes the idiom fragile in general.
microbenchmark(
  subset(x, x1 > 4),
  subset(x, !x1 > 4),
  x[x$x1 > 4, ],
  x[!x$x1 <= 4, ],
  x[which(x$x1 > 4), ],
  x[-which(x$x1 <= 4), ],
  times = 100
)
# Recorded output:
# Unit: milliseconds
#                    expr       min         lq        mean     median         uq        max neval
#       subset(x, x1 > 4) 69.359939 71.7853810 78.93602891 75.4477780 85.3696460  98.923690   100
#      subset(x, !x1 > 4) 59.964936 62.3175835 67.68606984 65.2725525 69.9231070  91.424414   100
#           x[x$x1 > 4, ] 46.198018 48.0494585 55.83072897 52.3508610 55.8897145 264.015716   100
#         x[!x$x1 <= 4, ] 47.672936 49.8780910 56.15222692 52.7657765 59.4456740  79.069311   100
#    x[which(x$x1 > 4), ] 38.071142 39.1720085 45.19482621 41.0301015 48.0811995  67.067353   100
#  x[-which(x$x1 <= 4), ] 54.677658 56.6674680 65.09798336 59.6876295 65.6659910 271.607745   100
Бенчмарк по фильтрации data.table
# Benchmark several ways of marking / collecting the rows of a large
# data.table `x` that satisfy any of the rules in `y`, where a rule is
# "x2 equals y$x2[i] and x1 >= y$x1[i]".
library(data.table)
library(microbenchmark)

x <- data.table(
  x1 = sample(100, 1e7, TRUE),
  x2 = factor(sample(1000, 1e7, TRUE)),
  keep = TRUE
)
y <- data.table(
  x1 = sample(100, 10, TRUE),
  x2 = factor(sample(1000, 10, TRUE), levels = levels(x$x2))
)
# Collapse repeated x2 values into one rule carrying the smallest threshold.
y <- y[, .(x1 = min(x1)), by = .(x2)]

# NOTE(review): expressions 1 and 2 assign into `x$keep`, so later evaluations
# (and expression 5, which reads `x$keep`) presumably see an already modified
# `x` -- confirm whether that is intended for the comparison.
microbenchmark(
  # 1. Sub-assignment through a filtered data.table, one rule at a time.
  for (i in seq_len(nrow(y))) {
    x[x2 == y$x2[i] & x1 >= y$x1[i]]$keep <- FALSE
  },
  # 2. Same, but only rows still flagged `keep` are considered.
  for (i in seq_len(nrow(y))) {
    x[keep & x2 == y$x2[i] & x1 >= y$x1[i]]$keep <- FALSE
  },
  # 3. Accumulate the indices of rows to omit with union().
  {
    omit <- c()
    for (i in seq_len(nrow(y))) {
      omit <- union(omit, which(x$x2 == y$x2[i] & x$x1 >= y$x1[i]))
    }
  },
  # 4. Accumulate the indices of rows to omit with unique(c(...)).
  {
    omit <- c()
    for (i in seq_len(nrow(y))) {
      omit <- unique(c(omit, which(x$x2 == y$x2[i] & x$x1 >= y$x1[i])))
    }
  },
  # 5. Start from all row indices and remove matches with setdiff().
  {
    keep <- 1:nrow(x)
    for (i in seq_len(nrow(y))) {
      keep <- setdiff(
        keep,
        which(x$keep & x$x2 == y$x2[i] & x$x1 >= y$x1[i])
      )
    }
  },
  # 6. Maintain a logical mask of the rows to keep.
  {
    keep <- rep(TRUE, nrow(x))
    for (i in seq_len(nrow(y))) {
      keep <- keep & !(x$x2 == y$x2[i] & x$x1 >= y$x1[i])
    }
  },
  times = 10
)
# Recorded output. The "Unit:" line and the `expr` column are absent from the
# original notes; the six rows presumably follow the order of the expressions
# above and are presumably in seconds -- TODO confirm.
#        min        lq      mean    median        uq       max neval
#   8.590386  9.233665 11.911363 10.822779 15.088464 16.971543    10
#   9.012157  9.562583 11.859151 11.422780 13.038409 17.203784    10
#   4.158011  4.515759  5.168060  4.891767  5.677421  7.269970    10
#   3.889579  4.410539  4.962094  4.592407  5.963611  6.857607    10
#  11.598566 13.211678 14.910529 14.406133 15.388422 20.403169    10
#   5.194291  5.238900  6.511597  6.302388  6.954968  8.843804    10
Странные кодировки файлов
iconvlist() — список всех доступных кодировок
Чтение файлов со странной кириллицей:
# Read a file stored in Windows-1251 (Cyrillic): the shell pipeline re-encodes
# it to UTF-8 with `iconv`, and fread() parses the piped output.
# The closing quote after UTF-8 was missing in the original command, which
# left the shell string unterminated.
fread(
  cmd = "cat 001_20150317060301_INFO01000201503170042.OUT | iconv -f 'Windows-1251' -t 'UTF-8'",
  sep = "\n",      # no column splitting: each input line stays a single field
  header = FALSE   # the first line is data, not column names
)