Large Scale Machine Learning Projects with R Suite


> rsuite install
Detecting repositories ...
Will use repositories:
    CRAN.CRAN = https://mran.microsoft.com/snapshot/2017-10-15
    CRAN.CRANextra = http://www.stats.ox.ac.uk/pub/RWin
    Other = http://wlog-rsuite.s3.amazonaws.com
Installing RSuite(v0.17x) package ...
installing the source package 'RSuite'
All done.

> rsuite proj start -n spmf

Commands:
  update   Checks if the newest version of RSuite CLI is installed. If not, the installer for the newest version is downloaded and installation is initiated.
  install  Install RSuite with all the dependencies.
  proj     Use it to manage the project, its dependencies, and build project packages.
  repo     Use to manage repositories, e.g. upload packages.
  pkgzip   Use to create PKGZIP packages to fill up a remote repository.
  version  Show RSuite CLI version.
  help     Show this message and exit.

Call 'rsuite [command] help' to get information on acceptable [args].
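A typical end-to-end session chains these commands; a sketch, using the project and package names from this talk ('rsuite [command] help' is the authoritative reference for the exact options):

> rsuite proj start -n spmf        (create the project skeleton)
> rsuite proj pkgadd -n predmodel  (add a package to the project)
> rsuite proj depsinst             (install dependencies into the project libs)
> rsuite proj build                (build the project packages)
> rsuite proj zip                  (produce a deployment zip, e.g. spmf_0.1_001.zip)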

logs/.gitignore

PARAMETERS


LogLevel: INFO
N_days: 365
solver_max_iterations: 10
solver_opt_horizon: 8
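These 'Key: value' entries are plain DCF, so a master script can read them back directly; a minimal sketch, assuming the parameters sit in a config.txt next to the R/ directory (the accessor RSuite actually generates may differ):

cfg <- read.dcf(file.path(script_path, "../config.txt"))  # character matrix, one column per key
log_level <- cfg[1, "LogLevel"]                           # "INFO"
n_days <- as.integer(cfg[1, "N_days"])                    # 365
max_iter <- as.integer(cfg[1, "solver_max_iterations"])   # 10
opt_horizon <- as.integer(cfg[1, "solver_opt_horizon"])   # 8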


○ main
○ if __name__ == "__main__":

predmodel

● ==
● >=
● <=
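These are the version constraints accepted in dependency declarations; in a package's DESCRIPTION the same operators appear as, for example (version numbers here are illustrative):

Imports:
    data.table (>= 1.10.0),
    h2o (== 3.14.0.7),
    logging (<= 0.7-103)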

master.R

spmf/libs

packages_import.R

master.R
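A master script is the R counterpart of a Python if __name__ == "__main__": entry point: it resolves its own location, attaches the project libraries from spmf/libs, and then runs the workflow. A hypothetical sketch of the bootstrap header (the real one is generated by RSuite; set_env.R and the args object it provides are assumptions based on that template):

# resolve the directory the script is run from, so relative paths work
script_path <- (function() {
  cargs <- commandArgs(trailingOnly = FALSE)
  fidx <- grep("--file=", cargs)
  if (length(fidx) == 0) return(".")
  dirname(normalizePath(sub("--file=", "", cargs[fidx][1])))
})()

# point .libPaths() at spmf/libs and set up logging and the args parser
source(file.path(script_path, "set_env.R"), chdir = TRUE)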

import_training.R (I)

● import/<session_id>/
● work/<session_id>/

library(predmodel)

import_path <- file.path(script_path, "../import")
work_path <- file.path(script_path, "../work")

# required
session_id <- args$get(name = "session_id", default = "201711122000", required = FALSE)

loginfo("--> Session id:%s", session_id)

session_work <- file.path(work_path, session_id)

if (!dir.exists(session_work)) {
  dir.create(session_work)
}

import_training_data(file.path(import_path, session_id), session_work)

import_training.R (II)

devtools

import_training_data

#' @export
import_training_data <- function(import_path, work_path) {
  pkg_loginfo("Importing from %s into %s", import_path, work_path)

  n <- 10000
  dt <- data.table(feature1 = rnorm(n), feature2 = rnorm(n))
  m <- round(n * 0.3)
  dt[, resp := c(rep(1, m), rep(0, n - m))]
  fwrite(x = dt, file = file.path(work_path, "training.csv"), sep = ";")
}
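After a run the generated file can be sanity-checked; a quick sketch using the session id from the slides:

library(data.table)
dt <- fread("work/201711122000/training.csv", sep = ";")
stopifnot(nrow(dt) == 10000)                 # n rows written
stopifnot(abs(mean(dt$resp) - 0.3) < 1e-12)  # ~30% positive responses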

estimate_model.R (I)

library(h2o)
library(magrittr)
library(predmodel)

work_path <- file.path(script_path, "../work")

# required
session_id <- args$get(name = "session_id", required = FALSE, default = "201710111655")
loginfo("--> Session id:%s", session_id)
session_work <- file.path(work_path, session_id)

h2o.init(max_mem_size = "4g", nthreads = 2)

logdebug("---> H2O started")

train_file <- file.path(session_work, "training.csv")

stopifnot(file.exists(train_file))

train_file %>%
  transform_training() %>%
  estimate_model(session_id) %>%
  save_model(session_work)

transform_training

#' @export
transform_training <- function(train_file) {
  dt <- h2o.importFile(path = train_file, destination_frame = "train_dt",
                       parse = TRUE, header = TRUE, sep = ";")
  dt$resp <- as.factor(dt$resp)
  dt <- h2o.assign(data = dt, key = "train_dt")
  return(dt)
}

estimate_model

#' @export
estimate_model <- function(dt, session_id) {
  model <- h2o.gbm(x = colnames(dt), y = "resp", training_frame = dt,
                   model_id = sprintf("gbm_%s", session_id),
                   ntrees = 10, learn_rate = 0.1)
  model  # return explicitly so the result pipes into save_model()
}

save_model

#' @export
save_model <- function(model, session_work) {
  h2o.saveModel(model, path = session_work, force = TRUE)
}
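h2o.saveModel() names the file on disk after the model_id, so the model lands at a predictable location; schematically, this is the contract score_model.R relies on later:

# estimate_model.R leaves the model at:
#   work/<train_session_id>/gbm_<train_session_id>
# so the scoring script can reconstruct the path and reload it:
model_file <- file.path(train_session_work, sprintf("gbm_%s", train_session_id))
model <- h2o.loadModel(model_file)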

import_test.R (I)

● import/<session_id>/
● work/<session_id>/

library(predmodel)

import_path <- file.path(script_path, "../import")
work_path <- file.path(script_path, "../work")

# required
session_id <- args$get(name = "session_id", default = "201711122000", required = FALSE)

loginfo("--> Session id:%s", session_id)

session_work <- file.path(work_path, session_id)

if (!dir.exists(session_work)) {
  dir.create(session_work)
}

import_test_data(file.path(import_path, session_id), session_work)

import_test_data

#' @export
import_test_data <- function(import_path, work_path) {
  pkg_loginfo("Importing from %s into %s", import_path, work_path)

  n <- 1000
  dt <- data.table(feature1 = rnorm(n), feature2 = rnorm(n))
  fwrite(x = dt, file = file.path(work_path, "test.csv"), sep = ";")
}

score_model.R (I)

● work/<score_session_id>
● work/<train_session_id>
● export/<score_session_id>

score_model.R (II)

library(h2o)
library(magrittr)
library(predmodel)

work_path <- file.path(script_path, "../work")
export_path <- file.path(script_path, "../export")

# required
train_session_id <- args$get(name = "train_session_id", required = FALSE, default = "201710111655")
score_session_id <- args$get(name = "score_session_id", required = FALSE, default = "201710111655")

loginfo("--> train session id:%s", train_session_id)loginfo("--> score session id:%s", score_session_id)

score_session_export <- export_path
train_session_work <- file.path(work_path, train_session_id)
score_session_work <- file.path(work_path, score_session_id)

h2o.init(max_mem_size = "4g", nthreads = 2)

logdebug("---> H2O started")

test_file <- file.path(score_session_work, "test.csv")
model_file <- file.path(train_session_work, sprintf("gbm_%s", train_session_id))

stopifnot(file.exists(test_file))
stopifnot(file.exists(model_file))

test_dt <- test_file %>% transform_test()

score_model(test_dt = test_dt, model_path = model_file) %>%
  export_score(export_path = export_path, score_session_id = score_session_id)

transform_test

#' @export
transform_test <- function(test_file) {
  h2o.importFile(path = test_file, destination_frame = "test_dt",
                 parse = TRUE, header = TRUE, sep = ";")
}

score_model

#' @export
score_model <- function(test_dt, model_path) {
  model <- h2o.loadModel(model_path)
  pred_dt <- h2o.predict(model, test_dt)
  pred_dt
}

export_score

#' @export
export_score <- function(score_dt, score_session_id, export_path) {
  score_dt <- as.data.table(score_dt)
  score_dt[, score_session_id := score_session_id]
  fwrite(x = score_dt, file = file.path(export_path, "score.csv"),
         sep = ";", append = TRUE)
}
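Because of append = TRUE, successive scoring runs accumulate in a single export/score.csv, each block of rows tagged with its score_session_id; a sketch of reading it back (assuming the header row from the first write is present):

library(data.table)
scores <- fread("export/score.csv", sep = ";")
scores[, .N, by = score_session_id]  # row count per scoring run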

Production deployment: the built spmf_0.1_001.zip is unpacked into Production/spmf, which contains the import, export, and work directories.

Production/spmf/R

a. Rscript import_training.R
b. Rscript estimate_model.R
c. Rscript import_test.R
d. Rscript score_model.R
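With explicit session ids the four steps chain into the full batch run; the flag syntax below is an assumption about the generated args parser (the ids match the log output shown later):

Rscript import_training.R --session_id=201711122000
Rscript estimate_model.R --session_id=201711122000
Rscript import_test.R --session_id=201711131000
Rscript score_model.R --train_session_id=201711122000 --score_session_id=201711131000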

Production/spmf/export

print

loginfo("Phase 1 passed")

logdebug("Iter %d done", i)

logtrace("Iter %d done", i)

logwarning("Are you sure?")

logerror("I failed :(")

Packages

pkg_loginfo("Phase 1 passed")

pkg_logdebug("Iter %d done", i)

pkg_logtrace("Iter %d done", i)

pkg_logwarning("Are you sure?")

pkg_logerror("I failed :(")

2017-11-13 13:47:03 INFO::--> Session id:201711122000

2017-11-13 13:47:03 INFO:predmodel:Importing from C:/Workplace/Sandbox/Production/spmf/R/../import/201711122000 into C:/Workplace/Sandbox/Production/spmf/R/../work/201711122000

2017-11-13 13:47:14 INFO::--> Session id:201711122000

2017-11-13 13:47:51 INFO::--> Session id:201711131000

2017-11-13 13:47:51 INFO:predmodel:Importing from C:/Workplace/Sandbox/Production/spmf/R/../import/201711131000 into C:/Workplace/Sandbox/Production/spmf/R/../work/201711131000

2017-11-13 13:47:57 INFO::--> train session id:201711122000

2017-11-13 13:47:57 INFO::--> score session id:201711131000

LogLevel: INFO

LogLevel: DEBUG

LogLevel: TRACE
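LogLevel filters the calls above by severity: INFO shows loginfo and anything more severe, DEBUG adds logdebug, TRACE shows everything. A minimal sketch with the underlying logging package (assuming RSuite wires the config's LogLevel into it):

library(logging)
basicConfig()
setLevel("DEBUG")             # as if the config said LogLevel: DEBUG
loginfo("Phase 1 passed")     # printed
logdebug("Iter %d done", 1)   # printed at DEBUG, suppressed at INFO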

import_training.R

tests/test_spmf.R

library(predmodel)
library(testthat)

context("Testing context")

test_that(desc = "Test", code = {
  expect_true(5 > 3)
  expect_true(pi < 3)
})
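These tests run with testthat straight from the project root; a minimal sketch (RSuite may also hook tests into its own build step). Note that the second expectation, pi < 3, is false, presumably included to demonstrate what a failing test report looks like:

library(testthat)
test_dir("tests")  # runs tests/test_spmf.R; expect_true(pi < 3) fails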