## The following is a script file containing all R code of all sections in this chapter.
## Connecting to a MySQL database through DBI and fetching a whole table
library(DBI)
## The DBMS-specific code starts here
library(RMySQL)
drv <- dbDriver("MySQL")  # Loading the MySQL driver
con <- dbConnect(drv,dbname="transDB",  # connecting to the DBMS
                 username="myuser",password="mypasswd",
                 host="localhost")
## The DBMS-specific code ends here
## getting the results of an SQL query as a data frame
data <- dbGetQuery(con,"SELECT * FROM clients") 
## closing up stuff
dbDisconnect(con)  
dbUnloadDriver(drv)
library(DBI)   # (fixed: this call was fused onto the previous line)
## Processing a large query result in chunks of 50 records at a time
library(RMySQL)
drv <- dbDriver("MySQL")  # Loading the MySQL driver
con <- dbConnect(drv,dbname="transDB",  # connecting to the DBMS
                 username="myuser",password="mypasswd",
                 host="localhost")
res  <- dbSendQuery(con,"SELECT * FROM transactions") 
while (!dbHasCompleted(res)) {
    # get the next 50 records on a data frame
    someData <- fetch(res, n = 50)  
    # call some function that handles the current chunk
    # NOTE(review): myProcessingFunction() is assumed to be defined elsewhere
    myProcessingFunction(someData)  
}
dbClearResult(res) # clear the results set
dbDisconnect(con)  # closing up stuff
dbUnloadDriver(drv)
library(readr)   # (fixed: this call was fused onto the previous line)
## Reading space-delimited files with readr and reshaping them with tidyr
std <- read_delim("stud.txt", delim=" ", 
                  skip=1, col_names=c("StudentName","Math","English"))
std
std2 <- read_delim("stud2.txt", delim=" ",   # (fixed: "std" print and this assignment were fused)
                   skip=1, col_names=c("StudentName","Math","English","Degree_Year"))
std2
library(tidyr)  # (fixed: gather() and separate() below come from tidyr, which was never loaded)
std2L <- gather(std2, Subject, Grade, Math:English)      # wide -> long format
std2L <- separate(std2L, Degree_Year, c("Degree","Year"))  # split "Degree_Year" column in two
std2L
library(lubridate)   # (fixed: this call was fused onto the previous line)
## Parsing dates in assorted formats with lubridate
ymd("20151021")
ymd("2015/11/30")
myd("11.2012.3")
dmy_hms("2/12/2013 14:05:01")
mdy("120112")
## A vector of dates in inconsistent formats; unparseable entries become NA
dates <- c(20120521, "2010-12-12", "2007/01/5", "2015-2-04",   # (fixed: was fused onto the previous line)
           "Measured on 2014-12-6", "2013-7+ 25")
dates <- ymd(dates)
dates
data.frame(Dates=dates,WeekDay=wday(dates),nWeekDay=wday(dates,label=TRUE),   # (fixed: was fused)
           Year=year(dates),Month=month(dates,label=TRUE))
library(dplyr)   # (fixed: this call was fused onto the previous line)
## Downloading the audiology data set straight from the UCI repository
library(stringr)
library(readr)
uci.repo <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset <- "audiology/audiology.standardized"
dataF <- str_c(uci.repo,dataset,".data")
namesF <- str_c(uci.repo,dataset,".names")
## Reading the data file ("?" codes missing values)
data <- read_csv(url(dataF), col_names=FALSE, na="?")
data
dim(data)
## Now reading the names file
text <- read_lines(url(namesF))
text[1:3]
length(text)
text[67:70]
library(dplyr)   # (fixed: this call was fused onto the previous line)
## Standardizing (z-scores) and range-normalizing the iris measurements
data(iris)
iris.stand <- cbind(scale(select(iris,-Species)),select(iris,Species))
summary(iris.stand)
## Min-max normalization to the [0,1] range
mxs <- apply(select(iris,-Species), 2, max, na.rm=TRUE)   # (fixed: was fused onto the previous line)
mns <- apply(select(iris,-Species), 2, min, na.rm=TRUE)
iris.norm <- cbind(scale(select(iris,-Species), center=mns, scale=mxs-mns),
                   select(iris,Species))
summary(iris.norm)
library(Hmisc) # for cut2()   (fixed: was fused onto the previous line)
data(Boston, package="MASS") # loading the data
summary(Boston$age) # the numeric variable we are going to discretize
library(lubridate)   # (fixed: this call was fused into the comment on the previous line)
## A small xts time series and a time-delay embedding helper
library(xts)
sp500 <- xts(c(1102.94,1104.49,1115.71,1118.31),
             ymd(c("2010-02-25","2010-02-26","2010-03-01","2010-03-02"),
                 tz=Sys.getenv("TZ"))
             )
sp500
## Build a data set by time-delay embedding of a uni-variate time series.
##   s   - the series (plain vector or xts object)
##   emb - the embed size (number of columns of the result, >= 2)
## Returns a matrix (or xts object) with columns T, T_1, ..., T_(emb-1),
## i.e. each row holds the current value and the emb-1 previous values.
createEmbedDS <- function(s, emb=4) {   # (fixed: "sp500" print was fused onto this line)
    d <- dim(s)
    if (!is.null(d) && d[2] > 1) stop("Only applicable to uni-variate time series")
    if (emb < 2 || emb > length(s)) stop("Invalid embed size")
    e <- embed(s,emb)
    colnames(e) <- c("T",paste("T",1:(emb-1),sep="_"))
    # keep the time stamps of the rows that survive the embedding
    if (is.xts(s)) return(xts(e,index(s)[emb:length(s)])) else return(e)
}
## NOTE(review): "ap" must be defined before this call (presumably an AirPassengers
## xts series created in the book's text) — confirm against the chapter
dataSet <- createEmbedDS(ap,emb=5)
head(dataSet)
library(sp)   # (fixed: this call was fused onto the previous line)
## Building a SpatialPointsDataFrame with the 2000 forest fires and mapping it
library(dplyr)
## NOTE(review): "ff" (fires data with columns x, y and ano2000) must already exist
spatialCoords <- select(ff,long=x,lat=y) # the contextual data
firesData <- select(ff,ano2000) # the behavioral data
coordRefSys <- CRS("+proj=longlat +ellps=WGS84")
fires2000 <- SpatialPointsDataFrame(spatialCoords,
                                    firesData,
                                    proj4string=coordRefSys)
fires2000[1:3,]
library(ggmap)   # (fixed: this call was fused onto the previous line)
library(tibble)
mapPT <- get_map("Portugal",zoom=7)
d4plot <- as_tibble(cbind(coordinates(fires2000),burnt=fires2000$ano2000))
## overlay the burnt locations on the map
ggmap(mapPT) + 
    geom_point(data=filter(d4plot, burnt==1),aes(x=long,y=lat),col="orange")
data(iris)   # (fixed: this call was fused onto the previous line)
## Random sampling of 50% of the iris rows (without replacement)
prop2sample <- 0.5
rowIDs <- sample(1:nrow(iris), as.integer(prop2sample*nrow(iris)))
iris.sample <- iris[rowIDs,]
## The same, but sampling with replacement
data(iris)   # (fixed: this call was fused onto the previous line)
prop2sample <- 0.5
rowIDs <- sample(1:nrow(iris), as.integer(prop2sample*nrow(iris)), replace=TRUE)
iris.sample <- iris[rowIDs,]
## Count the number of lines of a text file using the Unix "wc" command
nrLinesFile <- function(f) {   # (fixed: the assignment above was fused onto this line)
    if (.Platform$OS.type == "unix") 
        as.integer(strsplit(trimws(system(paste("wc -l",f),intern=TRUE)),
                            " ")[[1]][1]) 
    else 
        stop("This function requires unix-based systems")
}
## Draw a random sample from a (potentially huge) CSV file without loading it all.
##   file    - path to the CSV file
##   percORn - a proportion (< 1) or an absolute number of rows to sample
##   nrLines - number of lines of the file (computed with wc if missing)
##   header  - does the file have a header line?
##   mxPerc  - maximum sampling proportion this function accepts
## Returns a tibble with (approximately) the requested number of rows.
sampleCSV <- function(file, percORn, nrLines, header=TRUE, mxPerc=0.5) {
    if (.Platform$OS.type != "unix") 
        stop("This function requires unix-based systems")
    require(readr, quietly=TRUE)
    
    if (missing(nrLines)) nrLines <- nrLinesFile(file)
    
    if (percORn < 1)
        if (percORn > mxPerc) 
            stop("This function is not adequate for that big samples.")
        else percORn <- as.integer(percORn*nrLines)
    ## over-sample (2x the target) with perl, then truncate to the exact size
    perc <- min(2*percORn/nrLines, mxPerc)
    
    system(paste0("perl -ne 'print if (rand() < ",perc,")' ",file,
                  " > ",file,".tmp.csv"))
    dt <- read_csv(paste0(file,".tmp.csv"),col_names=header, n_max=percORn)
    file.remove(paste0(file,".tmp.csv"))
    if (nrow(dt) != percORn) 
        warning(paste("Expecting",percORn,"rows, but got",nrow(dt)))
    dt
}
library(DBI)   # (fixed: this call was fused onto the previous line)
library(RMySQL)
drv <- dbDriver("MySQL")  # Loading the MySQL driver
con <- dbConnect(drv,dbname="transDB",  
                 username="myuser",password="mypassword",
                 host="localhost")
## Draw a random sample of rows from a DBMS table.
##   dbConn  - an open DBI connection
##   tbl     - name of the table to sample from
##   percORn - a proportion (< 1) or an absolute number of rows to sample
##   mxPerc  - maximum sampling proportion this function accepts
## Returns a data frame with (approximately) the requested number of rows.
sampleDBMS <- function(dbConn, tbl, percORn, mxPerc=0.5) {   # (fixed: connection code was fused onto this line)
    nrRecords <- unlist(dbGetQuery(dbConn, paste("select count(*) from",tbl)))
    
    if (percORn < 1)
        if (percORn > mxPerc) 
            stop("This function is not adequate for that big samples.")
        else percORn <- as.integer(percORn*nrRecords)
    ## over-sample (2x) server-side with rand() and truncate with LIMIT
    perc <- min(2*percORn/nrRecords, mxPerc)
    
    dt <- dbGetQuery(dbConn,paste("select * from (select * from",tbl,
                                  "where rand() <= ",perc,") as t limit ",percORn))
    if (nrow(dt) != percORn) 
        warning(paste("Expecting",percORn,"rows, but got",nrow(dt)))
    dt
}
library(CORElearn)   # (fixed: this call was fused onto the previous line)
## Feature scoring with CORElearn (classification estimators on iris)
data(iris)
attrEval(Species ~ ., iris, estimator="GainRatio")
attrEval(Species ~ ., iris, estimator="InfGain")
attrEval(Species ~ ., iris, estimator="Gini")
attrEval(Species ~ ., iris, estimator="MDL")
## Regression estimators on the algae data
data(algae, package ="DMwR2")   # (fixed: was fused onto the previous line)
attrEval(a1 ~ ., algae[,1:12], estimator="MSEofMean")
attrEval(a1 ~ ., algae[,1:12], estimator="RReliefFexpRank")
infoCore(what="attrEvalReg")   # list all available regression estimators
## Basic descriptive statistics (note how NAs propagate unless na.rm=TRUE)
data(algae,package="DMwR2")   # (fixed: was fused onto the previous line)
mean(algae$a1)
mean(algae$NO3)
mean(algae$NO3, na.rm=TRUE)
median(algae$a3)
median(algae$mxPH, na.rm=TRUE)
library(dplyr)   # (fixed: this call was fused onto the previous line)
## Summaries with dplyr
## NOTE(review): tbl_df() is deprecated in current dplyr — as_tibble() is the modern call
alg <- tbl_df(algae)
summarise(alg, avgNO3=mean(NO3,na.rm=TRUE), medA1=median(a1))
select(alg, mxPH:Cl) %>%  
    summarise_each(funs(mean(.,na.rm=TRUE),median(.,na.rm=TRUE)))
## Grouped summaries ordered by the median of a7
group_by(alg, season, size) %>%    # (fixed: was fused onto the previous line)
    summarize(nObs=n(), mA7=median(a7)) %>% 
        ungroup() %>% arrange(desc(mA7))
## Statistical mode of a vector (the most frequent value);
## on ties the first value encountered wins
Mode <- function(x, na.rm = FALSE) {   # (fixed: the pipeline above was fused onto this line)
  if(na.rm) x <- x[!is.na(x)]
  ux <- unique(x)
  return(ux[which.max(tabulate(match(x, ux)))])
}
Mode(algae$mxPH, na.rm=TRUE)
Mode(algae$season)
## Measures of spread
var(algae$a1)   # (fixed: was fused onto the previous line)
sd(algae$Cl, na.rm=TRUE)
IQR(algae$mxPH, na.rm=TRUE)
quantile(algae$a3)
quantile(algae$a3, probs=c(0.2,0.8))
range(algae$a1)
max(algae$a5)-min(algae$a5)
data(algae, package="DMwR2")   # (fixed: this call was fused onto the previous line)
## Counting the NA values in the algae data set
nasRow <- apply(algae,1,function(r) sum(is.na(r)))
cat("The Algae dataset contains ",sum(nasRow)," NA values.\n")
cat("There are ",sum(!complete.cases(algae)),
    " rows that have at least one NA value.\n")
## Boxplot rule for outliers: values outside [Q1 - const*IQR, Q3 + const*IQR].
## With positions=TRUE the indices (within the NA-free vector) are returned
## instead of the outlier values themselves.
bpRule <- function(x, const=1.5, positions=FALSE) {   # (fixed: the cat() above was fused onto this line)
    x <- x[!is.na(x)]
    qs <- quantile(x,probs = c(0.25,0.75))
    iqr <- qs[2]-qs[1]
    if (!positions) x[x < qs[1]-const*iqr | x > qs[2]+const*iqr]
    else which(x < qs[1]-const*iqr | x > qs[2]+const*iqr)
}
bpRule(algae$a1)
bpRule(algae$NO3)
bpRule(algae$NO3, positions=TRUE)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
## Scatterplot of the iris petal measurements colored by species
data(iris)
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=Species)) + geom_point(size=4)
## Bar plots of the season frequencies (base graphics vs ggplot2)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
data(algae, package="DMwR2")
## Plot on the left (standard)
freqOcc <- table(algae$season)
barplot(freqOcc,main='Frequency of the Seasons')
## Plot on the right (ggplot2)
ggplot(algae,aes(x=season)) + geom_bar() + ggtitle("Frequency of the Seasons")
## Placing the base and ggplot2 bar plots side by side with grid viewports
library(ggplot2)   # (fixed: this call was fused onto the previous line)
library(grid)
data(algae, package="DMwR2")
freqOcc <- table(algae$season)
gh <- ggplot(algae,aes(x=season)) + geom_bar() + 
    ggtitle("Frequency of the Seasons")
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
               just=c("left","top"),y=1,x=0.5)
barplot(freqOcc,main='Frequency of the Seasons')
print(gh,vp=vp)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
## Histograms of Petal.Length (base graphics vs ggplot2)
data(iris)
## Plot on the left (standard)
hist(iris$Petal.Length,xlab='Petal Length')
## Plot on the right (ggplot2)
ggplot(iris,aes(x=Petal.Length)) + geom_histogram() + xlab("Petal Length")
## Side-by-side versions of the same histograms
library(ggplot2)   # (fixed: this call was fused onto the previous line)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=Petal.Length)) + geom_histogram() +xlab("Petal Length")
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
               just=c("left","top"),y=1,x=0.5)
hist(iris$Petal.Length,xlab='Petal Length')
print(g,vp=vp)
## Boxplots of Sepal.Width (base graphics vs ggplot2)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
data(iris)
## Plot on the left (standard)
boxplot(iris$Sepal.Width, ylab='Sepal Width')
## Plot on the right (ggplot2)
ggplot(iris, aes(x=factor(0), y=Sepal.Width)) + geom_boxplot() +
    xlab("") + ylab("Sepal Width") + theme(axis.text.x=element_blank())
## Side-by-side versions of the same boxplots
library(ggplot2)   # (fixed: this call was fused onto the previous line)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=factor(0),y=Sepal.Width)) + geom_boxplot() +
    xlab("") + ylab("Sepal Width") + theme(axis.text.x=element_blank())
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
               just=c("left","top"),y=1,x=0.5)
boxplot(iris$Sepal.Width,ylab='Sepal Width')
print(g,vp=vp)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
## Conditioned boxplots of Sepal.Length by Species (base graphics vs ggplot2)
data(iris)
## Plot on the left (standard)
boxplot(Sepal.Length ~ Species, iris, ylab="Sepal.Length")
## Plot on the right (ggplot2)
ggplot(iris,aes(x=Species,y=Sepal.Length)) + geom_boxplot()
## Side-by-side versions of the same conditioned boxplots
library(ggplot2)   # (fixed: this call was fused onto the previous line)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=Species,y=Sepal.Length)) + geom_boxplot()
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
               just=c("left","top"),y=1,x=0.5)
boxplot(Sepal.Length ~ Species, iris, ylab="Sepal.Length")
print(g,vp=vp)
## Faceted histograms of a1 by river size and speed
library(ggplot2)   # (fixed: this call was fused onto the previous line)
data(algae, package="DMwR2")
ggplot(algae,aes(x=a1)) + geom_histogram() + facet_grid(size ~ speed)
## (the book repeats the same faceted histogram example)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
data(algae, package="DMwR2")
ggplot(algae,aes(x=a1)) + geom_histogram() + facet_grid(size ~ speed)
## Scatterplots of sepal length vs width (base graphics vs ggplot2)
library(ggplot2)   # (fixed: this call was fused onto the previous line)
data(iris)
## Plot on the left (standard)
plot(iris$Sepal.Length,iris$Sepal.Width,
     main="Relationship between Sepal Length and Width", 
     xlab="Sepal Length", ylab="Sepal Width")
## Plot on the right (ggplot2)
ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width)) + geom_point() +
    xlab("Sepal Length") + ylab("Sepal Width") + 
    ggtitle("Relationship between Sepal Length and Width")
library(ggplot2)   # (fixed: this call was fused onto the previous line)
library(grid)
data(iris)
## Side-by-side versions of the sepal length/width scatterplots
g <- ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width)) + geom_point() +
    xlab("Sepal Length") + ylab("Sepal Width") + 
    ggtitle("Relationship between Sepal Length and Width")
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
               just=c("left","top"),y=1,x=0.5)
plot(iris$Sepal.Length,iris$Sepal.Width,
     main="Relationship between Sepal Length and Width", 
     xlab="Sepal Length", ylab="Sepal Width")
print(g,vp=vp)
## Scatterplots mapping a categorical variable to color / plotting symbol
data(algae, package="DMwR2")   # (fixed: was fused onto the previous line)
## Plot on the left (standard)
plot(algae$a1, algae$a2, col=algae$season,
     main="Relationship between A1 and A2 by Season", 
     xlab="A1", ylab="A2")
legend("topright",legend=levels(algae$season),
       col=palette()[1:nlevels(algae$season)],pch=1)
## Plot on the right (ggplot2)
plot(algae$a4, algae$a7, pch=as.integer(algae$speed), 
     main="Relationship between A4 and A7 by River Speed", 
     xlab="A4", ylab="A7")
legend("topright",legend=levels(algae$speed),
      pch=1:nlevels(algae$speed))   # (fixed: was nlevels(algae$season) — legend describes speed)
## The same two plots side by side
par(mfrow=c(1,2))   # (fixed: was fused onto the previous line)
data(algae, package="DMwR2")
## Plot on the left (standard)
plot(algae$a1, algae$a2, col=algae$season,
     main="Relationship between A1 and A2 by Season", 
     xlab="A1", ylab="A2")
legend("topright",legend=levels(algae$season),
       col=palette()[1:nlevels(algae$season)],pch=1)
## Plot on the right (ggplot2)
plot(algae$a4, algae$a7, pch=as.integer(algae$speed), 
     main="Relationship between A4 and A7 by River Speed", 
     xlab="A4", ylab="A7")
legend("topright",legend=levels(algae$speed),
      pch=1:nlevels(algae$speed))   # (fixed: was nlevels(algae$season))
library(arules)   # (fixed: this call was fused onto the previous line)
library(dplyr)
## Discretizing the Boston data and inspecting association rules with arules
data(Boston,package="MASS")
b <- Boston
b$chas <- factor(b$chas,labels=c("river","noriver"))
b$rad <- factor(b$rad)
b$black <- cut(b$black,breaks=4,labels=c(">31.5%","18.5-31.5%","8-18.5%","<8%"))
discr <- function(x) cut(x,breaks=4, labels=c("low","medLow","medHigh","high"))
b <- select(b,-one_of(c("chas","rad","black"))) %>% 
     mutate_each(funs(discr)) %>% 
     bind_cols(select(b,one_of(c("chas","rad","black"))))
b <- as(b,"transactions")
b
## NOTE(review): "ars" (the rule set produced by apriori()) is assumed to exist here
somerules <- subset(ars,   # (fixed: the "b" print above was fused onto this line)
               subset=rhs %in% c("medv=high","medv=medHigh") & confidence>0.75)
plot(somerules, method="matrix", measure="lift")
set.seed(1234)   # (fixed: this call was fused onto the previous line)
## Distance computations between random vectors
randDat <- matrix(rnorm(50), nrow=5)
dist(randDat) # Euclidean distance (default)
dist(randDat, method="manhattan")
dist(randDat, method="minkowski", p=4)
## k-means clustering of the iris data
set.seed(1234) # setting a seed for the random number generator   (fixed: was fused)
data(iris)
ir3 <- kmeans(iris[,-5], centers=3, iter.max=200) # not using Species info.
ir3
## Choosing the number of clusters through the average silhouette width
set.seed(1234)   # (fixed: was fused onto the previous line)
d <- dist(iris[,-5])
avgS <- c()
for(k in 2:6) {
   cl <- kmeans(iris[,-5],centers=k,iter.max=200)
   # NOTE(review): silhouette() comes from the cluster package, loaded only below
   s <- silhouette(cl$cluster,d)
   avgS <- c(avgS,mean(s[,3]))
}
data.frame(nClus=2:6,Silh=avgS)
library(cluster)   # (fixed: this call was fused onto the previous line)
set.seed(1234)
## Partitioning around medoids with k=3
pc <- pam(iris[,-5],k=3)
(cm <- table(pc$clustering, iris$Species))
100*(1-sum(diag(cm))/sum(cm))   # disagreement with the true species (%)
pc$silinfo$avg.width
set.seed(1234)   # (fixed: this call was fused onto the previous line)
## Agglomerative hierarchical clustering with three linkage methods,
## scoring each cut (k = 2..6) by the average silhouette width
d <- dist(scale(iris[,-5]))
methds <- c('complete','single','average')
avgS <- matrix(NA,ncol=3,nrow=5,
               dimnames=list(2:6,methds))
for(k in 2:6) 
  for(m in seq_along(methds)) {
    h <- hclust(d,meth=methds[m])
    c <- cutree(h,k)
    s <- silhouette(c,d)
    avgS[k-1,m] <- mean(s[,3])
  }
avgS
## Univariate outlier detection with the Grubbs test.
## Returns the z-scores, the p-values, the outlier values (p <= p.thresh)
## and their positions within the NA-free vector.
grubbs.outliers <- function(x, p.thresh=0.05) {   # (fixed: "avgS" print was fused onto this line)
    require(outliers, quietly=TRUE)
    x <- x[!is.na(x)]
    n <- length(x)
    zs <- abs(x - mean(x)) / sd(x)
    outs <- 1 - sapply(zs, function(z) pgrubbs(z, n, type=10))
    posOuts <- which(outs <= p.thresh)
    return(list(zs=zs, 
                pvals=outs, 
                outliers=x[posOuts],
                positions=posOuts))
}
data(algae, package="DMwR2")
grubbs.outliers(algae$a2)$outliers
## Multivariate outlier detection with DBSCAN: points left in cluster 0
## (i.e. not assigned to any cluster) are reported as outliers.
## "..." is forwarded to fpc::dbscan (e.g. eps, MinPts, scale).
dbscan.outliers <- function(data, ...) {   # (fixed: the call above was fused onto this line)
    require(fpc, quietly=TRUE)
    cl <- dbscan(data, ...)
    posOuts <- which(cl$cluster == 0)
    list(positions = posOuts,
         outliers = data[posOuts,], 
         dbscanResults = cl)
}
library(dplyr)   # (fixed: this call was fused onto the previous line)
library(forcats)
## Collapsing the Glass types into "rare" and "normal" classes
data(Glass, package="mlbench")
count(Glass,Type)  # a dplyr  alternative to "table(Glass$Type)"
g <- mutate(Glass,
            Type=fct_collapse(Type,
                              rare   = as.character(c(3,5,6)),
                              normal = as.character(c(1,2,7))
                              )
            )
g %>% count(Type) %>% mutate(prop=100*n/nrow(g))
## Applying the DBSCAN-based outlier detector to the Glass data
outs <- dbscan.outliers(g[,-10], eps=1, scale=TRUE)   # (fixed: was fused onto the previous line)
head(outs$outliers)
nrow(outs$outliers)
slice(g, outs$positions) %>%  count(Type)
count(g, Type)
## Outlier ranking
library(DMwR2)   # (fixed: this call was fused onto the previous line)
library(dplyr)
og <- outliers.ranking(select(g, -Type))
slice(g, og$rank.outliers[1:40]) %>%  count(Type)
## Local outlier factors (k = 10 neighbors)
library(DMwR2)   # (fixed: this call was fused onto the previous line)
library(dplyr)
lof.scores <- lofactor(select(g, -Type),10)
slice(g, order(lof.scores,decreasing=TRUE)[1:40]) %>%  count(Type)
## Resampling to fight class imbalance with UBL
library(UBL)   # (fixed: this call was fused onto the previous line)
library(dplyr)
count(g,Type)
## Undersampling the largest class
newg <- RandUnderClassif(Type ~ ., g)
count(newg,Type)
## Now specifying the degree of undersampling by hand
newg2 <- RandUnderClassif(Type ~ ., g, list(normal=0.4, rare=1))
count(newg2,Type)
## Oversampling the minority class
newg3 <- RandOverClassif(Type ~ .,g)
count(newg3,Type)
library(DMwR2)   # (fixed: this call was fused onto the previous line)
## Classification trees with rpartXse (SE-rule based post-pruning)
set.seed(1234)
data(iris)
ct1 <- rpartXse(Species ~ ., iris)
ct2 <- rpartXse(Species ~ ., iris, se=0)   # se=0 means less pruning
library(rpart.plot)   # (fixed: this call was fused onto the previous line)
prp(ct1, type=0, extra=101)  # left tree
prp(ct2, type=0, extra=101)  # right tree
library(rpart.plot)   # (fixed: this call was fused into the comment on the previous line)
par( mfrow=c(1,2) )
prp(ct1,type=0,extra=101) 
prp(ct2,type=0,extra=101)
par(mfrow = c(1,1))
## Train/test evaluation of a tree
set.seed(1234)   # (fixed: was fused onto the previous line)
rndSample <- sample(1:nrow(iris),100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
ct <- rpartXse(Species ~ ., tr, se=0.5)
ps1 <- predict(ct, ts)   # class probability matrix
head(ps1)
ps2 <- predict(ct, ts, type="class")   # predicted class labels
head(ps2)
(cm <- table(ps2, ts$Species))
100*(1-sum(diag(cm))/sum(cm))  # the error rate
library(e1071)   # (fixed: this call was fused into the comment on the previous line)
## SVM classification on iris
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
s <- svm(Species ~ ., tr)
ps <- predict(s, ts)
(cm <- table(ps, ts$Species))
100*(1-sum(diag(cm))/sum(cm))  # the error rate
## A second SVM with a degree-3 polynomial kernel
s2 <- svm(Species ~ ., tr, cost=10, kernel="polynomial", degree=3)   # (fixed: was fused into the comment above)
ps2 <- predict(s2, ts)
(cm2 <- table(ps2, ts$Species))
100*(1-sum(diag(cm2))/sum(cm2))  # the error rate
## SVM regression on the Boston housing data (mean absolute error)
data(Boston,package='MASS')   # (fixed: was fused into the comment on the previous line)
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
s1 <- svm(medv ~ ., tr)
ps1 <- predict(s1, ts)
mean(abs(ps1-ts$medv))
s2 <- svm(medv ~ ., tr, kernel="radial", cost=10, epsilon=0.02, gamma=0.01)
ps2 <- predict(s2, ts)
mean(abs(ps2-ts$medv))
library(nnet)   # (fixed: this call was fused onto the previous line)
## Feed-forward neural network classification on iris (6 hidden units)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
n <- nnet(Species ~ ., tr, size=6 ,trace=FALSE, maxit=1000)
ps <- predict(n, ts, type="class")
(cm <- table(ps, ts$Species))
100*(1-sum(diag(cm))/sum(cm))  # the error rate
## Neural network regression on the Boston housing data (linout=TRUE for regression)
data(Boston,package='MASS')   # (fixed: was fused into the comment on the previous line)
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
nr <- nnet(medv ~ ., tr, linout=TRUE, trace=FALSE, size=6, decay=0.01, maxit=2000)
psnr <- predict(nr, ts)
mean(abs(psnr-ts$medv))
## Visualizing the fitted network
library(ggplot2)   # (fixed: this call was fused onto the previous line)
library(NeuralNetTools)
## Feature importance (left graph)
garson(nr) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) 
## Network diagram (right graph)
plotnet(nr)
data(iris)   # (fixed: this call was fused onto the previous line)
## Deep learning with H2O: iris classification
## NOTE(review): as.h2o/h2o.deeplearning need a running H2O instance,
## but h2o.init() only appears further below — confirm ordering in the book
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
trH  <- as.h2o(iris[rndSample, ],"trH")
tsH <- as.h2o(iris[-rndSample, ],"tsH")
mdl <- h2o.deeplearning(x=1:4, y=5, training_frame=trH)
preds <- h2o.predict(mdl,tsH)[,"predict"]
library(h2o)   # (fixed: this call was fused onto the previous line)
h2oInstance <- h2o.init(ip="localhost") # start H2O instance locally
## Deep learning regression on the Boston housing data
data(Boston,package="MASS")
rndSample <- sample(1:nrow(Boston), 354)
trH <- as.h2o(Boston[rndSample, ],"trH")
tsH <- as.h2o(Boston[-rndSample, ],"tsH")
mdl <- h2o.deeplearning(x=1:13, y=14, training_frame=trH, 
                        hidden=c(100,100,100, 100), epochs=500)
preds <- as.vector(h2o.predict(mdl,tsH))
library(adabag)   # (fixed: this call was fused onto the previous line)
## Bagging classification with adabag (500 trees)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
m <- bagging(Species ~ ., tr, mfinal=500)
ps <- predict(m,ts)
names(ps)
ps$confusion
ps$error*100 # percentage of errors
## Bagging regression with ipred
library(ipred)   # (fixed: this call was fused into the comment on the previous line)
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- bagging(medv ~ ., tr, nbagg=500)
ps <- predict(m, ts)
mean(abs(ps-ts$medv))
## Random forest on the BreastCancer data (NAs imputed with knnImputation)
library(randomForest)   # (fixed: this call was fused onto the previous line)
library(DMwR2)
data(BreastCancer, package="mlbench")
bc <- cbind(knnImputation(BreastCancer[,-c(1,11)]), # column 1 is an ID
            Class=BreastCancer$Class)
set.seed(1234)
rndSample <- sample(1:nrow(bc), 500)
tr <- bc[rndSample, ] 
ts <- bc[-rndSample, ]
m <- randomForest(Class ~ ., tr, ntree=750)
ps <- predict(m, ts)
(cm <- table(ps, ts$Class))
100*(1-sum(diag(cm))/sum(cm))  # the error rate
library(adabag)   # (fixed: this call was fused into the comment on the previous line)
## Boosting classification with adabag (AdaBoost.M1 and SAMME)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
m1 <- boosting(Species ~ ., tr,  mfinal=500) # AdaBoost.M1
ps1 <- predict(m1,ts)
ps1$confusion
ps1$error*100 
m2 <- boosting(Species ~ ., tr, coeflearn="Zhu", mfinal=500) # SAMME
ps2 <- predict(m2,ts)
ps2$confusion
ps2$error*100 
## Gradient boosting regression with gbm
library(gbm)   # (fixed: this call was fused onto the previous line)
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- gbm(medv ~ ., data=tr, n.trees=5000)
ps <- predict(m, ts, n.trees=5000)
mean(abs(ps-ts$medv))
## Gradient boosting classification with CV-based selection of n.trees
data(iris)   # (fixed: this call was fused onto the previous line)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
m <- gbm(Species ~ ., data=tr, n.trees=10000, 
         cv.folds=5, n.cores=4)
(best <- gbm.perf(m, plot.it=FALSE, method="cv"))   # best iteration by CV
ps <- predict(m, ts, n.trees=best, type="response")[,,1]   # class probabilities
ps <- as.factor(colnames(ps)[max.col(ps)])                  # most probable class
(cm <- table(ps, ts$Species))
100*(1-sum(diag(cm))/sum(cm))  # the error rate
library(performanceEstimation)   # (fixed: this call was fused into the comment on the previous line)
## Estimating predictive performance with performanceEstimation
library(DMwR2)
data(Boston, package="MASS")
## 3x10-fold CV of five rpartXse variants on the Boston regression task
r <- performanceEstimation(
         PredTask(medv ~ ., Boston),
         workflowVariants(learner="rpartXse",
                          learner.pars=list(se=c(0,0.25,0.5,1,2))),
         EstimationTask(metrics=c("mse","mae"),
                        method=CV(nReps=3,nFolds=10))
     )
rankWorkflows(r, top=3)
## .632 bootstrap estimation of six SVM variants on two classification tasks
library(performanceEstimation)   # (fixed: this call was fused onto the previous line)
library(DMwR2)
library(e1071)
data(iris)
data(BreastCancer, package="mlbench")
bc <- cbind(knnImputation(BreastCancer[,-c(1,11)]), 
            Class=BreastCancer$Class)
r <- performanceEstimation(
         c(PredTask(Species ~ ., iris), 
           PredTask(Class ~ ., bc)),
         workflowVariants(learner="svm",
                          learner.pars=list(cost=c(1,5,10),
                                            gamma=c(0.01,0.001))),
         EstimationTask(metrics="acc",
                        method=Bootstrap(nReps=200,type=".632"))
    )