The following is a script file containing all the R code of all sections in this chapter.
library(DBI)
## The DBMS-specific code starts here
library(RMySQL)
drv <- dbDriver("MySQL") # Loading the MySQL driver
con <- dbConnect(drv,dbname="transDB", # connecting to the DBMS
username="myuser",password="mypasswd",
host="localhost")
## The DBMS-specific code ends here
## getting the results of an SQL query as a data frame
data <- dbGetQuery(con,"SELECT * FROM clients")
## closing up stuff
dbDisconnect(con)
dbUnloadDriver(drv)
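## The same query, this time processing the result set in chunks of records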
library(DBI)
library(RMySQL)
drv <- dbDriver("MySQL") # Loading the MySQL driver
con <- dbConnect(drv,dbname="transDB", # connecting to the DBMS
username="myuser",password="mypasswd",
host="localhost")
res <- dbSendQuery(con,"SELECT * FROM transactions")
while (!dbHasCompleted(res)) {
# get the next 50 records as a data frame
someData <- fetch(res, n = 50) # fetch() is named dbFetch() in current DBI
# call some function that handles the current chunk
myProcessingFunction(someData)
}
dbClearResult(res) # clear the results set
dbDisconnect(con) # closing up stuff
dbUnloadDriver(drv)
library(readr)
library(tidyr) # needed for gather() and separate() used below
std <- read_delim("stud.txt", delim=" ",
skip=1, col_names=c("StudentName","Math","English"))
std
std2 <- read_delim("stud2.txt", delim=" ",
skip=1, col_names=c("StudentName","Math","English","Degree_Year"))
std2
std2L <- gather(std2, Subject, Grade, Math:English)
std2L <- separate(std2L, Degree_Year, c("Degree","Year"))
std2L
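## Handling dates and times with package lubridate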
library(lubridate)
ymd("20151021")
ymd("2015/11/30")
myd("11.2012.3")
dmy_hms("2/12/2013 14:05:01")
mdy("120112")
dates <- c(20120521, "2010-12-12", "2007/01/5", "2015-2-04",
"Measured on 2014-12-6", "2013-7+ 25")
dates <- ymd(dates)
dates
data.frame(Dates=dates,WeekDay=wday(dates),nWeekDay=wday(dates,label=TRUE),
Year=year(dates),Month=month(dates,label=TRUE))
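## Reading a data set and its companion names file from the UCI repository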
library(dplyr)
library(stringr)
library(readr)
uci.repo <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset <- "audiology/audiology.standardized"
dataF <- str_c(uci.repo,dataset,".data")
namesF <- str_c(uci.repo,dataset,".names")
## Reading the data file
data <- read_csv(url(dataF), col_names=FALSE, na="?")
data
dim(data)
## Now reading the names file
text <- read_lines(url(namesF))
text[1:3]
length(text)
text[67:70]
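## Standardization and min-max normalization of the iris predictors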
library(dplyr)
data(iris)
iris.stand <- cbind(scale(select(iris,-Species)),select(iris,Species))
summary(iris.stand)
mxs <- apply(select(iris,-Species), 2, max, na.rm=TRUE)
mns <- apply(select(iris,-Species), 2, min, na.rm=TRUE)
iris.norm <- cbind(scale(select(iris,-Species), center=mns, scale=mxs-mns),
select(iris,Species))
summary(iris.norm)
library(Hmisc) # for cut2()
data(Boston, package="MASS") # loading the data
summary(Boston$age) # the numeric variable we are going to discretize
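## The discretization step itself appears to be missing here; a minimal
## sketch using cut2() (the choice of g=4 equal-frequency bins is an
## assumption for illustration):
Boston$ageDisc <- cut2(Boston$age, g=4)
table(Boston$ageDisc)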
library(lubridate)
library(xts)
sp500 <- xts(c(1102.94,1104.49,1115.71,1118.31),
ymd(c("2010-02-25","2010-02-26","2010-03-01","2010-03-02"),
tz=Sys.getenv("TZ"))
)
sp500
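## A function to create a time-delay embedded data set from a univariate time series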
createEmbedDS <- function(s, emb=4) {
    d <- dim(s)
    if (!is.null(d) && d[2] > 1) stop("Only applicable to uni-variate time series")
    if (emb < 2 || emb > length(s)) stop("Invalid embed size")
    e <- embed(s,emb)
    colnames(e) <- c("T",paste("T",1:(emb-1),sep="_"))
    if (is.xts(s)) return(xts(e,index(s)[emb:length(s)])) else return(e)
}
## NOTE: "ap" is assumed to be a univariate xts time series created earlier
## in the chapter; a possible definition is:
ap <- as.xts(AirPassengers)
dataSet <- createEmbedDS(ap,emb=5)
head(dataSet)
library(sp)
library(dplyr)
## NOTE: "ff" is assumed to hold the forest fires data set used in this
## chapter, with coordinate columns x/y and the yearly burnt-area indicators
spatialCoords <- select(ff,long=x,lat=y) # the contextual data
firesData <- select(ff,ano2000) # the behavioral data
coordRefSys <- CRS("+proj=longlat +ellps=WGS84")
fires2000 <- SpatialPointsDataFrame(spatialCoords,
firesData,
proj4string=coordRefSys)
fires2000[1:3,]
library(ggmap)
library(tibble)
mapPT <- get_map("Portugal",zoom=7) # recent ggmap versions require a map-service API key
d4plot <- as_tibble(cbind(coordinates(fires2000),burnt=fires2000$ano2000))
ggmap(mapPT) +
geom_point(data=filter(d4plot, burnt==1),aes(x=long,y=lat),col="orange")
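## Random sampling of rows (without replacement)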
data(iris)
prop2sample <- 0.5
rowIDs <- sample(1:nrow(iris), as.integer(prop2sample*nrow(iris)))
iris.sample <- iris[rowIDs,]
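## The same, but now sampling with replacement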
data(iris)
prop2sample <- 0.5
rowIDs <- sample(1:nrow(iris), as.integer(prop2sample*nrow(iris)), replace=TRUE)
iris.sample <- iris[rowIDs,]
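## Counting the number of lines of a file using the unix command wc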
nrLinesFile <- function(f) {
    if (.Platform$OS.type == "unix")
        as.integer(strsplit(trimws(system(paste("wc -l",f),intern=TRUE)),
                            " ")[[1]][1])
    else
        stop("This function requires unix-based systems")
}
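## Random sampling from a large CSV file without loading it fully into R;
## perl pre-filters the lines before read_csv() reads at most percORn of them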
sampleCSV <- function(file, percORn, nrLines, header=TRUE, mxPerc=0.5) {
    if (.Platform$OS.type != "unix")
        stop("This function requires unix-based systems")
    require(readr, quietly=TRUE)
    if (missing(nrLines)) nrLines <- nrLinesFile(file)
    if (percORn < 1) {
        if (percORn > mxPerc)
            stop("This function is not adequate for such large samples.")
        else percORn <- as.integer(percORn*nrLines)
    }
    perc <- min(2*percORn/nrLines, mxPerc)
    system(paste0("perl -ne 'print if (rand() < ",perc,")' ",file,
                  " > ",file,".tmp.csv"))
    dt <- read_csv(paste0(file,".tmp.csv"),col_names=header, n_max=percORn)
    file.remove(paste0(file,".tmp.csv"))
    if (nrow(dt) != percORn)
        warning(paste("Expecting",percORn,"rows, but got",nrow(dt)))
    dt
}
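## Example usage (the file name is hypothetical):
## dt <- sampleCSV("hugeData.csv", 0.01) # a 1% random sample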
library(DBI)
library(RMySQL)
drv <- dbDriver("MySQL") # Loading the MySQL driver
con <- dbConnect(drv,dbname="transDB",
username="myuser",password="mypassword",
host="localhost")
sampleDBMS <- function(dbConn, tbl, percORn, mxPerc=0.5) {
    nrRecords <- unlist(dbGetQuery(dbConn, paste("select count(*) from",tbl)))
    if (percORn < 1) {
        if (percORn > mxPerc)
            stop("This function is not adequate for such large samples.")
        else percORn <- as.integer(percORn*nrRecords)
    }
    perc <- min(2*percORn/nrRecords, mxPerc)
    dt <- dbGetQuery(dbConn,paste("select * from (select * from",tbl,
                                  "where rand() <= ",perc,") as t limit ",percORn))
    if (nrow(dt) != percORn)
        warning(paste("Expecting",percORn,"rows, but got",nrow(dt)))
    dt
}
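## Example usage (re-using the connection created above):
## dt <- sampleDBMS(con, "transactions", 0.01) # a 1% random sample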
library(CORElearn)
data(iris)
attrEval(Species ~ ., iris, estimator="GainRatio")
attrEval(Species ~ ., iris, estimator="InfGain")
attrEval(Species ~ ., iris, estimator="Gini")
attrEval(Species ~ ., iris, estimator="MDL")
data(algae, package ="DMwR2")
attrEval(a1 ~ ., algae[,1:12], estimator="MSEofMean")
attrEval(a1 ~ ., algae[,1:12], estimator="RReliefFexpRank")
infoCore(what="attrEvalReg")
data(algae,package="DMwR2")
mean(algae$a1)
mean(algae$NO3)
mean(algae$NO3, na.rm=TRUE)
median(algae$a3)
median(algae$mxPH, na.rm=TRUE)
library(dplyr)
alg <- as_tibble(algae) # as_tibble() supersedes the older tbl_df()
summarise(alg, avgNO3=mean(NO3,na.rm=TRUE), medA1=median(a1))
select(alg, mxPH:Cl) %>%
summarise_each(funs(mean(.,na.rm=TRUE),median(.,na.rm=TRUE)))
## NOTE: summarise_each()/funs() are superseded in current dplyr by across(),
## e.g. summarise(across(everything(),
##                       list(mean=~mean(.x,na.rm=TRUE), median=~median(.x,na.rm=TRUE))))
group_by(alg, season, size) %>%
summarize(nObs=n(), mA7=median(a7)) %>%
ungroup() %>% arrange(desc(mA7))
Mode <- function(x, na.rm = FALSE) {
    if (na.rm) x <- x[!is.na(x)]
    ux <- unique(x)
    return(ux[which.max(tabulate(match(x, ux)))])
}
Mode(algae$mxPH, na.rm=TRUE)
Mode(algae$season)
var(algae$a1)
sd(algae$Cl, na.rm=TRUE)
IQR(algae$mxPH, na.rm=TRUE)
quantile(algae$a3)
quantile(algae$a3, probs=c(0.2,0.8))
range(algae$a1)
max(algae$a5)-min(algae$a5)
data(algae, package="DMwR2")
nasRow <- apply(algae,1,function(r) sum(is.na(r)))
cat("The Algae dataset contains ",sum(nasRow)," NA values.\n")
cat("There are ",sum(!complete.cases(algae)),
" rows that have at least one NA value.\n")
bpRule <- function(x, const=1.5, positions=FALSE) {
    x <- x[!is.na(x)] # note: positions refer to this NA-filtered vector
    qs <- quantile(x,probs = c(0.25,0.75))
    iqr <- qs[2]-qs[1]
    if (!positions) x[x < qs[1]-const*iqr | x > qs[2]+const*iqr]
    else which(x < qs[1]-const*iqr | x > qs[2]+const*iqr)
}
bpRule(algae$a1)
bpRule(algae$NO3)
bpRule(algae$NO3, positions=TRUE)
library(ggplot2)
data(iris)
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=Species)) + geom_point(size=4)
library(ggplot2)
data(algae, package="DMwR2")
## Plot on the left (standard)
freqOcc <- table(algae$season)
barplot(freqOcc,main='Frequency of the Seasons')
## Plot on the right (ggplot2)
ggplot(algae,aes(x=season)) + geom_bar() + ggtitle("Frequency of the Seasons")
library(ggplot2)
library(grid)
data(algae, package="DMwR2")
freqOcc <- table(algae$season)
gh <- ggplot(algae,aes(x=season)) + geom_bar() +
ggtitle("Frequency of the Seasons")
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
just=c("left","top"),y=1,x=0.5)
barplot(freqOcc,main='Frequency of the Seasons')
print(gh,vp=vp)
library(ggplot2)
data(iris)
## Plot on the left (standard)
hist(iris$Petal.Length,xlab='Petal Length')
## Plot on the right (ggplot2)
ggplot(iris,aes(x=Petal.Length)) + geom_histogram() + xlab("Petal Length")
library(ggplot2)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=Petal.Length)) + geom_histogram() +xlab("Petal Length")
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
just=c("left","top"),y=1,x=0.5)
hist(iris$Petal.Length,xlab='Petal Length')
print(g,vp=vp)
library(ggplot2)
data(iris)
## Plot on the left (standard)
boxplot(iris$Sepal.Width, ylab='Sepal Width')
## Plot on the right (ggplot2)
ggplot(iris, aes(x=factor(0), y=Sepal.Width)) + geom_boxplot() +
xlab("") + ylab("Sepal Width") + theme(axis.text.x=element_blank())
library(ggplot2)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=factor(0),y=Sepal.Width)) + geom_boxplot() +
xlab("") + ylab("Sepal Width") + theme(axis.text.x=element_blank())
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
just=c("left","top"),y=1,x=0.5)
boxplot(iris$Sepal.Width,ylab='Sepal Width')
print(g,vp=vp)
library(ggplot2)
data(iris)
## Plot on the left (standard)
boxplot(Sepal.Length ~ Species, iris, ylab="Sepal.Length")
## Plot on the right (ggplot2)
ggplot(iris,aes(x=Species,y=Sepal.Length)) + geom_boxplot()
library(ggplot2)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=Species,y=Sepal.Length)) + geom_boxplot()
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
just=c("left","top"),y=1,x=0.5)
boxplot(Sepal.Length ~ Species, iris, ylab="Sepal.Length")
print(g,vp=vp)
library(ggplot2)
data(algae, package="DMwR2")
ggplot(algae,aes(x=a1)) + geom_histogram() + facet_grid(size ~ speed)
library(ggplot2)
data(iris)
## Plot on the left (standard)
plot(iris$Sepal.Length,iris$Sepal.Width,
main="Relationship between Sepal Length and Width",
xlab="Sepal Length", ylab="Sepal Width")
## Plot on the right (ggplot2)
ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width)) + geom_point() +
xlab("Sepal Length") + ylab("Sepal Width") +
ggtitle("Relationship between Sepal Length and Width")
library(ggplot2)
library(grid)
data(iris)
g <- ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width)) + geom_point() +
xlab("Sepal Length") + ylab("Sepal Width") +
ggtitle("Relationship between Sepal Length and Width")
par(mfrow=c(1,2))
vp <- viewport(height=unit(1,"npc"),width=unit(0.5,"npc"),
just=c("left","top"),y=1,x=0.5)
plot(iris$Sepal.Length,iris$Sepal.Width,
main="Relationship between Sepal Length and Width",
xlab="Sepal Length", ylab="Sepal Width")
print(g,vp=vp)
data(algae, package="DMwR2")
## Plot on the left (standard)
plot(algae$a1, algae$a2, col=algae$season,
main="Relationship between A1 and A2 by Season",
xlab="A1", ylab="A2")
legend("topright",legend=levels(algae$season),
col=palette()[1:nlevels(algae$season)],pch=1)
## Plot on the right (also base graphics)
plot(algae$a4, algae$a7, pch=as.integer(algae$speed),
     main="Relationship between A4 and A7 by River Speed",
     xlab="A4", ylab="A7")
legend("topright",legend=levels(algae$speed),
       pch=1:nlevels(algae$speed))
par(mfrow=c(1,2))
data(algae, package="DMwR2")
## Plot on the left (standard)
plot(algae$a1, algae$a2, col=algae$season,
main="Relationship between A1 and A2 by Season",
xlab="A1", ylab="A2")
legend("topright",legend=levels(algae$season),
col=palette()[1:nlevels(algae$season)],pch=1)
## Plot on the right (also base graphics)
plot(algae$a4, algae$a7, pch=as.integer(algae$speed),
     main="Relationship between A4 and A7 by River Speed",
     xlab="A4", ylab="A7")
legend("topright",legend=levels(algae$speed),
       pch=1:nlevels(algae$speed))
library(arules)
library(dplyr)
data(Boston,package="MASS")
b <- Boston
b$chas <- factor(b$chas,labels=c("river","noriver"))
b$rad <- factor(b$rad)
b$black <- cut(b$black,breaks=4,labels=c(">31.5%","18.5-31.5%","8-18.5%","<8%"))
discr <- function(x) cut(x,breaks=4, labels=c("low","medLow","medHigh","high"))
b <- select(b,-one_of(c("chas","rad","black"))) %>%
mutate_each(funs(discr)) %>%
bind_cols(select(b,one_of(c("chas","rad","black"))))
b <- as(b,"transactions")
b
## NOTE: "ars" is assumed to hold the rules mined from "b" earlier in the
## chapter; a possible call (the thresholds are illustrative) is:
ars <- apriori(b, parameter=list(support=0.025, confidence=0.75))
somerules <- subset(ars,
                    subset=rhs %in% c("medv=high","medv=medHigh") & confidence>0.75)
library(arulesViz) # needed for plotting association rules
plot(somerules, method="matrix", measure="lift")
set.seed(1234)
randDat <- matrix(rnorm(50), nrow=5)
dist(randDat) # Euclidean distance (default)
dist(randDat, method="manhattan")
dist(randDat, method="minkowski", p=4)
set.seed(1234) # setting a seed for the random number generator
data(iris)
ir3 <- kmeans(iris[,-5], centers=3, iter.max=200) # not using Species info.
ir3
library(cluster) # for silhouette(), used below
set.seed(1234)
d <- dist(iris[,-5])
avgS <- c()
for(k in 2:6) {
    cl <- kmeans(iris[,-5],centers=k,iter.max=200)
    s <- silhouette(cl$cluster,d)
    avgS <- c(avgS,mean(s[,3]))
}
data.frame(nClus=2:6,Silh=avgS)
library(cluster)
set.seed(1234)
pc <- pam(iris[,-5],k=3)
(cm <- table(pc$clustering, iris$Species))
100*(1-sum(diag(cm))/sum(cm))
pc$silinfo$avg.width
set.seed(1234)
d <- dist(scale(iris[,-5]))
methds <- c('complete','single','average')
avgS <- matrix(NA,ncol=3,nrow=5,
               dimnames=list(2:6,methds))
for(k in 2:6)
    for(m in seq_along(methds)) {
        h <- hclust(d,method=methds[m])
        c <- cutree(h,k)
        s <- silhouette(c,d)
        avgS[k-1,m] <- mean(s[,3])
    }
avgS
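## Outlier detection using the Grubbs test (package outliers)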
grubbs.outliers <- function(x, p.thresh=0.05) {
    require(outliers, quietly=TRUE)
    x <- x[!is.na(x)]
    n <- length(x)
    zs <- abs(x - mean(x)) / sd(x)
    outs <- 1 - sapply(zs, function(z) pgrubbs(z, n, type=10))
    posOuts <- which(outs <= p.thresh)
    return(list(zs=zs, pvals=outs,
                outliers=x[posOuts], positions=posOuts))
}
data(algae, package="DMwR2")
grubbs.outliers(algae$a2)$outliers
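## DBSCAN-based outlier detection: points not assigned to any cluster
## (cluster id 0) are returned as outliers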
dbscan.outliers <- function(data, ...) {
    require(fpc, quietly=TRUE)
    cl <- dbscan(data, ...)
    posOuts <- which(cl$cluster == 0)
    list(positions = posOuts,
         outliers = data[posOuts,],
         dbscanResults = cl)
}
library(dplyr)
library(forcats)
data(Glass, package="mlbench")
count(Glass,Type) # a dplyr alternative to "table(Glass$Type)"
g <- mutate(Glass,
            Type=fct_collapse(Type,
                              rare = as.character(c(3,5,6)),
                              normal = as.character(c(1,2,7))))
g %>% count(Type) %>% mutate(prop=100*n/nrow(g))
outs <- dbscan.outliers(g[,-10], eps=1, scale=TRUE)
head(outs$outliers)
nrow(outs$outliers)
slice(g, outs$positions) %>% count(Type)
count(g, Type)
library(DMwR2)
library(dplyr)
og <- outliers.ranking(select(g, -Type))
slice(g, og$rank.outliers[1:40]) %>% count(Type)
library(DMwR2)
library(dplyr)
lof.scores <- lofactor(select(g, -Type),10)
slice(g, order(lof.scores,decreasing=TRUE)[1:40]) %>% count(Type)
library(UBL)
library(dplyr)
count(g,Type)
## Undersampling the largest class
newg <- RandUnderClassif(Type ~ ., g)
count(newg,Type)
## Now specifying the degree of undersampling by hand
newg2 <- RandUnderClassif(Type ~ ., g, list(normal=0.4, rare=1))
count(newg2,Type)
## Oversampling the minority class
newg3 <- RandOverClassif(Type ~ .,g)
count(newg3,Type)
library(DMwR2)
set.seed(1234)
data(iris)
ct1 <- rpartXse(Species ~ ., iris)
ct2 <- rpartXse(Species ~ ., iris, se=0)
library(rpart.plot)
prp(ct1, type=0, extra=101) # left tree
prp(ct2, type=0, extra=101) # right tree
library(rpart.plot)
par( mfrow=c(1,2) )
prp(ct1,type=0,extra=101)
prp(ct2,type=0,extra=101)
par(mfrow = c(1,1))
set.seed(1234)
rndSample <- sample(1:nrow(iris),100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
ct <- rpartXse(Species ~ ., tr, se=0.5)
ps1 <- predict(ct, ts)
head(ps1)
ps2 <- predict(ct, ts, type="class")
head(ps2)
(cm <- table(ps2, ts$Species))
100*(1-sum(diag(cm))/sum(cm)) # the error rate
library(e1071)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
s <- svm(Species ~ ., tr)
ps <- predict(s, ts)
(cm <- table(ps, ts$Species))
100*(1-sum(diag(cm))/sum(cm)) # the error rate
s2 <- svm(Species ~ ., tr, cost=10, kernel="polynomial", degree=3)
ps2 <- predict(s2, ts)
(cm2 <- table(ps2, ts$Species))
100*(1-sum(diag(cm2))/sum(cm2)) # the error rate
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
s1 <- svm(medv ~ ., tr)
ps1 <- predict(s1, ts)
mean(abs(ps1-ts$medv))
s2 <- svm(medv ~ ., tr, kernel="radial", cost=10, epsilon=0.02, gamma=0.01)
ps2 <- predict(s2, ts)
mean(abs(ps2-ts$medv))
library(nnet)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
n <- nnet(Species ~ ., tr, size=6 ,trace=FALSE, maxit=1000)
ps <- predict(n, ts, type="class")
(cm <- table(ps, ts$Species))
100*(1-sum(diag(cm))/sum(cm)) # the error rate
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
nr <- nnet(medv ~ ., tr, linout=TRUE, trace=FALSE, size=6, decay=0.01, maxit=2000)
psnr <- predict(nr, ts)
mean(abs(psnr-ts$medv))
library(ggplot2)
library(NeuralNetTools)
## Feature importance (left graph)
garson(nr) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Network diagram (right graph)
plotnet(nr)
library(h2o)
h2oInstance <- h2o.init(ip="localhost") # start H2O instance locally
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
trH <- as.h2o(iris[rndSample, ],"trH")
tsH <- as.h2o(iris[-rndSample, ],"tsH")
mdl <- h2o.deeplearning(x=1:4, y=5, training_frame=trH)
preds <- h2o.predict(mdl,tsH)[,"predict"]
## The same approach for the Boston regression task, with a deeper network
data(Boston,package="MASS")
rndSample <- sample(1:nrow(Boston), 354)
trH <- as.h2o(Boston[rndSample, ],"trH")
tsH <- as.h2o(Boston[-rndSample, ],"tsH")
mdl <- h2o.deeplearning(x=1:13, y=14, training_frame=trH,
hidden=c(100,100,100, 100), epochs=500)
preds <- as.vector(h2o.predict(mdl,tsH))
library(adabag)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
m <- bagging(Species ~ ., tr, mfinal=500)
ps <- predict(m,ts)
names(ps)
ps$confusion
ps$error*100 # percentage of errors
library(ipred)
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- bagging(medv ~ ., tr, nbagg=500)
ps <- predict(m, ts)
mean(abs(ps-ts$medv))
library(randomForest)
library(DMwR2)
data(BreastCancer, package="mlbench")
bc <- cbind(knnImputation(BreastCancer[,-c(1,11)]), # column 1 is an ID
Class=BreastCancer$Class)
set.seed(1234)
rndSample <- sample(1:nrow(bc), 500)
tr <- bc[rndSample, ]
ts <- bc[-rndSample, ]
m <- randomForest(Class ~ ., tr, ntree=750)
ps <- predict(m, ts)
(cm <- table(ps, ts$Class))
100*(1-sum(diag(cm))/sum(cm)) # the error rate
library(adabag)
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
m1 <- boosting(Species ~ ., tr, mfinal=500) # AdaBoost.M1
ps1 <- predict(m1,ts)
ps1$confusion
ps1$error*100
m2 <- boosting(Species ~ ., tr, coeflearn="Zhu", mfinal=500) # SAMME
ps2 <- predict(m2,ts)
ps2$confusion
ps2$error*100
library(gbm)
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- gbm(medv ~ ., data=tr, n.trees=5000)
ps <- predict(m, ts, n.trees=5000)
mean(abs(ps-ts$medv))
data(iris)
set.seed(1234)
rndSample <- sample(1:nrow(iris), 100)
tr <- iris[rndSample, ]
ts <- iris[-rndSample, ]
m <- gbm(Species ~ ., data=tr, n.trees=10000,
cv.folds=5, n.cores=4)
(best <- gbm.perf(m, plot.it=FALSE, method="cv"))
ps <- predict(m, ts, n.trees=best, type="response")[,,1]
ps <- as.factor(colnames(ps)[max.col(ps)])
(cm <- table(ps, ts$Species))
100*(1-sum(diag(cm))/sum(cm)) # the error rate
library(performanceEstimation)
library(DMwR2)
data(Boston, package="MASS")
r <- performanceEstimation(
PredTask(medv ~ ., Boston),
workflowVariants(learner="rpartXse",
learner.pars=list(se=c(0,0.25,0.5,1,2))),
EstimationTask(metrics=c("mse","mae"),
method=CV(nReps=3,nFolds=10))
)
rankWorkflows(r, top=3)
library(performanceEstimation)
library(DMwR2)
library(e1071)
data(iris)
data(BreastCancer, package="mlbench")
bc <- cbind(knnImputation(BreastCancer[,-c(1,11)]),
Class=BreastCancer$Class)
r <- performanceEstimation(
c(PredTask(Species ~ ., iris),
PredTask(Class ~ ., bc)),
workflowVariants(learner="svm",
learner.pars=list(cost=c(1,5,10),
gamma=c(0.01,0.001))),
EstimationTask(metrics="acc",
method=Bootstrap(nReps=200,type=".632"))
)
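## The results can be inspected as before, e.g.:
## rankWorkflows(r, top=3)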