# This script file contains all the R code from every section of this chapter.
# Density-scaled histogram of mxPH with a red kernel density curve and a rug
# of the individual observations.
ggplot(algae, aes(x = mxPH)) +
  # after_stat(density) replaces the deprecated ..density.. notation.
  geom_histogram(aes(y = after_stat(density))) +
  geom_density(color = "red") +
  geom_rug() +
  ggtitle("The Histogram of mxPH (maximum pH)") +
  xlab("") +
  ylab("")
library(car)
# Normal Q-Q plot of mxPH; the y-axis label is blanked out.
qqPlot(
  algae$mxPH,
  ylab = "",
  main = "Normal QQ plot of maximum pH"
)
library(car)
# Histogram of mxPH (density scale) with density curve and rug, stored in gh
# so it can be printed into a viewport further down.
gh <- ggplot(algae, aes(x = mxPH)) +
  # after_stat(density) replaces the deprecated ..density.. notation.
  geom_histogram(aes(y = after_stat(density))) +
  geom_density(color = "red") +
  geom_rug() +
  ggtitle("The Histogram of mxPH (maximum pH)") +
  xlab("") +
  ylab("")
# Split the base-graphics device into two side-by-side panels. The previous
# settings are kept so they can be restored once the figure is drawn;
# otherwise every later base plot would be squeezed into this 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
# Viewport covering the left half of the device. viewport() and unit() belong
# to the grid package, which loading ggplot2 does not attach, so they are
# called through the grid namespace.
vpL <- grid::viewport(height = grid::unit(1, "npc"),
                      width = grid::unit(0.5, "npc"),
                      just = "left",
                      y = 0.5, x = 0)
# NOTE(review): statement order matters here. The first qqPlot() occupies the
# left panel, the ggplot is then printed into the left-half viewport
# (presumably drawing over that first QQ plot), and the second qqPlot() lands
# in the right panel -- confirm on the target graphics device.
qqPlot(algae$mxPH, main = 'Normal QQ plot of maximum pH')
print(gh, vp = vpL)
qqPlot(algae$mxPH, main = 'Normal QQ plot of maximum pH', ylab = "")
# Restore the layout that was active before this figure.
par(old_par)
# Box plot of oPO4 (a single dummy group on the x axis) with a rug of the
# observations and a dashed red horizontal line at the mean of oPO4.
ggplot(algae, aes(x = factor(0), y = oPO4)) +
  geom_boxplot() +
  geom_rug() +
  # Inside aes() the column is looked up in the plot data, so it is referenced
  # by bare name; ggplot2 discourages the algae$oPO4 form in aesthetics.
  geom_hline(aes(yintercept = mean(oPO4, na.rm = TRUE)),
             linetype = 2, colour = "red") +
  ylab("Orthophosphate (oPO4)") +
  xlab("") +
  # No breaks: the dummy factor level is not shown on the x axis.
  scale_x_discrete(breaks = NULL)
# Same oPO4 box plot: one dummy group, rug marks, and a dashed red line at
# the mean (missing values ignored).
ggplot(algae, aes(x = factor(0), y = oPO4)) +
  geom_boxplot() +
  geom_rug() +
  # Use the bare column name in aes() rather than algae$oPO4, which ggplot2
  # flags as discouraged.
  geom_hline(aes(yintercept = mean(oPO4, na.rm = TRUE)),
             linetype = 2, colour = "red") +
  ylab("Orthophosphate (oPO4)") +
  xlab("") +
  scale_x_discrete(breaks = NULL)
# Index plot of NH4 with three horizontal reference lines, followed by
# interactive identification of points.
plot(algae$NH4, xlab = "")
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
# Line types: 1 = mean, 2 = mean plus one standard deviation, 3 = median.
abline(h = mean(algae$NH4, na.rm = TRUE), lty = 1)
abline(h = mean(algae$NH4, na.rm = TRUE) + sd(algae$NH4, na.rm = TRUE),
       lty = 2)
abline(h = median(algae$NH4, na.rm = TRUE), lty = 3)
# Interactive: needs an open graphics device to pick points on.
identify(algae$NH4)
library(forcats)
# Move the listed levels of size, speed and season to the front, in the order
# given, so tables and plots follow that order.
algae <- mutate(
  algae,
  size   = fct_relevel(size, c("small", "medium", "large")),
  speed  = fct_relevel(speed, c("low", "medium", "high")),
  season = fct_relevel(season, c("spring", "summer", "autumn", "winter"))
)
# Violin plot of a1 for each river size, with the individual observations
# overlaid as jittered points.
ggplot(algae, aes(x = size, y = a1)) +
  geom_violin() +
  geom_jitter() +
  xlab("River Size") +
  ylab("Algal A1")
# Keep the rows with a known mnO2 and bin mnO2 into its four quartile
# intervals (include.lowest keeps the minimum inside the first interval).
data2graph <- filter(algae, !is.na(mnO2)) %>%
  mutate(minO2 = cut(mnO2,
                     quantile(mnO2, c(0, 0.25, 0.5, 0.75, 1)),
                     include.lowest = TRUE))
# a3 against season, coloured by season, one panel per mnO2 interval.
ggplot(data2graph, aes(x = a3, y = season, color = season)) +
  geom_point() +
  facet_wrap(~ minO2) +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated.
  guides(color = "none")
# Rows with a known mnO2, plus a factor minO2 holding the quartile interval
# each mnO2 value falls into.
data2graph <- filter(algae, !is.na(mnO2)) %>%
  mutate(minO2 = cut(mnO2,
                     quantile(mnO2, c(0, 0.25, 0.5, 0.75, 1)),
                     include.lowest = TRUE))
# Disabled alternative that builds the same columns with base R subsetting
# and cbind():
#data2graph <- algae[!is.na(algae$mnO2),]
#data2graph <- cbind(data2graph,
# minO2=cut(data2graph$mnO2,
# quantile(data2graph$mnO2,c(0,0.25,.5,.75,1)),
# include.lowest=TRUE))
# a3 against season, one panel per mnO2 interval. The aesthetic is spelled
# "color" in both aes() and guides() for consistency, and the legend is
# removed with "none" because guides(color = FALSE) is deprecated.
ggplot(data2graph, aes(x = a3, y = season, color = season)) +
  geom_point() +
  facet_wrap(~ minO2) +
  guides(color = "none")
library(corrplot)
# Correlation matrix of columns 4 to 18 of algae, computed from the rows that
# have no missing values in those columns.
cm <- cor(algae[, 4:18], use = "complete.obs")
# First pass draws the upper triangle with the variable labels on the
# diagonal.
corrplot(cm, tl.pos = "d", type = "upper")
# Second pass is added to the same plot: lower triangle as numbers, without
# the diagonal, text labels or colour legend.
corrplot(
  cm,
  method = "number",
  type = "lower",
  add = TRUE,
  diag = FALSE,
  cl.pos = "n",
  tl.pos = "n"
)
library(corrplot)
# Same correlation plot, with the text labels scaled to 0.75 (tl.cex).
cm <- cor(algae[, 4:18], use = "complete.obs")
# Upper triangle, labels on the diagonal.
corrplot(cm, tl.cex = 0.75, tl.pos = "d", type = "upper")
# Lower triangle added on top as numbers; no diagonal, labels or legend.
corrplot(
  cm,
  method = "number",
  type = "lower",
  add = TRUE,
  diag = FALSE,
  tl.cex = 0.75,
  cl.pos = "n",
  tl.pos = "n"
)
# Load the algae data set shipped with DMwR2 and drop the rows that
# manyNAs() flags.
data(algae, package = "DMwR2")
# data(..., package =) does not attach the package, so manyNAs() is called
# through its namespace. Rows are selected with setdiff() rather than
# negative indexing, because algae[-integer(0), ] would return zero rows if
# no row were flagged.
algae <- algae[setdiff(seq_len(nrow(algae)), DMwR2::manyNAs(algae)), ]
# Estimate PO4 from an oPO4 value using the linear relation
# PO4 = 42.897 + 1.293 * oPO4 (coefficients presumably come from a model
# fitted elsewhere -- verify). Vectorised; returns NA wherever oP is NA.
fillPO4 <- function(oP) {
  input_missing <- is.na(oP)
  ifelse(input_missing, NA, 42.897 + 1.293 * oP)
}
# Replace the missing PO4 values with the estimate fillPO4() derives from the
# oPO4 value of the same row (the result stays NA where oPO4 is missing too).
# fillPO4() is vectorised, so it is applied to the column vector directly.
# Unlike sapply() over a data-frame subset, this does not depend on whether
# algae is a data.frame or a tibble, and it is a no-op when nothing is
# missing.
po4.missing <- is.na(algae[["PO4"]])
algae[["PO4"]][po4.missing] <- fillPO4(algae[["oPO4"]][po4.missing])
library(ggplot2)
library(forcats)
# Put the listed levels of size, speed and season first, in the given order,
# so the facets below appear in that order.
algae <- mutate(
  algae,
  size   = fct_relevel(size, c("small", "medium", "large")),
  speed  = fct_relevel(speed, c("low", "medium", "high")),
  season = fct_relevel(season, c("spring", "summer", "autumn", "winter"))
)
# Histogram of mxPH with bins 0.5 wide, one panel per season.
ggplot(algae, aes(x = mxPH)) +
  geom_histogram(binwidth = 0.5) +
  facet_wrap(~ season)
library(ggplot2)
# Reorder the factor levels (listed levels first, in the order written).
algae <- mutate(
  algae,
  size   = fct_relevel(size, c("small", "medium", "large")),
  speed  = fct_relevel(speed, c("low", "medium", "high")),
  season = fct_relevel(season, c("spring", "summer", "autumn", "winter"))
)
# Per-season histograms of mxPH using a bin width of 0.5.
ggplot(algae, aes(x = mxPH)) +
  geom_histogram(binwidth = 0.5) +
  facet_wrap(~ season)
# Error metrics for the a1 predictions of the linear model (lm) and the
# regression tree (rt). The outer parentheses print each value as it is
# assigned.

# Mean absolute error.
(mae.a1.lm <- mean(abs(algae[["a1"]] - lm.predictions.a1)))
(mae.a1.rt <- mean(abs(algae[["a1"]] - rt.predictions.a1)))

# Mean squared error.
(mse.a1.lm <- mean((algae[["a1"]] - lm.predictions.a1)^2))
(mse.a1.rt <- mean((algae[["a1"]] - rt.predictions.a1)^2))

# Normalised MSE: the model's MSE divided by the MSE obtained when always
# predicting the mean of a1.
(nmse.a1.lm <- mean((algae[["a1"]] - lm.predictions.a1)^2) /
   mean((algae[["a1"]] - mean(algae[["a1"]]))^2))
(nmse.a1.rt <- mean((algae[["a1"]] - rt.predictions.a1)^2) /
   mean((algae[["a1"]] - mean(algae[["a1"]]))^2))
library(ggplot2)
# Collect both sets of predictions next to the true a1 values.
dg <- data.frame(
  lm.a1   = lm.predictions.a1,
  rt.a1   = rt.predictions.a1,
  true.a1 = algae[["a1"]]
)
# Predicted against true values for the linear model; the red line is y = x,
# where a perfect prediction would fall.
ggplot(dg, aes(x = lm.a1, y = true.a1)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  ggtitle("Linear Model")
# The same plot for the regression tree.
ggplot(dg, aes(x = rt.a1, y = true.a1)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  ggtitle("Regression Tree")
# Base-graphics scatter plot of the linear model's predictions against the
# true a1 values, with the y = x reference line in red.
plot(
  lm.predictions.a1, algae[["a1"]],
  xlab = "Predictions",
  ylab = "True Values",
  main = "Linear Model"
)
abline(a = 0, b = 1, col = "red")
# Interactive: show the rows of algae for the points picked on the plot.
algae[identify(lm.predictions.a1, algae[["a1"]]), ]
# Clamp negative linear-model predictions to zero.
sensible.lm.predictions.a1 <- pmax(lm.predictions.a1, 0)
# Mean absolute error before and after the clamping (both are printed).
(mae.a1.lm <- mean(abs(algae[["a1"]] - lm.predictions.a1)))
(smae.a1.lm <- mean(abs(algae[["a1"]] - sensible.lm.predictions.a1)))
library(performanceEstimation)
# Estimate the NMSE of four workflows on the task of predicting a1 from
# columns 1 to 12 of algae, using 5 repetitions of 10-fold cross-validation:
#   - lm with "knnImp" pre-processing and "onlyPos" post-processing;
#   - rpartXse with se set to 0, 0.5 and 1 (one variant each).
res <- performanceEstimation(
  PredTask(a1 ~ ., algae[, 1:12], "a1"),
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    )
  ),
  EstimationTask(
    metrics = "nmse",
    method = CV(nReps = 5, nFolds = 10)
  )
)
# Build one prediction task per target column (columns 12 to 18 of algae).
# Each task's data holds the first 11 columns plus that single target, and
# the task is named after the target.
DSs <- sapply(
  names(algae)[12:18],
  function(target, predictors) {
    task.formula <- as.formula(paste(target, "~ ."))
    task.data <- algae[, c(predictors, target)]
    PredTask(task.formula, task.data, target, copy = TRUE)
  },
  # Passed through sapply()'s ... as the second argument of the function.
  names(algae)[1:11]
)
# Run the same comparison (lm plus three rpartXse variants, NMSE estimated by
# 5 x 10-fold cross-validation) on every task in DSs.
res.all <- performanceEstimation(
  DSs,
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    )
  ),
  EstimationTask(
    metrics = "nmse",
    method = CV(nReps = 5, nFolds = 10)
  )
)
library(randomForest)
# Repeat the comparison over all tasks in DSs, this time adding three random
# forest variants (ntree = 200, 500, 700, with "knnImp" pre-processing) to the
# lm workflow and the three rpartXse variants. This overwrites res.all.
res.all <- performanceEstimation(
  DSs,
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    ),
    workflowVariants(
      learner = "randomForest",
      pre = "knnImp",
      learner.pars = list(ntree = c(200, 500, 700))
    )
  ),
  EstimationTask(
    metrics = "nmse",
    method = CV(nReps = 5, nFolds = 10)
  )
)
# For every task in res.all, fetch the workflow that topPerformer() reports
# for the "nmse" metric; the result is named by task.
wfs <- sapply(
  taskNames(res.all),
  function(task.name) {
    topPerformer(res.all, metric = "nmse", task = task.name)
  }
)
# Show the selected workflows for tasks a1 and a7.
wfs[["a1"]]
wfs[["a7"]]
# Attach the true target values (algae.sols) to the test predictors.
full.test.algae <- cbind(test.algae, algae.sols)
# pts[case, target, "trues" / "preds"] stores the true and predicted value of
# each of the 7 targets for every test case. The number of cases is read from
# the test set instead of being hard-coded to 140, so the array always matches
# the data it is filled from.
n.test <- nrow(full.test.algae)
pts <- array(dim = c(n.test, 7, 2),
             dimnames = list(seq_len(n.test),
                             paste0("a", 1:7),
                             c("trues", "preds")))
for (i in 1:7) {
  # Run the i-th selected workflow with algae as training data and the test
  # set as test data, keeping the first 11 columns plus the i-th target
  # (column 11 + i).
  res <- runWorkflow(wfs[[i]],
                     as.formula(paste(names(wfs)[i], "~.")),
                     algae[, c(1:11, 11 + i)],
                     full.test.algae[, c(1:11, 11 + i)])
  pts[, i, "trues"] <- res$trues
  pts[, i, "preds"] <- res$preds
}