How to evaluate sentiment classifiers for Twitter time-ordered data?

In : PLOS ONE, (13), 3, pp. e0194317, 2018

Reproducibility of the Results in the Paper

To reproduce the results shown in the paper (both Figures and Tables) you need to have a current working installation of R (www.r-project.org).

On top of this installation of R, you also need to install a series of extra packages, which we will load below.

The following code snipet includes basic settings that you need to execute before proceding with replicating our results. Among these settings you have the loading of the necessary extra packages (that you must first install before executing this code), and also the loading of a script file that you can download here.

## Font size used on graphs
## txtSz <- 14  # this was used for the PDF figures of the paper
txtSz <- 10

## required packages
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(performanceEstimation)
library(tidyr)

## Loading a script containing some auxiliary functions
source("auxFuncs.R")

You also need to download the Rdata files that contain the R objects with the results of the estimation procedures we compare in the paper. Download the two following files:

The results concerning the Alpha Metric
The results concerning the F Metric

In summary, you should now have on your current R working directory the following files:

kScores.Rdata
fScores.Rdata
auxFuncs.R
And you should have executed the above code snipet, after installing the packages that are loaded therein using the library()function.

Table 4

The following code produces the results show in Table 4 of the paper. This code will show the table in this page, but for the paper we generated a Latex version of the table, whose code is commented in the code.

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

medScores <- group_by(res,Strat2,Lang) %>% 
  summarise(medDif = median(dif2Gold)) %>% 
  ungroup()
tab4 <- spread(medScores,Strat2,medDif)
tab4[,-1] <- round(tab4[,-1],3)
## Last row with medians
tab4 <- rbind(tab4, c("Median",round(apply(tab4[,-1],2,median),3)))
## Table 4
kable(tab4)

Lang	xval(9:1, strat, block)	xval(9:1, no-strat, block)	xval(9:1, strat, rand)	seq(9:1, 20, equi)	seq(9:1, 10, equi)	seq(2:1, 10, semi-equi)
alb	0.052	0.036	0.206	0.001	0.001	0.001
bul	0.009	0.013	0.046	-0.019	-0.025	-0.043
eng	-0.016	-0.017	-0.01	-0.04	-0.042	-0.039
ger	0.037	0.049	0.059	0.009	0.01	0.001
hun	0.009	0.013	0.025	-0.011	-0.007	-0.007
pol	0.011	0.016	0.054	-0.02	-0.017	-0.031
por	-0.048	-0.048	-0.015	-0.04	-0.045	-0.085
rus	0.008	0.008	0.029	-0.027	-0.029	-0.045
scb	-0.046	-0.051	0.026	-0.047	-0.043	-0.069
slk	0.018	0.015	0.055	-0.025	-0.023	-0.039
slv	0.003	-0.004	0.04	-0.029	-0.026	-0.031
spa	-0.008	0.031	0.07	0.012	0.011	-0.011
swe	0.055	0.057	0.106	0.011	0.006	-0.028
Median	0.009	0.013	0.046	-0.02	-0.023	-0.031

## Latex version of the table (uncomment next line to get the latex code)
## Hmisc:::latex(tab4,file="",rowname=NULL)

Table 5

Similar code, now for Table 5.

metr <- "F"
## Loading the results of the selected metric 
loadAndPrepare(metr)

medScores <- group_by(res,Strat2,Lang) %>% 
  summarise(medDif = median(dif2Gold)) %>% ungroup()
tab5 <- spread(medScores,Strat2,medDif)
tab5[,-1] <- round(tab5[,-1],3)
## Last row with medians
tab5 <- rbind(tab5, c("Median",round(apply(tab5[,-1],2,median),3)))
kable(tab5)

Lang	xval(9:1, strat, block)	xval(9:1, no-strat, block)	xval(9:1, strat, rand)	seq(9:1, 20, equi)	seq(9:1, 10, equi)	seq(2:1, 10, semi-equi)
alb	0.026	0.016	0.137	-0.014	-0.007	-0.009
bul	0.02	0.024	0.047	0.003	-0.002	-0.019
eng	-0.019	-0.02	-0.015	-0.027	-0.027	-0.028
ger	0.056	0.058	0.072	0.025	0.028	0.014
hun	0.022	0.022	0.03	-0.006	-0.009	-0.005
pol	0.013	0.02	0.044	-0.001	0	-0.007
por	-0.05	-0.045	-0.04	-0.049	-0.056	-0.092
rus	0.008	0.01	0.025	-0.019	-0.018	-0.021
scb	-0.034	-0.037	0	-0.03	-0.032	-0.05
slk	0.005	0.008	0.025	-0.013	-0.015	-0.013
slv	0.003	0	0.029	-0.022	-0.026	-0.032
spa	-0.001	0.024	0.06	0.007	0.01	0.012
swe	0.03	0.037	0.071	0.008	0.006	-0.011
Median	0.008	0.016	0.03	-0.013	-0.009	-0.013

## Latex version of the table (uncomment next line to get the latex code)
## Hmisc:::latex(tab5,file="",rowname=NULL)

Figure 3

The following code reproduces the graph shown in Figure 3.

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## dif2Gold is = Estimate - GoldStandard
g <- ggplot(res, aes(x=Lang,y=dif2Gold,col=Lang)) + 
  geom_boxplot() + geom_hline(yintercept=0, alpha=0.6) +
  facet_wrap(~ Strat2) + 
  guides(col=FALSE) + 
  xlab("") +
  ylab("Distribution of the Difference to Gold Standard") +
  theme(text=element_text(size=txtSz), axis.text.x=element_text(angle=90,vjust=0.3))

## THe graph of figure 3
g

Figure 4

The following code reproduces the graph shown in Figure 4.

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## dif2Gold is = Estimate - GoldStandard
g <- ggplot(res, aes(x=Strat2,y=dif2Gold)) + 
  geom_boxplot() + geom_hline(yintercept=0, col= "red", alpha=0.5, lwd=1.1) +
  guides(col=FALSE) + ylab("Distribution of the Difference to Gold Standard") + 
  xlab("") +
  theme(text=element_text(size=txtSz))
#  theme(text=element_text(size=txtSz), axis.text.x=element_text(angle=25,vjust=0.3))

## Graph on figure 4
g

Figure 5

The following code reproduces the graph shown in Figure 5.

metr <- "F"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## dif2Gold is = Estimate - GoldStandard
g <- ggplot(res, aes(x=Strat2,y=dif2Gold)) + 
  geom_boxplot() + geom_hline(yintercept=0, col= "red", alpha=0.5, lwd=1.1) +
  guides(col=FALSE) + ylab("Distribution of the Difference to Gold Standard") + 
  xlab("") +
  theme(text=element_text(size=txtSz))
#  theme(text=element_text(size=txtSz), axis.text.x=element_text(angle=25,vjust=0.3))

## Graph on figure 5
g

Figure 6

The following code reproduces the graph shown in Figure 6.

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

thrsh1 <- 5
thrsh2 <- 30
d1 <- select(res,Strat2,Lang,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1), 
         ok = between(percDiff,-thrsh1, thrsh1)) %>% 
  filter(ok) %>%
  group_by(Strat2,Lang) %>% tally()
d2 <- select(res,Strat2,Lang,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1), 
         bad = !between(percDiff,-thrsh2, thrsh2)) %>% 
  filter(bad) %>%
  group_by(Strat2,Lang) %>% tally()
d <- select(res,Strat2,Lang,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1)) %>% 
  group_by(Strat2,Lang) %>% tally()
tots <- left_join(d,d1,by=c("Strat2","Lang")) %>% 
  rename("Total" = n.x, "TotalOK" = n.y) %>% 
  left_join(d2,by=c("Strat2","Lang")) %>%
  rename("TotalBad" = n) 
tots[is.na(tots)] <- 0
percs <- mutate(tots, OK=100*TotalOK/Total, 
         Bad=100*TotalBad/Total, 
         Rest=100-OK-Bad) %>% 
  select(Strat2,Lang,OK,Bad,Rest)
dt <- gather(percs,Type,Perc,OK:Rest)
dt$Type <- factor(dt$Type,levels=c("Bad","Rest","OK"),
                  labels=c(paste(">",thrsh2),paste("[",thrsh1,",",thrsh2,"]"),paste("<",thrsh1)))
g <- ggplot(dt,aes(x=Lang,y=Perc,fill=Type)) + 
  geom_col(position="fill") + 
  facet_wrap(~ Strat2) +
  ylab(paste("Proportion of Iterations with Difference to Gold Standard between",thrsh1,"and",thrsh2))
  xlab("") +
  theme(text=element_text(size=txtSz), axis.text.x=element_text(angle=90,vjust=0.3))

## NULL

## Graph on screen
g

Figure 7

The following code reproduces the graph shown in Figure 7.

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

thrsh1 <- 5
thrsh2 <- 30
d1 <- select(res,Strat2,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1), 
         ok = between(percDiff,-thrsh1, thrsh1)) %>% 
  filter(ok) %>%
  group_by(Strat2) %>% tally()
d2 <- select(res,Strat2,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1), 
         bad = !between(percDiff,-thrsh2, thrsh2)) %>% 
  filter(bad) %>%
  group_by(Strat2) %>% tally()
d <- select(res,Strat2,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1)) %>% 
  group_by(Strat2) %>% tally()
tots <- left_join(d,d1,by=c("Strat2")) %>% 
  rename("Total" = n.x, "TotalOK" = n.y) %>% 
  left_join(d2,by=c("Strat2")) %>%
  rename("TotalBad" = n) 
tots[is.na(tots)] <- 0
percs <- mutate(tots, OK=100*TotalOK/Total, 
         Bad=100*TotalBad/Total, 
         Rest=100-OK-Bad) %>% 
  select(Strat2,OK,Bad,Rest)
dt <- gather(percs,Type,Perc,OK:Rest)
dt$Type <- factor(dt$Type,levels=c("Bad","Rest","OK"),
                  labels=c(paste(">",thrsh2),paste("[",thrsh1,",",thrsh2,"]"),paste("<",thrsh1)))
g <- ggplot(dt,aes(x=Strat2,y=Perc,fill=Type)) + 
  geom_col(position="fill") +
  ylab(paste("Proportion of Differences to Gold Standard between",thrsh1,"and",thrsh2)) + 
  xlab("") +
  theme(text=element_text(size=txtSz), axis.text.x=element_text(angle=25,vjust=0.3))

## Graph on screen
g

Figure 8

The following code reproduces the graph shown in Figure 8.

metr <- "F"
## Loading the results of the selected metric 
loadAndPrepare(metr)

thrsh1 <- 5
thrsh2 <- 30
d1 <- select(res,Strat2,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1), 
         ok = between(percDiff,-thrsh1, thrsh1)) %>% 
  filter(ok) %>%
  group_by(Strat2) %>% tally()
d2 <- select(res,Strat2,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1), 
         bad = !between(percDiff,-thrsh2, thrsh2)) %>% 
  filter(bad) %>%
  group_by(Strat2) %>% tally()
d <- select(res,Strat2,Iter,rat2Gold) %>% 
  mutate(percDiff=100*(rat2Gold-1)) %>% 
  group_by(Strat2) %>% tally()
tots <- left_join(d,d1,by=c("Strat2")) %>% 
  rename("Total" = n.x, "TotalOK" = n.y) %>% 
  left_join(d2,by=c("Strat2")) %>%
  rename("TotalBad" = n) 
tots[is.na(tots)] <- 0
percs <- mutate(tots, OK=100*TotalOK/Total, 
         Bad=100*TotalBad/Total, 
         Rest=100-OK-Bad) %>% 
  select(Strat2,OK,Bad,Rest)
dt <- gather(percs,Type,Perc,OK:Rest)
dt$Type <- factor(dt$Type,levels=c("Bad","Rest","OK"),
                  labels=c(paste(">",thrsh2),paste("[",thrsh1,",",thrsh2,"]"),paste("<",thrsh1)))
g <- ggplot(dt,aes(x=Strat2,y=Perc,fill=Type)) + 
  geom_col(position="fill") +
  ylab(paste("Proportion of Differences to Gold Standard between",thrsh1,"and",thrsh2)) + 
  xlab("") +
  theme(text=element_text(size=txtSz), axis.text.x=element_text(angle=25,vjust=0.3))

## Graph on screen
g

Figure 9

The following code reproduces the graph shown in Figure 9.

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## The source raw data for the tests (the results on each iteration)
rawScores <- mutate(res, absDif=abs(dif2Gold)) %>% select(Strat2, Lang, absDif)
## Now building a performanceEstimation ComparisonResults object
ds <- unique(rawScores$Lang)  # different languages correspond to diff data sets
nds <- length(ds)
tasks <- sapply(ds, function(d) PredTask(x ~.,data.frame(x=1:2,y=2:3),d))
al <- unique(as.character(rawScores$Strat2)) # different strategies correspond to diff algorithms
nal <- length(al)
workflows <- sapply(al, function(a) Workflow(wfID=a))
estTask <- EstimationTask(metrics=metr)
peRes <- vector("list",nds)
names(peRes) <- ds
for(i in 1:nds) {
  peRes[[i]] <- vector("list",nal)
  names(peRes[[i]]) <- al
}
for(d in ds) {
  for(a in al) {
    peRes[[d]][[a]] <- EstimationResults(tasks[[d]],workflows[[a]],estTask,
                                         as.matrix(filter(rawScores,Lang==d,Strat2==a) %>% select(absDif)),list())
  }
}
obj <- ComparisonResults(peRes)

## Now onto the tests
comp <- pairedComparisons(obj, baseline="xval(9:1, strat, rand)")

CDdiagram.Nemenyi(comp)

Figure 10

The following code reproduces the graph shown in Figure 10.

metr <- "F"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## The source raw data for the tests (the results on each iteration)
rawScores <- mutate(res, absDif=abs(dif2Gold)) %>% select(Strat2, Lang, absDif)
## Now building a performanceEstimation ComparisonResults object
ds <- unique(rawScores$Lang)  # different languages correspond to diff data sets
nds <- length(ds)
tasks <- sapply(ds, function(d) PredTask(x ~.,data.frame(x=1:2,y=2:3),d))
al <- unique(as.character(rawScores$Strat2)) # different strategies correspond to diff algorithms
nal <- length(al)
workflows <- sapply(al, function(a) Workflow(wfID=a))
estTask <- EstimationTask(metrics=metr)
peRes <- vector("list",nds)
names(peRes) <- ds
for(i in 1:nds) {
  peRes[[i]] <- vector("list",nal)
  names(peRes[[i]]) <- al
}
for(d in ds) {
  for(a in al) {
    peRes[[d]][[a]] <- EstimationResults(tasks[[d]],workflows[[a]],estTask,
                                         as.matrix(filter(rawScores,Lang==d,Strat2==a) %>% select(absDif)),list())
  }
}
obj <- ComparisonResults(peRes)

## Now onto the tests
comp <- pairedComparisons(obj, baseline="xval(9:1, strat, rand)")

CDdiagram.Nemenyi(comp)

Results supporting the graphs of Figure 11

The following code carries out the Wilcoxon signed rank tests supporting the graphs shown in Figure 11.

First for the F metric

metr <- "F"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## The source raw data for the tests (the results on each iteration)
rawScores <- mutate(res, absDif=abs(dif2Gold)) %>% select(Strat2, Lang, absDif)
## Now building a performanceEstimation ComparisonResults object
task <- PredTask(x ~.,data.frame(x=1:2,y=2:3)) # a fake task
al <- unique(as.character(rawScores$Strat2)) # different strategies correspond to diff algorithms
nal <- length(al)
workflows <- sapply(al, function(a) Workflow(wfID=a))
estTask <- EstimationTask(metrics="alpha")
## One single data set
nds <- 1
peRes3 <- vector("list",nds)
names(peRes3) <- "Langs"
for(i in 1:nds) {
  peRes3[[i]] <- vector("list",nal)
  names(peRes3[[i]]) <- al
}

avgScores <- group_by(rawScores,Strat2,Lang) %>% summarize(avgAbsDif = mean(absDif)) %>% ungroup()
for(a in al) {
  peRes3[[1]][[a]] <- EstimationResults(task,workflows[[a]],estTask, as.matrix(filter(avgScores,Strat2==a) %>% select(avgAbsDif)),list())
}
objW <- ComparisonResults(peRes3)

## Now onto the tests

## Using xval(9:1, strat, block) as baseline
compX <- pairedComparisons(objW, baseline="xval(9:1, strat, block)")
compX$alpha$WilcoxonSignedRank.test

## , , Langs
## 
##                              MedScore DiffMedScores    p.value
## xval(9:1, strat, block)    0.02740000            NA         NA
## xval(9:1, strat, rand)     0.05042105   -0.02302105 0.01049805
## xval(9:1, no-strat, block) 0.02625000    0.00115000 0.02661133
## seq(2:1, 10, semi-equi)    0.03200000   -0.00460000 0.24389648
## seq(9:1, 20, equi)         0.03050000   -0.00310000 0.73535156
## seq(9:1, 10, equi)         0.03200000   -0.00460000 0.41430664

## Using xval(9:1, strat, block) as baseline
compS <- pairedComparisons(objW, baseline="seq(9:1, 20, equi)")
compS$alpha$WilcoxonSignedRank.test

## , , Langs
## 
##                              MedScore DiffMedScores    p.value
## seq(9:1, 20, equi)         0.03050000            NA         NA
## xval(9:1, strat, block)    0.02740000    0.00310000 0.73535156
## xval(9:1, strat, rand)     0.05042105   -0.01992105 0.02148438
## xval(9:1, no-strat, block) 0.02625000    0.00425000 0.68481445
## seq(2:1, 10, semi-equi)    0.03200000   -0.00150000 0.05901713
## seq(9:1, 10, equi)         0.03200000   -0.00150000 0.02536986

Now for the Alpha metric

metr <- "Alpha"
## Loading the results of the selected metric 
loadAndPrepare(metr)

## The source raw data for the tests (the results on each iteration)
rawScores <- mutate(res, absDif=abs(dif2Gold)) %>% select(Strat2, Lang, absDif)
## Now building a performanceEstimation ComparisonResults object
task <- PredTask(x ~.,data.frame(x=1:2,y=2:3)) # a fake task
al <- unique(as.character(rawScores$Strat2)) # different strategies correspond to diff algorithms
nal <- length(al)
workflows <- sapply(al, function(a) Workflow(wfID=a))
estTask <- EstimationTask(metrics="alpha")
## One single data set
nds <- 1
peRes3 <- vector("list",nds)
names(peRes3) <- "Langs"
for(i in 1:nds) {
  peRes3[[i]] <- vector("list",nal)
  names(peRes3[[i]]) <- al
}

avgScores <- group_by(rawScores,Strat2,Lang) %>% summarize(avgAbsDif = mean(absDif)) %>% ungroup()
for(a in al) {
  peRes3[[1]][[a]] <- EstimationResults(task,workflows[[a]],estTask, as.matrix(filter(avgScores,Strat2==a) %>% select(avgAbsDif)),list())
}
objW <- ComparisonResults(peRes3)

## Now onto the tests

## Using xval(9:1, strat, block) as baseline
compX <- pairedComparisons(objW, baseline="xval(9:1, strat, block)")
compX$alpha$WilcoxonSignedRank.test

## , , Langs
## 
##                              MedScore DiffMedScores     p.value
## xval(9:1, strat, block)    0.03327273            NA          NA
## xval(9:1, strat, rand)     0.06184211  -0.028569378 0.001708984
## xval(9:1, no-strat, block) 0.03454545  -0.001272727 0.094238281
## seq(2:1, 10, semi-equi)    0.03533333  -0.002060606 0.339599609
## seq(9:1, 20, equi)         0.03140000   0.001872727 0.892578125
## seq(9:1, 10, equi)         0.02980000   0.003472727 0.497314453

## Using xval(9:1, strat, block) as baseline
compS <- pairedComparisons(objW, baseline="seq(9:1, 20, equi)")
compS$alpha$WilcoxonSignedRank.test

## , , Langs
## 
##                              MedScore DiffMedScores     p.value
## seq(9:1, 20, equi)         0.03140000            NA          NA
## xval(9:1, strat, block)    0.03327273  -0.001872727 0.892578125
## xval(9:1, strat, rand)     0.06184211  -0.030442105 0.013427734
## xval(9:1, no-strat, block) 0.03454545  -0.003145455 1.000000000
## seq(2:1, 10, semi-equi)    0.03533333  -0.003933333 0.006103516
## seq(9:1, 10, equi)         0.02980000   0.001600000 0.068115234