User:Ben Moore/figures

Figures not listed on this page are probably in this github repo, which should contain all scripts, data and figures I've made.

This page shows the methods and data I've used to create figures uploaded to wikimedia commons for use in articles. The idea is to make it as easy as possible for anyone else to replace the figures I've made with better ones! The code provided is in the R programming language, and I recommend the great open source IDE RStudio for playing about with it. Let me know on my talk page if you have any questions or criticisms.

United States employment statistics[edit]

Data[edit]

Two data sets offered by the US gov (.xls excel spreadsheets):

NB. These seem to be kept reasonably up to date, so these plots can be updated as new data becomes available.

Code[edit]

R code

require("zoo")
# first save xls as csv, then...
# unemployment
u <- read.csv("Unemployment.csv", header=T)
u <- melt(u, "Year")

unemployment <- data.frame(date=as.yearmon(do.call(paste, u[,1:2]), "%Y %b"),
                           rate=u$value)
unemployment <- unemployment[unemployment$date > as.yearmon("2008-12"),]
u2 <- unemployment
colnames(u2) <- c("date", "rate")

# net change in jobs
n <- read.csv("netChange.csv", header=T)
n <- melt(n, "Year")

# Check (!) dates are the same in each input, else repeat parse 
all.equal(net.change$date, unemployment$date) # TRUE
net.change <- data.frame(date=unemployment$date,
                         change=n$value)
net.change <- net.change[net.change$date > as.yearmon("2008-12"),]

dev.off()
svg("new.svg", 7.5, 5)
par(mar=c(5.5,4.5,5,3.8), mgp=c(1.8,.7,0))
# sort by date
net.change <- net.change[order(net.change$date),]
# original starts at 09
net.change <- net.change[net.change$date > as.yearmon("2008-12"),]

#par(yaxs="i")
bpos <- barplot(net.change$change, plot=F)
x <- as.Date(net.change$date)
# scale will be integer (== month)
plot(1:length(x), net.change$change, type="n", ylim=c(-1000, 600),
     frame=F, axes=F, xlab="", ylab="", xlim=c(2.7, length(x)-1.7))
rect(xleft=1:length(x)-.3, xright=1:length(x)+.3, 
     ybottom=0, ytop=net.change$change, col="#4F81BD", lend=2)
abline(h=0, lwd=1)

## Awful code, avert your eyes
labs <- format(as.Date(as.character(cut(as.Date(seq(x[1], x[length(x)], length.out=10), 
              format= "%M %Y"), breaks="months"))), "%b %Y")

axis(1, at=seq(1, length(x), length.out=length(labs)), 
     labels=labs, las=3)
axis(2, at=seq(-1000, 600, by=200), las=1)
mtext("Net change in employment per month (1000s of jobs)", 
      side=2, col="#1F497D", line=3)

## now unemployment
unemployment <- unemployment[order(unemployment$date),]
unemployment <- unemployment[unemployment$date > as.yearmon("2008-12"),]
par(new=T)
# set up same plot
plot(1:length(x), unemployment$rate, type="n", ylim=c(6.5, 10),
     frame=F, axes=F, xlab="", ylab="", xlim=c(2.7, length(x)-1.7))
lines(1:length(x), unemployment$rate, col="#C0504D", lwd=3.5)
axis(4, las=1)

text(length(x)*1.1, (par("usr")[4] + par("usr")[3])/2, "Percent unemployed", 
     srt = 270, xpd = TRUE, col="#C0504D")
text(length(x)*.85, ((par("usr")[4] + par("usr")[3])/4) *1.65, "Unemployment rate", 
     xpd = TRUE, col="#C0504D")
text(length(x)*.215, ((par("usr")[4] + par("usr")[3])/4) *1.75, "Jobs created or lost", 
     xpd = TRUE, col="#4F81BD")

mtext("United States Employment Statistics\n Jan 2009 - Dec 2013", side=3, 
      col="#1F497D", cex=1.4, line=2, font=2)
mtext("Monthly change, seasonally adjusted", side=3, cex=1.1, line=1)
dev.off()

Figure[edit]

Original (Raster)	New (Vector)

Cost of genome sequencing[edit]

Data[edit]

Table from NHGRI, should be updated "frequently".

Code[edit]

R code

library(zoo)
library(scales)

# from http://www.genome.gov/pages/der/sequencing_cost_apr13.xls
seq <- read.table("<filename.csv>", stringsAsFactors=F, header=T)
seq$CostPerMb <- NULL

seq <- melt(seq, 1)
seq$Date <- as.yearmon(seq$Date, format="%b-%y")

options(scipen=99)
svg("figs/genome_cost.svg", 5, 4.5)
ggplot(seq, aes(x=as.Date(Date), y=value)) +
  labs(y="Cost per genome (USD)", x="", col="") +
  geom_hline(yintercept=1000, linetype="dashed", 
             col=I(muted("red"))) + 
  geom_line(size=1.5, col=I(muted("blue"))) + 
  scale_y_log10(limits=c(900,100000000),
                breaks=c(1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
                labels=comma) +
  ggtitle("Genome sequencing cost as estimated by NHGRI\n
          (September 2001 to April 2013)") +
  theme(plot.title=element_text(size=12))
dev.off()

Figure[edit]

Comments[edit]

The source code is no longer accurate, as I updated and redid the data using gnuplot. (It seemed easier for simple cases. Now that I'm learning R, that might not be the case further on, but regardless, the code on the image page doesn't match what's above. grendel|khan 05:51, 27 July 2016 (UTC)

History (and predicted future) size of the Sequence Read Archive[edit]

Data[edit]

A .csv containing the growth information up to (an extrapolated) month in late 2013 can be downloaded here from the NCBI website. The sequencing platform data is hardcoded below but was retreived from this publication.

Code[edit]

R code

# R (2.15.1)
library(plotrix)
library(sfsmisc)
library(TeachingDemos)

# Pie chart data
slices <- c(84, 12, 2, 2) 
lbls <- c("Illumina GA", "SOLiD", "454", "Other")

# Suppress scientific notation
options(scipen=5)

###### Draw plot ###### 

svg("~/SRA_growth.svg", 6,5)
par(mar=c(2,4,2,2))
sra_stat <- read.csv("~/<DATA_LOCATION>/sra_stat.csv")
sra_stat[,1] <- as.Date(sra_stat[,1], format="%m/%d/%Y")
sra_stat <- sra_stat[2:nrow(sra_stat),]

# Line plot
plot(sra_stat[,1], sra_stat[,2], xlab="", ylab="Number of Bases", 
     type="l", log="y", col="darkgreen", lwd=2, yaxt="n")
points(sra_stat[,1], sra_stat[,3], type="l", col="lightgreen", lwd=2)
axis(2,at=axTicks(2),label=axTexpr(2))
legend("topleft", fill=c("darkgreen", "lightgreen"), 
       legend=c("All Bases", "Open Access"))

# Add pie chart inset

par(cex=0.8)

subplot(pie(slices, labels=lbls, main="", col = c(topo.colors(4))), 

        as.Date("2011/07/14"), 5E11, size=c(2.5,2.5))
dev.off()

Figure[edit]

Comments[edit]

The dark green line could be raised above the light green
Pie charts are 'bad'
- Yellow = other?
- This pie-chart isn't particularly bad. There are only 4 categories. I don't think a barchart would be much better for illustrating the point.
  Thanks for the comments Ppgardne, I know you're a bit of a graph pro! Yeah yellow is other but I couldn't find any combination of parameters that would include this label, not run over the others and still remain a readable size… There's probably some adj or offset I've overlooked—haven't played with pie charts much before Jebus989 ^✰ 22:28, 13 January 2013 (UTC)
  a simple "text(x,y,'Other')" might do the trick. No tick mark though. Or refine the plot in inkscape -- that should keep all your SVG loveliness. --Paul (talk) 00:11, 14 January 2013 (UTC)
  BTW, that "pars=(cex=0.6)" is not doing what you think it is. Remove it and do a "par(cex=0.6)" before "subplot" instead. --Paul (talk) 03:56, 16 January 2013 (UTC)
  You're right cheers for picking that up. When the labels are actually scaled down they are too tiny, especially for a thumbnail. I reckon using text is a good option, maybe with a custom tickmark via segments if I find myself with time to waste—I wanna try keep it reproducible if I can so no Inkscape for now Jebus989 ^✰ 22:34, 16 January 2013 (UTC)
  Thanks for sharing this. I've already used it for a few projects. What if you scale "cex" down and increase "size"? You have heaps of space and 1.5 is a little small. --Paul (talk) 04:33, 17 January 2013 (UTC)
  Done Good idea! Now all segments labelled, cheers. And glad it came in use to someone! Jebus989 ^✰ 10:53, 17 January 2013 (UTC)
Redraw with ggplot2?

EMBL-Bank Growth[edit]

Data[edit]

The size of EMBL-Bank in nucleotides is reported in each set of Release Notes. This graph was drawn from Release Notes for version 114 (currently available here as of Jan 2013). For use in the code below, the relevant table was copied from the report and pasted to a file (EMBLBankGrowth below).

Code[edit]

R code

# R (2.15.1)
library(zoo)
library(sfsmisc)
svg("~/other/EMBLBank_growth3.svg", 6, 6)
options(scipen=3)
par(mar=c(2.5,4.1,2.5,2.1), xaxs="r", lend=2)

# Load and prepare data
ebg <- as.data.frame(read.table("~/<DATA-LOCATION>/EMBLBankGrowth", 
    header=T, quote="\"", colClasses=c("numeric", "character","numeric", "numeric")))
ebg[,2] <- as.yearmon(gsub("/", "-", ebg[,2]), "%m-%Y")
ebg <- as.data.frame(read.table("~/Desktop/EMBLBankGrowth", header=T, quote="\"",

# Set up plot 
plot(ebg[,2], ebg[,4], type="l", log="y", col="blue", lwd=2, xlab="", 
     ylab="Number of Entries or Nucleotides", yaxt="n", xaxt="n", 
     ylim=c(min(ebg[,3]), 1E12), xlim=c(1982,2013), main="Growth of EMBL-Bank (1982-2012)")
axis(2,at=axTicks(2),label=axTexpr(2))
axis(1, at=seq(1980,2015,5))
grid(lty=1)

# Add line for entries
points(ebg[,2], ebg[,3], type="l", log="y", col="red", lwd=2)
legend("topleft", fill=c("blue","red"), legend=c("Nucleotides", "Entries"), cex=0.8)
dev.off()

Figure[edit]

Comments[edit]

The two lines probably don't need to be on different axes
Done
Minor tickmarks highlighting the log scale might look nice
Points of interest could be labelled; i.e. Human Genome Project; milestones
Not done coule be a WP:SYNTH issue

Wikipedia Editor numbers analysis[edit]

Data[edit]

Data retrieved from stats.wikimedia.org. Copy and paste table into a plaintext file to keep relative column numbering. Data used in the figure runs from January 2001 up to December 2012.

Code[edit]

R code

library(zoo)
wiki <- read.delim("~/<DATA>", header=F)
par(mar=c(2,4.5,1,2))

svg("~/Desktop/wiki_stats.svg",7,7 )
wiki[,1] <- as.yearmon(wiki[,1],"%b %Y")

# Active (10 edits / month)
plot(wiki[,1], wiki[,4], type="l", xaxt="n", ylab="Number of editors", lwd=4,
     col="darkblue", xlab="")

# Really active (100 edits / month)
points(wiki[,1], wiki[,5], type="l", lwd=4, col="darkgreen")
axis(1, at=seq(2001,2012,1), cex.axis=0.8)
legend("topleft", c("10 edits / month", "100 edits / month"), 
       col=c("darkblue","darkgreen"), bty="n", lwd=4)

# Regression from 2007 onwards:
library(car)
recent  <- wiki[wiki[,1] > 2007,]
reg <- lm(recent[,4] ~ recent[,1])
regLine(reg, lwd=2)
text(2011,45000,paste("~", signif(abs(reg$coefficients[[2]]),2), 
     " fewer per year", sep=""), col="darkblue")
text(2011,43000,paste(signif(abs(reg$coefficients[[2]]) / max(recent[,4]) * 100,3), 
     "% of max.", sep=""), col="darkblue")

reg2 <- lm(recent[,5] ~ recent[,1])
regLine(reg2, lwd=2, col="black")
text(2011,10000,paste("~", signif(abs(reg2$coefficients[[2]]),2), 
     " fewer per year", sep=""), col="darkgreen")
text(2011,8000,paste(signif(abs(reg2$coefficients[[2]]) / max(recent[,5]) * 100,3), 
     "% of max.", sep=""), col="darkgreen")
dev.off()

Figure[edit]

Comments[edit]

RfA voting analysis (old data)[edit]

Data[edit]

Data found in this table from 2011, tab-separated version used in this script available at github.

Code[edit]

R code

# relative path if sourced with master.R
a <- read.table("../data/wvotes.tsv", fill=T, stringsAsFactors=F, header=T)

# format messy string into date
a$Firstedit <- as.Date(a$Firstedit, "%B_%d,_%Y")

# Do support/oppose levels differ between admins and non-admins?
ggplot(a, aes(x=Opposes, y=Supports, col=Admin, group=Admin)) + 
 geom_jitter(width=4, height=4) + geom_smooth(method="lm", se=F) + theme_bw() 

# Filter users who haven't voted much
b <- a[a$Supports + a$Opposes + a$Neutrals > 10,]
library(gridExtra)
svg("../figs/RfAvote_analysis.svg",15, 5)
grid.arrange(
 # Do those with higher edit counts support more?
 ggplot(b, aes(x=Editcount+1, y=100*(Supports/(Supports+Opposes+Neutrals)), col=Admin)) + 
 geom_jitter() + scale_x_log10() + geom_smooth(method="lm", se=T, size=1.5) + 
 theme_bw() + xlab("Number of edits") + ylab("Support percentage (Users with >10 votes)"),
 # Do old-timers support more than newer editors?
 ggplot(b, aes(x=Firstedit+1, y=100*(Supports/(Supports+Opposes+Neutrals)), col=Admin)) + 
 geom_jitter() + geom_smooth(method="lm", se=T, size=1.5) + 
 theme_bw() + xlab("Date of first edit") + ylab("Support percentage (Users with >10 votes)"),
 # Do admins on the whole support more RfAs than editors?
 ggplot(b, aes(x=Admin, y=100*(Supports/(Supports+Opposes+Neutrals)), fill=Admin)) + 
 geom_violin(alpha=I(.5), notch=T) + xlab("Admin") + theme_bw() +
 ylab("Support percentage (Users with >10 votes)")
 , ncol=3)
dev.off()

##Let's do some stats, do admins support more or less than non-admins?
admin <- b[b$Admin == "Yes",]
editor <- b[b$Admin == "No",]

adminvotes <- admin$Supports / admin$RfAVotes
editorvotes <- editor$Supports / editor$RfAVotes
wilcox.test(adminvotes, editorvotes)
# W = 14184.5, p-value = 0.002663

wilcox.test(admin$Supports, editor$Supports)
# W = 12256, p-value = 0.5884

wilcox.test(admin$RfAvotes, editor$RfAvotes)
# W = 11318, p-value = 0.5117

wilcox.test(admin$Neutrals + admin$Opposes, editor$Neutrals + editor$Opposes)
# W = 9463, p-value = 0.002441

## Commentary:
"""
Admins !voters have a significantly higher support % than non-admins (p < 0.01), 
though they haven't actually !voted support more times than the average editor 
(mean supports, admin: 22.99, editor: 22.76) so possibly just !vote a little 
less often and give fewer oppose + neutral !votes 
(mean # oppose + neutrals, admin: 7.89, editor: 10.09,  p < 0.01).
"""

Figure[edit]

Comments[edit]

I did wonder why this kind of analysis hadn't be done with the data, but maybe it was and it was seen to be not very interesting!