forked from hadley/r-on-github
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2-languages.r
43 lines (32 loc) · 1.37 KB
/
2-languages.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
library(ggplot2)
library(plyr)
library(reshape2)
# Read every file found in cache-repo/ with readRDS(), giving one list element
# per cached repository.
repos <- llply(
  dir(path = "cache-repo", full.names = TRUE),
  readRDS
)

# Key each repository by the full name stored in its `info` record
# (presumably GitHub's "owner/repo" form -- confirm against the cache writer).
repos <- setNames(
  repos,
  vapply(repos, function(repo) repo$info$full_name, FUN.VALUE = character(1))
)

# Pull out the per-language breakdown of each repository; the names set above
# carry over to this list.
lang <- lapply(repos, function(repo) repo$lang)
#' Collapse one repository's language breakdown into a single-row data frame
#'
#' @param x A (possibly nested) list or vector of numeric sizes, one per
#'   language, named by language. The surrounding script's comments treat
#'   these sizes as amounts of code ("5k", "a megabyte").
#' @return A one-row data frame with one column per language holding that
#'   language's proportion of the total, plus a `total` column with the sum of
#'   all sizes. Language names are kept verbatim (e.g. "C++").
lang_collapse <- function(x) {
  # Flatten to a named numeric vector, one entry per language.
  x <- unlist(x)
  total_size <- sum(x)

  # t() turns the vector of proportions into a 1 x k matrix so that every
  # language becomes its own column; check.names = FALSE stops data.frame()
  # from mangling names that are not syntactic R identifiers.
  data.frame(t(prop.table(x)), total = total_size, check.names = FALSE)
}
# One row per repository: the share of each language plus the total size.
# ldply() records the list names of `lang` (the repo names) in the `.id` column,
# and languages a repo does not use end up as NA in that row.
langs <- ldply(lang, lang_collapse)

# Explore total repo size ------------------------------------------------------

# The x axis is log10-transformed, so binwidth is measured in log10 units.
qplot(total, data = langs, binwidth = 0.05) + scale_x_log10()

# Only look at repos with more than 5k of code
mean(langs$total > 5e3)
# Peek at up to ten of the small repos. head() is used instead of `[1:10]`
# because the latter pads the result with NA when fewer than ten repos fall
# below the threshold.
head(langs$.id[langs$total < 5e3], 10)
langs <- subset(langs, total > 5e3)

# Same histogram again, now restricted to the repos that passed the size filter.
qplot(total, data = langs, binwidth = 0.05) + scale_x_log10()
# Big repos: over a megabyte of R code
# `R` is the proportion of the repo that is R, so total * R is the amount of R
# code. Columns are referenced directly inside subset(), matching the size
# filter applied to `langs` earlier in the script.
big <- subset(langs, total * R > 1e6)
# Drop the language columns that are NA for every one of the big repos.
big <- big[vapply(big, function(col) !all(is.na(col)), logical(1))]
big$.id
# Look at distribution of other languages --------------------------------------

# Distribution of the proportion of each repo that is R.
qplot(R, data = langs, binwidth = 0.01)
mean(langs$R == 1)
# ~70% are only R

# Long format: one row per (repo, non-R language) pair that has a non-missing
# share. `id.vars` is written out in full instead of relying on partial
# matching of `id`.
langm <- melt(langs, id.vars = c(".id", "total", "R"), na.rm = TRUE)
# Number of repos in which each of the other languages appears.
other_lang <- count(langm, "variable")
# Keep only the languages found in at least 20 repos. Naming the join column
# makes the match explicit rather than depending on whichever columns the two
# data frames happen to share.
pop_lang <- match_df(langm, subset(other_lang, freq >= 20), on = "variable")

# Per-language histograms of the language's share: first as counts, then as
# densities so that panels with different numbers of repos are comparable.
qplot(value, data = pop_lang, binwidth = 0.05) + facet_wrap(~ variable)
qplot(
  value, ..density..,
  data = pop_lang, binwidth = 0.05, geom = "histogram"
) +
  facet_wrap(~ variable)