#' 004_generate_topic_model.r
#' forked from benmarwick/dayofarchaeology
#' Topic modelling with MALLET using clean fulltext
#' based on http://www.cs.princeton.edu/~mimno/R/
require(mallet)
documents <- data.frame(text = blogtext$text,
                        id = make.unique(blogtext$author),
                        class = blogtext$year,
                        stringsAsFactors = FALSE)
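## Quick sanity checks (a sketch, not in the original script): `blogtext` is
## assumed to have been built by an earlier script in this series.
summary(nchar(documents$text))             # spot empty or suspiciously short posts
stopifnot(!any(duplicated(documents$id)))  # make.unique() should guarantee this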
mallet.instances <- mallet.import(documents$id,
                                  documents$text,
                                  "C:/mallet-2.0.7/stoplists/en.txt",
                                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
## Create a topic trainer object.
n.topics <- 30
topic.model <- MalletLDA(n.topics)
## Load our documents. We could also pass in the filename of a
## saved instance list file that we build from the command-line tools.
topic.model$loadDocuments(mallet.instances)
## Get the vocabulary, and some statistics about word frequencies.
## These may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
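## Sketch (not in the original): list the most frequent terms as candidates
## for an expanded stoplist. This assumes the second column of the data frame
## returned by mallet.word.freqs() is the corpus-wide term count; check
## names(word.freqs) in your version of the package.
head(word.freqs[order(word.freqs[[2]], decreasing = TRUE), ], 20)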
## Optimize hyperparameters every 20 iterations,
## after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
## Now train a model. Note that hyperparameter optimization is on, by default.
## We can specify the number of iterations. Here we'll use a large-ish round number.
topic.model$train(200)
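## Note (added): 200 iterations keeps this demo quick; analyses intended for
## publication typically run MALLET for 1000+ iterations to get stabler topics.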
## NEW: run through a few iterations where we pick the best topic for each token,
## rather than sampling from the posterior distribution.
topic.model$maximize(10)
## Get the probability of topics in documents and the probability of words in topics.
## By default, these functions return raw word counts. Here we want probabilities,
## so we normalize, and add "smoothing" so that nothing has exactly 0 probability.
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
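## Optional check (added): with normalized = TRUE each row of doc.topics is a
## probability distribution over topics, so every row should sum to ~1.
stopifnot(all(abs(rowSums(doc.topics) - 1) < 1e-6))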
# from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
## transpose and normalize the doc topics
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
## Get a vector containing short names for the topics
topics.labels <- rep("", n.topics)
for (topic in 1:n.topics) {
  topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic, ],
                                                 num.top.words = 5)$words, collapse = " ")
}
# have a look at keywords for each topic
topics.labels
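## Alternative (an assumption, not in the original): some versions of the
## mallet package offer mallet.topic.labels() for the same purpose; if your
## version has it, the loop above reduces to one call:
# topics.labels <- mallet.topic.labels(topic.model, topic.words, num.top.words = 5)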
# create data.frame with columns as authors and rows as topics
topic_docs <- data.frame(topic.docs)
names(topic_docs) <- documents$id
# find top n topics for a certain author
df1 <- t(topic_docs[,grep("Sarah Bennett", names(topic_docs))])
colnames(df1) <- topics.labels
require(reshape2)
topic.proportions.df <- melt(cbind(data.frame(df1),
                                   document = factor(1:nrow(df1))),
                             variable.name = "topic",
                             id.vars = "document")
# plot for each doc by that author
require(ggplot2)
ggplot(topic.proportions.df, aes(topic, value, fill = document)) +
  geom_bar(stat = "identity") +
  ylab("proportion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() +
  facet_wrap(~ document, ncol = 5)
## cluster based on shared words
plot(hclust(dist(topic.words)), labels=topics.labels)
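## A companion view (added): the same clustering, but on shared documents
## instead of shared words, using the normalized topic.docs matrix from above.
plot(hclust(dist(topic.docs)), labels = topics.labels)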
## How do topics differ across different years?
topic_docs_t <- data.frame(t(topic_docs))
topic_docs_t$year <- documents$class
df3 <- aggregate(topic_docs_t, by = list(topic_docs_t$year), FUN = mean)
# transpose so topics are rows; drop the redundant year column (and, as in
# the original source, row 3 of the aggregate output)
df3 <- data.frame(t(df3[-3, -length(df3)]), stringsAsFactors = FALSE)
names(df3) <- c("y2012", "y2013")
df3 <- df3[-1, ]  # drop the 'Group.1' row left over from the transpose
# t() coerces mixed columns to character; convert the values back to numeric
df3 <- data.frame(lapply(df3, function(x) as.numeric(as.character(x))))
df3$topic <- 1:n.topics
# which topics differ the most?
df3$diff <- df3[,1] - df3[,2]
df3[with(df3, order(-abs(diff))), ]
# plot: drop the 'diff' column and melt with 'topic' (column 3) as the id
df3m <- melt(df3[, -4], id = 3)
ggplot(df3m, aes(fill = as.factor(topic), topic, value)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ variable)