-
Notifications
You must be signed in to change notification settings - Fork 0
/
topicmodel.sh
executable file
·42 lines (37 loc) · 1.57 KB
/
topicmodel.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env bash
#
# This script runs MALLET on a corpus with different numbers of
# topics and also generates detailed diagnostic output for
# analysis of the topic quality
#
# It generates however many sub-directories it needs to store
# the different outputs (determined by $n_topics)
#
# Set the variables as needed for your project
#
# NOTE: this script requires bash (not plain POSIX sh) because
# $n_topics is an array.
#
# Mike Widner <[email protected]>
#
#####
### VARIABLES ###
# Path to the MALLET launcher executable.
mallet=/Applications/mallet/bin/mallet
# Topic counts to train; one model (and one output sub-directory) per entry.
n_topics=(10 20 40 80 100)
# Base name of the imported vectors file (<project_name>.vectors).
project_name=SomeName
# Directory holding the corpus that is handed to MALLET's import step.
inputdir=SomeDir
# Directory that receives the vectors file and the per-topic-count output.
outputdir=SomeOtherDir
# Only referenced by the commented-out stopword option in the import step.
extra_stopwords=EXTRA_STOPWORDS # wherever they live
### IMPORT ###
# Import the corpus in $inputdir into $outputdir/${project_name}.vectors.
#
# The command is kept in an array (one element per argument) instead of a
# flat string, so a path containing spaces or glob characters is still
# passed to MALLET as a single argument.

# The vectors file is written directly into $outputdir, so the directory
# has to exist before the import runs.
mkdir -p -- "$outputdir" || exit 1

mallet_import=(
  "$mallet" import-dir
  --input "$inputdir"
  --output "$outputdir/${project_name}.vectors"
  --remove-stopwords
  --keep-sequence
)
# Uncomment to pass a project-specific stopword list as well:
# if [ -n "$extra_stopwords" ]; then
#   mallet_import+=(--extra-stopwords "$extra_stopwords")
# fi

# Stop here if the import fails: continuing would train on a missing (or
# stale, left over from an earlier run) vectors file.
"${mallet_import[@]}" || {
  import_status=$?
  printf '%s\n' "MALLET import failed (exit ${import_status}); aborting" >&2
  exit "$import_status"
}
### TRAIN TOPICS ###
# Train one topic model per entry in n_topics. Everything produced for a
# given topic count is written to its own directory, $outputdir/<count>.
# All expansions are quoted so paths with spaces or glob characters survive
# intact as single arguments.
for topics in "${n_topics[@]}"; do
  topics_output="$outputdir/$topics"

  # mkdir -p is a no-op when the directory already exists, so no separate
  # existence check is needed.
  mkdir -p -- "$topics_output"

  "$mallet" run cc.mallet.topics.tui.TopicTrainer \
    --input "$outputdir/${project_name}.vectors" \
    --num-topics "$topics" \
    --optimize-interval 20 \
    --diagnostics-file "$topics_output/diagnostics.xml" \
    --output-topic-keys "$topics_output/topic-keys.txt" \
    --output-doc-topics "$topics_output/doc-topics.txt" \
    --xml-topic-phrase-report "$topics_output/topic-phrase-report.xml" \
    --xml-topic-report "$topics_output/topic-report.xml" \
    --topic-word-weights-file "$topics_output/topic-word-weights.txt" \
    --word-topic-counts-file "$topics_output/word-topic-counts.txt" \
    --output-state "$topics_output/state.gz"
done