-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
128 lines (107 loc) · 4 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
include data.config
include ../Make.helper
# Every non-hidden directory directly inside the current directory is treated
# as a corpus. `:=` makes the `find` run once, while the Makefile is parsed.
CORPORA := $(shell find * -maxdepth 0 -type d)

# `all` names a command, not a file: declare it phony so that a stray file
# called "all" can never make this target look up to date.
.PHONY: all

# Bring every corpus directory target up to date.
all: $(CORPORA)
# Canterbury corpus
# `canterbury` has no recipe of its own; it only requires the files listed in
# $(CANTERBURY) (not defined in this file; presumably set by data.config).
canterbury: $(CANTERBURY)

# A missing corpus file triggers a download of the whole archive, which is
# unpacked into canterbury/ and then deleted.
#   --fail      turn an HTTP error into a failed recipe instead of saving the
#               server's error page as cc.zip
#   --location  follow redirects instead of saving the redirect response
# NOTE(review): this header defines one independent rule per file in
# $(CANTERBURY), and they all share canterbury/cc.zip as scratch space, so
# parallel builds (make -j) could collide here - confirm before relying on -j.
$(CANTERBURY):
	curl --fail --location http://www.data-compression.info/files/corpora/canterburycorpus.zip -o canterbury/cc.zip
	unzip -u canterbury/cc.zip -d canterbury
	rm canterbury/cc.zip
# Large Canterbury corpus
# `largecanterbury` has no recipe of its own; it only requires the files listed
# in $(LARGECANTERBURY) (not defined in this file; presumably set by data.config).
largecanterbury: $(LARGECANTERBURY)

# A missing corpus file triggers a download of the whole archive, which is
# unpacked into largecanterbury/ and then deleted.
#   --fail      turn an HTTP error into a failed recipe instead of saving the
#               server's error page as lcc.zip
#   --location  follow redirects instead of saving the redirect response
# NOTE(review): this header defines one independent rule per file in
# $(LARGECANTERBURY), and they all share largecanterbury/lcc.zip as scratch
# space, so parallel builds (make -j) could collide here - confirm before
# relying on -j.
$(LARGECANTERBURY):
	curl --fail --location http://www.data-compression.info/files/corpora/largecanterburycorpus.zip -o largecanterbury/lcc.zip
	unzip -u largecanterbury/lcc.zip -d largecanterbury
	rm largecanterbury/lcc.zip
# Silesia corpus
# `silesia` has no recipe of its own; it only requires the files listed in
# $(SILESIA) (not defined in this file; presumably set by data.config).
silesia: $(SILESIA)

# Each file is fetched individually: the remote name is the target's file name
# ($(@F)) plus ".bz2". The archive is stored next to the target as $@.bz2 and
# decompressed in place; bunzip2 drops the .bz2 suffix, which leaves $@.
#   --fail      turn an HTTP error into a failed recipe instead of saving the
#               server's error page as the archive
#   --location  follow redirects instead of saving the redirect response
$(SILESIA):
	curl --fail --location http://sun.aei.polsl.pl/~sdeor/corpus/$(@F).bz2 -o $@.bz2
	bunzip2 -f $@.bz2
# Pizza&Chili corpus
# `pizzachili` has no recipe of its own; it only requires the files listed in
# $(PIZZACHILI) (not defined in this file; presumably set by data.config).
pizzachili: $(PIZZACHILI)

# The download URL contains a category directory that depends on the file.
# The $(eval ...) below sets `fdir` to that directory while the recipe is being
# expanded: the target's file name with its last suffix removed
# ($(basename $(@F))) is looked up as a substring of each known text name, and
# the first hit selects the directory. If nothing matches, make aborts with an
# error. The $(eval ...) itself expands to nothing, so no shell command is run
# for it.
# The archive is then stored next to the target as $@.gz and decompressed in
# place; gunzip drops the .gz suffix, which leaves $@.
#   --fail      turn an HTTP error into a failed recipe instead of saving the
#               server's error page as the archive
#   --location  follow redirects instead of saving the redirect response
$(PIZZACHILI):
	$(eval fdir := \
	$(if $(findstring $(basename $(@F)),sources),code,\
	$(if $(findstring $(basename $(@F)),pitches),music,\
	$(if $(findstring $(basename $(@F)),proteins),protein,\
	$(if $(findstring $(basename $(@F)),dna),dna,\
	$(if $(findstring $(basename $(@F)),english),nlang,\
	$(if $(findstring $(basename $(@F)),dblp.xml),xml,\
	$(error unknown pizza chili category of $(@F) ))))))))
	curl --fail --location http://pizzachili.dcc.uchile.cl/texts/$(fdir)/$(@F).gz -o $@.gz
	gunzip -f $@.gz
# Repetitive corpus
# `repetitive` has no recipe of its own; it only requires the files listed in
# $(REPETITIVE) (not defined in this file; presumably set by data.config).
repetitive: $(REPETITIVE)

# Each file is fetched individually: the remote name is the target's file name
# ($(@F)) plus ".gz". The archive is stored next to the target as $@.gz and
# decompressed in place; gunzip drops the .gz suffix, which leaves $@.
#   --fail      turn an HTTP error into a failed recipe instead of saving the
#               server's error page as the archive
#   --location  follow redirects instead of saving the redirect response
$(REPETITIVE):
	curl --fail --location http://pizzachili.dcc.uchile.cl/repcorpus/real/$(@F).gz -o $@.gz
	gunzip -f $@.gz
# Genomes
# `genomes` has no recipe of its own; it only requires the files listed in
# $(GENOMES) (not defined in this file; presumably set by data.config).
genomes: $(GENOMES)

# Steps for one genome, where $(@F) is the target's file name:
#   1. download <name>.2bit and store it next to the target as $@.2bit
#   2. make sure ./twoBitToFa is executable (the binary is expected in the
#      directory make runs in; presumably the UCSC converter - TODO confirm)
#   3. convert $@.2bit to $@.fa
#   4. write $@ as $@.fa with every character other than A, C, G and T deleted
#      (this also deletes the newlines)
# The intermediate $@.2bit and $@.fa are not removed by this recipe.
$(GENOMES):
	curl ftp://hgdownload.cse.ucsc.edu/goldenPath/$(@F)/bigZips/$(@F).2bit -o $@.2bit
	chmod a+x twoBitToFa
	./twoBitToFa -noMask $@.2bit $@.fa
	tr -d -c 'ACGT' < $@.fa > $@
# Statistics
# `statistics` names a command, not a file: declare it phony so that a stray
# file called "statistics" can never make this target look up to date.
.PHONY: statistics
statistics: filestat.pdf

# Compile the statistics helper from its single source file.
# The compiler and flag variables are not defined in this file (presumably
# they come from ../Make.helper).
filestat.x: filestat.cpp
	$(MY_CXX) -Wall -Wextra $(MY_CXX_FLAGS) $(MY_CXX_OPT_FLAGS) $(C_OPTIONS) $< -o $@

# $(call filestat_rows,FILES,CATEGORY,SOURCE_URL)
# Private helper: expands to one shell loop that appends a row to the current
# target ($@) for every file in FILES. A row consists of
#   - the file's base name with every '_' replaced by '-', newline removed
#   - a single space (printf is used because `echo -n` is not portable
#     across /bin/sh implementations)
#   - whatever ./filestat.x prints for the file
#   - " CATEGORY SOURCE_URL" followed by a newline
define filestat_rows
for file in $(1) ; do \
  basename $$file | tr '_' '-' | tr -d '\n' >> $@ ; \
  printf ' ' >> $@ ; \
  ./filestat.x $$file >> $@ ; \
  echo " $(2) $(3)" >> $@ ; \
done
endef

# Table of per-file statistics: a header line followed by one row per corpus
# file, grouped by corpus. The corpus directories are prerequisites, so their
# rules are considered before the statistics are computed.
filestat.dat: filestat.x $(CORPORA)
	@echo "file sigma newlines size category source" > $@
	@echo "Computing stats of Canterbury corpus..."
	@$(call filestat_rows,$(CANTERBURY),canterbury,http://www.data-compression.info/Corpora/CanterburyCorpus/index.html)
	@echo "Computing stats of large Canterbury corpus..."
	@$(call filestat_rows,$(LARGECANTERBURY),largecanterbury,http://www.data-compression.info/Corpora/CanterburyCorpus/index.html)
	@echo "Computing stats of Silesia corpus..."
	@$(call filestat_rows,$(SILESIA),silesia,http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia)
	@echo "Computing stats of Pizza and Chili corpus..."
	@$(call filestat_rows,$(PIZZACHILI),pizzachili,http://pizzachili.dcc.uchile.cl/texts.html)
	@echo "Computing stats of Repetitive corpus..."
	@$(call filestat_rows,$(REPETITIVE),repetitive,http://pizzachili.dcc.uchile.cl/repcorpus.html)
	@echo "Computing stats of Genomes..."
	@$(call filestat_rows,$(GENOMES),genomes,http://hgdownload.soe.ucsc.edu/downloads.html)

# Typeset the report, then delete pdflatex's by-products.
filestat.pdf: filestat.tex filestat.dat
	pdflatex $<
	rm -f filestat.log filestat.out filestat.aux
# Cleaners
# `make clean-<dir>` deletes every non-hidden file directly inside <dir>;
# $* is the part of the target name matched by %.
clean-%:
	rm -f $*/*

# `cleanall` names a command, not a file: declare it phony so that a stray file
# called "cleanall" can never make this target look up to date.
# The clean-<dir> prerequisites are deliberately NOT declared phony: make skips
# implicit-rule search for phony targets, so the clean-% rule above would no
# longer apply to them.
.PHONY: cleanall

# Empty every corpus directory, then delete the statistics binary and all
# .dat files in the current directory.
cleanall: $(addprefix clean-,$(CORPORA))
	rm -f filestat.x
	rm -f *.dat