-
Notifications
You must be signed in to change notification settings - Fork 1
/
data.config
68 lines (61 loc) · 1.68 KB
/
data.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# This file contains the lists of corpus texts. To download a complete corpus
# cpname, run 'make cpname'. To download only a single file fname of corpus
# cpname, run 'make cpname/fname'.
# Canterbury corpus: every entry expands to canterbury/<file>.
# Source: http://www.data-compression.info/Corpora/CanterburyCorpus/index.html
CANTERBURY = $(addprefix canterbury/, \
  alice29.txt \
  asyoulik.txt \
  cp.html \
  fields.c \
  grammar.lsp \
  kennedy.xls \
  lcet10.txt \
  plrabn12.txt \
  ptt5 \
  sum \
  xargs.1)
# Large Canterbury corpus: every entry expands to largecanterbury/<file>.
# Source: http://www.data-compression.info/Corpora/CanterburyCorpus/index.html
LARGECANTERBURY = $(addprefix largecanterbury/, \
  bible.txt \
  E.coli \
  world192.txt)
# Silesia corpus: every entry expands to silesia/<file>.
# Source: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
SILESIA = $(addprefix silesia/, \
  dickens \
  mozilla \
  mr \
  nci \
  ooffice \
  osdb \
  reymont \
  samba \
  sao \
  webster \
  xml \
  x-ray)
# Pizza&Chili text collection: every entry expands to pizzachili/<file>.
# Source: http://pizzachili.dcc.uchile.cl/texts.html
PIZZACHILI = $(addprefix pizzachili/, \
  sources \
  pitches \
  proteins \
  dna \
  english \
  dblp.xml)
# Pizza&Chili repetitive corpus (only REAL texts): every entry expands to
# repetitive/<file>.
# Source: http://pizzachili.dcc.uchile.cl/repcorpus.html
REPETITIVE = $(addprefix repetitive/, \
  Escherichia_Coli \
  cere \
  coreutils \
  einstein.de.txt \
  einstein.en.txt \
  influenza \
  kernel \
  para \
  world_leaders)
# Genome assemblies (only a selection): every entry expands to genomes/<name>.
# Source: http://hgdownload.soe.ucsc.edu/downloads.html
GENOMES = $(addprefix genomes/, \
  hg38 \
  mm10 \
  rn6)
# Union of all corpus lists, in the order in which the corpora are defined.
# Kept recursively expanded (=) so the individual lists are resolved on use.
ALLTESTDATA = \
  $(CANTERBURY) \
  $(LARGECANTERBURY) \
  $(SILESIA) \
  $(PIZZACHILI) \
  $(REPETITIVE) \
  $(GENOMES)