-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathlv-ner-train.prop
57 lines (47 loc) · 1.9 KB
/
lv-ner-train.prop
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#training data file(s) to read
trainFileList = ner_train-jan.tab
#location where you would like to save (serialize to) your
#classifier; adding .gz at the end automatically gzips the file,
#making it faster and smaller
serializeTo = lv-ner-model.ser.gz
#structure of your training file; each name=N pair tells the
#classifier which column (counted from 0) holds that field:
#idx is in column 0, the word in column 1, the lemma in column 2,
#morphologyFeatureString in column 5, the correct answer in
#column 7 and nerFeatureString in column 8
#NOTE(review): columns 3, 4 and 6 are not mapped; presumably they
#are ignored by the reader -- confirm
map = word=1,lemma=2,answer=7,morphologyFeatureString=5,idx=0,nerFeatureString=8
#these are the features we'd like to train with
#some are discussed below, the rest can be
#understood by looking at NERFeatureFactory
useClassFeature=true
#useWord is left commented out, so it is not set explicitly here
#and whatever default the classifier has for it applies
#useWord=true
useNGrams=true
#no ngrams will be included that do not contain either the
#beginning or end of the word
noMidNGrams=true
useDisjunctive=true
#presumably the maximum n-gram length used by useNGrams;
#verify against NERFeatureFactory
maxNGramLeng=6
#context and label-sequence features
usePrev=true
useNext=true
useSequences=true
usePrevSequences=true
maxLeft=1
#the next 4 deal with word shape features
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
#alternative word shape setting, kept for reference but disabled
#wordShape=chris2useLC
#NOTE(review): the original note here read "no maarcha" (Latvian,
#roughly "from Maarcha"); presumably it records where the
#dan2useLC choice came from -- confirm
wordShape=dan2useLC
saveFeatureIndexToDisk = true
#PP - test
#NOTE(review): "PP" looks like an author tag marking the settings
#from here on as experimental additions -- confirm
useTags=true
useLemmas=true
#comma-separated list of gazetteer files; the paths start with ./
#so they are relative (presumably to the directory the trainer is
#run from -- confirm)
gazette = ./Gazetteer/LV_LOC_GAZETTEER.txt,./Gazetteer/LV_PERS_GAZETTEER.txt,./Gazetteer/PP_Onomastica_surnames.txt,./Gazetteer/PP_Onomastica_geonames_lem.txt,./Gazetteer/PP_valstis_lem.txt,./Gazetteer/PP_orgnames.txt,./Gazetteer/PP_org_elements.txt,./Gazetteer/AZ_profesijas.txt,./Gazetteer/AZ_profesijas_full_lem.txt,./Gazetteer/AZ_roles.txt,./Gazetteer/AZ_ORG_common.txt,./Gazetteer/LV_ORG_INIT_GAZETTEER.txt,./Gazetteer/DB_organizations.txt,./Gazetteer/DB_locations.txt,./Gazetteer/DB_persons.txt,./Gazetteer/DB_professions.txt,./Gazetteer/AZ_valsts_parvaldes_struktura_lem.txt
#both gazette matching modes are switched on
sloppyGazette=true
cleanGazette=true
printFeatures=train
#features based on the morphologyFeatureString column and on
#sentence position / occurrence patterns; see NERFeatureFactory
#NOTE(review): useMorphologyFeatures and useLemmas look like
#project-specific flags -- confirm they are supported by the
#classifier build in use
useMorphologyFeatures=true
useBeginSent=true
useOccurrencePatterns=true
#distributional similarity features, read from the lexicon file
#named in distSimLexicon below
useDistSim = true
casedDistSim = true
#numberEquivalenceDistSim is left commented out, so it is not set
#explicitly here
#numberEquivalenceDistSim = true
distSimLexicon = ./distsim/words.83M.200