#lv-ner-train.prop

#training data file(s) the classifier is trained on (a single
#file here)
trainFileList = ner_train-jan.tab
#location where you would like to save (serialize to) your
#classifier; adding .gz at the end automatically gzips the file,
#making it faster and smaller
serializeTo = lv-ner-model.ser.gz

#structure of your training file; this tells the classifier
#which column holds each field: idx is in column 0, the word in
#column 1, the lemma in column 2, morphologyFeatureString in
#column 5, the correct answer in column 7 and nerFeatureString
#in column 8
map = word=1,lemma=2,answer=7,morphologyFeatureString=5,idx=0,nerFeatureString=8

#these are the features we'd like to train with
#some are discussed below, the rest can be
#understood by looking at NERFeatureFactory
useClassFeature=true
#useWord is left disabled (commented out) in this configuration
#useWord=true
useNGrams=true

#no ngrams will be included that do not contain either the
#beginning or end of the word
noMidNGrams=true
useDisjunctive=true
#NOTE(review): presumably the maximum character n-gram length
#used by useNGrams - confirm against NERFeatureFactory
maxNGramLeng=6
usePrev=true
useNext=true
useSequences=true
usePrevSequences=true
#NOTE(review): presumably how much left (previous-label) context
#the sequence features may use - confirm against NERFeatureFactory
maxLeft=1

#the next 4 deal with word shape features
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
#alternative word shape, left disabled (commented out)
#wordShape=chris2useLC
#NOTE(review): original note read "no maarcha" (Latvian), presumably
#"from Marcis", i.e. who suggested the wordShape value below - confirm
wordShape=dan2useLC
#not a word shape setting
#NOTE(review): presumably stores the feature index on disk during
#training to reduce memory use - confirm
saveFeatureIndexToDisk = true

#PP - test
#NOTE(review): useTags / useLemmas presumably enable features built
#from the tag and lemma information of each token - confirm
useTags=true
useLemmas=true
#comma-separated list of gazetteer files, given as paths relative
#to the working directory
gazette = ./Gazetteer/LV_LOC_GAZETTEER.txt,./Gazetteer/LV_PERS_GAZETTEER.txt,./Gazetteer/PP_Onomastica_surnames.txt,./Gazetteer/PP_Onomastica_geonames_lem.txt,./Gazetteer/PP_valstis_lem.txt,./Gazetteer/PP_orgnames.txt,./Gazetteer/PP_org_elements.txt,./Gazetteer/AZ_profesijas.txt,./Gazetteer/AZ_profesijas_full_lem.txt,./Gazetteer/AZ_roles.txt,./Gazetteer/AZ_ORG_common.txt,./Gazetteer/LV_ORG_INIT_GAZETTEER.txt,./Gazetteer/DB_organizations.txt,./Gazetteer/DB_locations.txt,./Gazetteer/DB_persons.txt,./Gazetteer/DB_professions.txt,./Gazetteer/AZ_valsts_parvaldes_struktura_lem.txt

#both gazette matching modes are switched on
#NOTE(review): presumably sloppyGazette fires when any token of a
#gazette entry matches and cleanGazette only when the whole entry
#matches - confirm against NERFeatureFactory
sloppyGazette=true
cleanGazette=true

#NOTE(review): presumably dumps the extracted features to a file
#whose name is derived from this value - confirm
printFeatures=train

#NOTE(review): presumably enables features built from the
#morphologyFeatureString column declared in map - confirm
useMorphologyFeatures=true

useBeginSent=true
useOccurrencePatterns=true

#distributional similarity features; distSimLexicon is the path
#of the lexicon file they are read from
#NOTE(review): casedDistSim presumably keeps lookups case-sensitive - confirm
useDistSim = true
casedDistSim = true
#numberEquivalenceDistSim is left disabled (commented out)
#numberEquivalenceDistSim = true
distSimLexicon = ./distsim/words.83M.200