From bd35e90ae509b45893684be022f5e5ddbf11b0a2 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sat, 12 Jan 2019 14:43:28 +0100 Subject: [PATCH 01/18] #858 - Out-of-tagset tags should map to the generic type - Make catch-all mapping consistent over all mappings --- .../ukp/dkpro/core/api/lexmorph/tagset/bn-utpal-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/de-stts-pos.map | 2 +- .../dkpro/core/api/lexmorph/tagset/de-tiger-rftagger-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-arktweet-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-brown-pos.map | 4 ++-- .../ukp/dkpro/core/api/lexmorph/tagset/en-browntei-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-c5-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-medpost-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-ptb-emory-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/en-ptb-tt-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/es-conll2009-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/et-tartu-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/fa-upc-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/fa-upc-reduced-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/fr-ftb-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/fr-stein-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/it-stein-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/it-tanl-pos.map | 2 +- .../tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/ru-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/sv-suc-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/zh-ctb-pos.map | 2 +- .../ukp/dkpro/core/api/lexmorph/tagset/zh-lcmc-pos.map | 2 +- 23 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/bn-utpal-pos.map 
b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/bn-utpal-pos.map index 8089608dd1..a5c8a5509e 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/bn-utpal-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/bn-utpal-pos.map @@ -43,5 +43,5 @@ SYM=POS_SYM RDP=POS_X # UNK Unknown UNK=POS_X -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-stts-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-stts-pos.map index 6a5bbf1007..8aae1ffad8 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-stts-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-stts-pos.map @@ -12,7 +12,7 @@ __META_SOURCE_URL__=http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/ __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. 
# Catch-all rule -*=POS_X +*=POS # # $*LRB* # - diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-tiger-rftagger-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-tiger-rftagger-pos.map index c10fde96f5..3a034fd46b 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-tiger-rftagger-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/de-tiger-rftagger-pos.map @@ -38,4 +38,4 @@ VINF=POS_VERB SYM=POS_PUNCT SENT=POS_PUNCT -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-arktweet-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-arktweet-pos.map index 1ad97e69a7..b41667e576 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-arktweet-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-arktweet-pos.map @@ -3,7 +3,7 @@ __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. $=POS_NUM &=POS_CONJ -*=POS_X +*=POS ,=POS_PUNCT @=tweet.POS_AT A=POS_ADJ diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-brown-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-brown-pos.map index c410f40564..d3d9bdb0a3 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-brown-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-brown-pos.map @@ -10,7 +10,7 @@ __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. 
# right paren )=POS_X # not, n't -#*=POS_X +#*=POS # dash -=POS_X # comma @@ -185,4 +185,4 @@ NEG=POS_ADV NNSG=POS_NOUN NR$=POS_NOUN -*=POS_X \ No newline at end of file +*=POS \ No newline at end of file diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-browntei-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-browntei-pos.map index dcd1b38058..1ea33cdd7e 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-browntei-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-browntei-pos.map @@ -86,4 +86,4 @@ NRg=POS_NOUN RBT=POS_ADV UH=POS_PART WQL=POS_ADV -*=POS_X \ No newline at end of file +*=POS \ No newline at end of file diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-c5-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-c5-pos.map index 63f7cbcaf5..e8876f66bd 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-c5-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-c5-pos.map @@ -6,7 +6,7 @@ __META_SOURCE_URL__=http://www.natcorp.ox.ac.uk/docs/gramtag.html __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. # Default mapping -*=POS_X +*=POS # AJ0 Adjective (general or positive) (e.g. 
good, old, beautiful) AJ0=POS_ADJ diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map index 6cef87a51f..5a2359b395 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map @@ -5,7 +5,7 @@ __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. -*=POS_X +*=POS # punctuation mark, comma ,=POS_PUNCT -=POS_PUNCT diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-medpost-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-medpost-pos.map index 075007ef4f..b52804f7c5 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-medpost-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-medpost-pos.map @@ -124,4 +124,4 @@ VVGN=POS_VERB # ” right quote ''=POS_PUNCT -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-emory-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-emory-pos.map index 55022e9918..303ac44e58 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-emory-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-emory-pos.map @@ -7,7 +7,7 @@ __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. 
-*=POS_X +*=POS # punctuation mark, comma ,=POS_PUNCT -=POS_PUNCT diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-tt-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-tt-pos.map index a22f4e0e2e..c64ea12429 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-tt-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-ptb-tt-pos.map @@ -5,7 +5,7 @@ __META_SOURCE_URL__=http://faculty.washington.edu/dillon/GramResources/penntable.html __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. -*=POS_X +*=POS ,=POS_PUNCT -=POS_PUNCT .=POS_PUNCT diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/es-conll2009-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/es-conll2009-pos.map index 1e5a2e4b89..8e7c7d0ca2 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/es-conll2009-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/es-conll2009-pos.map @@ -5,7 +5,7 @@ __META_SOURCE_URL__=http://clic.ub.edu/corpus/webfm_send/18 __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. 
-*=POS_X +*=POS #Adjective a=POS_ADJ #Conjuction diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/et-tartu-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/et-tartu-pos.map index 94ae3f35b0..9a7f147498 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/et-tartu-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/et-tartu-pos.map @@ -584,4 +584,4 @@ Z.Osq=POS_PUNCT Z.Csq=POS_PUNCT Z.Sla=POS_PUNCT T=POS_X -*=POS_X \ No newline at end of file +*=POS \ No newline at end of file diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-pos.map index 21ace2b887..524ebaa670 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-pos.map @@ -67,5 +67,5 @@ V_COP=POS_VERB V_PRS=POS_VERB # V_SUB - Subjunctive verb V_SUB=POS_VERB -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-reduced-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-reduced-pos.map index d1139cc6e4..de83668a60 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-reduced-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fa-upc-reduced-pos.map @@ -39,4 +39,4 @@ STPOS=POS_X STR=POS_X # V - Verb V=POS_VERB -*=POS_X \ No newline at end of file +*=POS \ No newline at 
end of file diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-ftb-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-ftb-pos.map index 126dc5ccc6..5a2ed6d678 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-ftb-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-ftb-pos.map @@ -35,4 +35,4 @@ PUNC=POS_PUNCT \:=POS_PUNCT # V V=POS_VERB -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-stein-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-stein-pos.map index c5cfb61344..9693f06356 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-stein-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/fr-stein-pos.map @@ -7,7 +7,7 @@ __META_SOURCE_URL__=http://www.ims.uni-stuttgart.de/~schmid/french-tagset.html __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. 
-*=POS_X +*=POS # ABR abreviation ABR=POS_X # ADJ adjective diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-stein-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-stein-pos.map index 8c3bd03162..2cb974cef3 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-stein-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-stein-pos.map @@ -7,7 +7,7 @@ __META_SOURCE_URL__=ftp://ftp.ims.uni-stuttgart.de/pub/corpora/italian-tagset.txt __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. -*=POS_X +*=POS # ABR abbreviation ABR=POS_X # ADJ adjective diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-tanl-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-tanl-pos.map index 09c9f723bb..cbee1aba1c 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-tanl-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/it-tanl-pos.map @@ -5,7 +5,7 @@ __META_SOURCE_URL__=Source: http://medialab.di.unipi.it/wiki/Tanl_POS_Tagset __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. 
-*=POS_X +*=POS APn=POS_ADJ APp=POS_ADJ APs=POS_ADJ diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/ru-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/ru-pos.map index 267a955a7d..6773257b34 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/ru-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/ru-pos.map @@ -1,5 +1,5 @@ __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. -*=POS_X +*=POS A-NUM=POS_ADJ A-PRO=POS_PRON A=POS_ADJ diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/sv-suc-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/sv-suc-pos.map index cbd5911ada..20a89c5203 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/sv-suc-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/sv-suc-pos.map @@ -62,4 +62,4 @@ MID=POS_PUNCT PAD=POS_PUNCT # Catch-all rule -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-ctb-pos.map b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-ctb-pos.map index 31dc2f21a3..ea6bdee02c 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-ctb-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-ctb-pos.map @@ -75,4 +75,4 @@ VV=POS_VERB # X X=POS_X -*=POS_X +*=POS diff --git a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-lcmc-pos.map 
b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-lcmc-pos.map index daa2fc6b58..96002150be 100644 --- a/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-lcmc-pos.map +++ b/dkpro-core-api-lexmorph-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/zh-lcmc-pos.map @@ -6,7 +6,7 @@ __META_SOURCE_URL__=http://www.lancs.ac.uk/fass/projects/corpus/LCMC/lcmc/lcmc_t __META_TYPE_BASE__=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos. # Catch-all rule -*=POS_X +*=POS # a adjective a=POS_ADJ From a8fb1840f36d6344f194d0bfaced973b6bd7196f Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 13 Jan 2019 19:21:14 +0100 Subject: [PATCH 02/18] #858 - Out-of-tagset tags should map to the generic type - Update unit test reference data --- .../dkpro/core/hunpos/HunPosTaggerTest.java | 4 +- .../src/test/resources/FX8.xml.dump | 54 +++++++------------ .../format4-with-coref-sample.export.dump | 40 +++++--------- 3 files changed, 33 insertions(+), 65 deletions(-) diff --git a/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java b/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java index f776b67acc..b6e1668918 100644 --- a/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java +++ b/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java @@ -169,7 +169,7 @@ public void testSwedish() runTest("sv", null, "Detta är ett test .", new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "DL_MAD" }, - new String[] { "POS_X", "POS_X", "POS_X", "POS_X", "POS_X" }); + new String[] { "POS", "POS", "POS", "POS", "POS" }); runTest("sv", "paroletags", "Detta är ett test .", new String[] { "PF@NS0@S", "V@IPAS", "DI@NS@S", "NCNSN@IS", "FE" }, @@ -178,7 +178,7 @@ 
public void testSwedish() runTest("sv", "suctags", "Detta är ett test .", new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "DL_MAD" }, - new String[] { "POS_X", "POS_X", "POS_X", "POS_X", "POS_X" }); + new String[] { "POS", "POS", "POS", "POS", "POS" }); // runTest("sv", "suc2x", "Detta är ett test .", // new String[] { "PN_NEU_SIN_DEF_SUB@OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", diff --git a/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump b/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump index 957e769113..24298fb387 100644 --- a/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump +++ b/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump @@ -609,12 +609,11 @@ Token PosValue: "VVI" coarseValue: "VERB" [to] -POS_X +POS sofa: _InitialView begin: 88 end: 90 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -631,12 +630,11 @@ Token begin: 88 end: 90 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 88 end: 90 PosValue: "TO0" - coarseValue: "X" [tell] POS_VERB sofa: _InitialView @@ -841,12 +839,11 @@ Token PosValue: "VVN" coarseValue: "VERB" [to] -POS_X +POS sofa: _InitialView begin: 120 end: 122 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -863,12 +860,11 @@ Token begin: 120 end: 122 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 120 end: 122 PosValue: "TO0" - coarseValue: "X" [call] POS_VERB sofa: _InitialView @@ -1317,12 +1313,11 @@ Token PosValue: "VVD" coarseValue: "VERB" [herself] -POS_X +POS sofa: _InitialView begin: 194 end: 201 PosValue: "PNX" - coarseValue: "X" [herself] Lemma sofa: _InitialView @@ -1339,12 +1334,11 @@ Token begin: 194 end: 201 value: "herself" - pos: POS_X + pos: POS sofa: _InitialView begin: 194 end: 201 PosValue: "PNX" - coarseValue: "X" [,] POS_PUNCT sofa: _InitialView @@ -1606,12 +1600,11 @@ Token PosValue: "VVD" coarseValue: "VERB" [to] -POS_X +POS sofa: _InitialView begin: 242 end: 244 PosValue: "TO0" - 
coarseValue: "X" [to] Lemma sofa: _InitialView @@ -1628,12 +1621,11 @@ Token begin: 242 end: 244 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 242 end: 244 PosValue: "TO0" - coarseValue: "X" [buy] POS_VERB sofa: _InitialView @@ -1693,12 +1685,11 @@ Token PosValue: "PNI" coarseValue: "PRON" [herself] -POS_X +POS sofa: _InitialView begin: 260 end: 267 PosValue: "PNX" - coarseValue: "X" [herself] Lemma sofa: _InitialView @@ -1715,12 +1706,11 @@ Token begin: 260 end: 267 value: "herself" - pos: POS_X + pos: POS sofa: _InitialView begin: 260 end: 267 PosValue: "PNX" - coarseValue: "X" [,] POS_PUNCT sofa: _InitialView @@ -2482,12 +2472,11 @@ Token PosValue: "VVB" coarseValue: "VERB" [to] -POS_X +POS sofa: _InitialView begin: 392 end: 394 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -2504,12 +2493,11 @@ Token begin: 392 end: 394 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 392 end: 394 PosValue: "TO0" - coarseValue: "X" [go] POS_VERB sofa: _InitialView @@ -2848,12 +2836,11 @@ Token PosValue: "NN1" coarseValue: "NOUN" [to] -POS_X +POS sofa: _InitialView begin: 444 end: 446 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -2870,12 +2857,11 @@ Token begin: 444 end: 446 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 444 end: 446 PosValue: "TO0" - coarseValue: "X" [go] POS_VERB sofa: _InitialView @@ -3324,12 +3310,11 @@ Token PosValue: "VVG" coarseValue: "VERB" [na] -POS_X +POS sofa: _InitialView begin: 502 end: 504 PosValue: "TO0" - coarseValue: "X" [na] Lemma sofa: _InitialView @@ -3346,12 +3331,11 @@ Token begin: 502 end: 504 value: "na" - pos: POS_X + pos: POS sofa: _InitialView begin: 502 end: 504 PosValue: "TO0" - coarseValue: "X" [get] POS_VERB sofa: _InitialView @@ -3411,12 +3395,11 @@ Token PosValue: "PRP" coarseValue: "ADP" [up] -POS_X +POS sofa: _InitialView begin: 512 end: 514 PosValue: "AVP" - coarseValue: "X" [up] Lemma sofa: _InitialView @@ -3433,12 +3416,11 @@ Token begin: 
512 end: 514 value: "up" - pos: POS_X + pos: POS sofa: _InitialView begin: 512 end: 514 PosValue: "AVP" - coarseValue: "X" [to] POS_ADP sofa: _InitialView diff --git a/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump b/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump index a4ab4582a3..c86d22cd37 100644 --- a/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump +++ b/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump @@ -45,12 +45,11 @@ Constituent children: FSArray syntacticFunction: "--" [A] -POS_X +POS sofa: _InitialView begin: 0 end: 1 PosValue: "P1" - coarseValue: "X" [A] Lemma sofa: _InitialView @@ -87,12 +86,11 @@ Token begin: 0 end: 1 value: "A" - pos: POS_X + pos: POS sofa: _InitialView begin: 0 end: 1 PosValue: "P1" - coarseValue: "X" [A] Constituent sofa: _InitialView @@ -115,12 +113,11 @@ Constituent children: FSArray syntacticFunction: "-" [B] -POS_X +POS sofa: _InitialView begin: 2 end: 3 PosValue: "P2" - coarseValue: "X" [B] Lemma sofa: _InitialView @@ -150,19 +147,17 @@ Token begin: 2 end: 3 value: "B" - pos: POS_X + pos: POS sofa: _InitialView begin: 2 end: 3 PosValue: "P2" - coarseValue: "X" [C] -POS_X +POS sofa: _InitialView begin: 4 end: 5 PosValue: "P3" - coarseValue: "X" [C] Lemma sofa: _InitialView @@ -185,12 +180,11 @@ Token begin: 4 end: 5 value: "C" - pos: POS_X + pos: POS sofa: _InitialView begin: 4 end: 5 PosValue: "P3" - coarseValue: "X" [D E F 20] Constituent sofa: _InitialView @@ -206,12 +200,11 @@ Constituent children: FSArray syntacticFunction: "--" [D] -POS_X +POS sofa: _InitialView begin: 6 end: 7 PosValue: "P4" - coarseValue: "X" [D] Lemma sofa: _InitialView @@ -248,12 +241,11 @@ Token begin: 6 end: 7 value: "D" - pos: POS_X + pos: POS sofa: _InitialView begin: 6 end: 7 PosValue: "P4" - coarseValue: "X" [D] Constituent sofa: _InitialView @@ -276,12 +268,11 @@ Constituent children: FSArray syntacticFunction: 
"HD" [E] -POS_X +POS sofa: _InitialView begin: 8 end: 9 PosValue: "P5" - coarseValue: "X" [E] Lemma sofa: _InitialView @@ -304,12 +295,11 @@ Token begin: 8 end: 9 value: "E" - pos: POS_X + pos: POS sofa: _InitialView begin: 8 end: 9 PosValue: "P5" - coarseValue: "X" [F 20] Constituent sofa: _InitialView @@ -332,12 +322,11 @@ Constituent children: FSArray syntacticFunction: "-" [F] -POS_X +POS sofa: _InitialView begin: 10 end: 11 PosValue: "P6" - coarseValue: "X" [F] Lemma sofa: _InitialView @@ -381,12 +370,11 @@ Token begin: 10 end: 11 value: "F" - pos: POS_X + pos: POS sofa: _InitialView begin: 10 end: 11 PosValue: "P6" - coarseValue: "X" [F] Constituent sofa: _InitialView @@ -501,6 +489,4 @@ Constituent syntacticFunction: "-" -------- View _InitialView end ---------------------------------- -======== CAS 0 end ================================== - - +======== CAS 0 end ================================== \ No newline at end of file From 0f3c13d1fd260a50d1d9a1950aa4190c9ed9885d Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 15 Jan 2019 13:11:54 +0100 Subject: [PATCH 03/18] #1323 - File extension generated by BinaryCasWriter does not contain dot - Add missing dot --- .../tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java b/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java index 4117f7f769..fec63b0213 100644 --- a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java +++ b/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java @@ -212,7 +212,7 @@ public void initialize(UimaContext aContext) if (AUTO.equals(filenameExtension)) { try { - filenameExtension = SerialFormat.valueOf(format).getDefaultFileExtension(); + filenameExtension = "." 
+ SerialFormat.valueOf(format).getDefaultFileExtension(); } catch (IllegalArgumentException e) { filenameExtension = ".bin"; From ceb749a82e15dc538793ae3fa0c06826dadbde24 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 12 Feb 2019 23:02:37 +0900 Subject: [PATCH 04/18] #1325 - Avoid datasets being extracted outside their target directory - Check that extracted files are under the target directory --- .../core/api/datasets/DatasetLoader.java | 15 +++++++-- .../datasets/internal/actions/Explode.java | 33 +++++++++++++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java index 4ae2138c07..c17183091e 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java +++ b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java @@ -24,6 +24,7 @@ import java.io.InputStream; import java.net.URL; import java.net.URLConnection; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -250,12 +251,20 @@ private void extract(File aArchive, ArchiveInputStream aArchiveStream, File aTar throw new IllegalStateException("Filename must not contain line break"); } - File out = new File(aTarget, name); + Path base = aTarget.toPath().toAbsolutePath(); + Path out = base.resolve(name).toAbsolutePath(); + + if (!out.startsWith(base)) { + // Ignore attempts to write outside the base + continue; + } + if (entry.isDirectory()) { - FileUtils.forceMkdir(out); + FileUtils.forceMkdir(out.toFile()); } else { - FileUtils.copyInputStreamToFile(new CloseShieldInputStream(aArchiveStream), out); + FileUtils.copyInputStreamToFile(new 
CloseShieldInputStream(aArchiveStream), + out.toFile()); } } } diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java index 37758aefe1..b3cb3ccb77 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java +++ b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java @@ -103,7 +103,7 @@ private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - String base = getBase(aCachedFile.getFileName().toString()); + Path base = Paths.get(getBase(aCachedFile.getFileName().toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -122,7 +122,13 @@ private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget } if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name); + Path out = aTarget.resolve(base).resolve(name).toAbsolutePath(); + if (!out.startsWith(base)) { + throw new IOException( + "Archive tries to generate file outside target folder: [" + name + + "]"); + } + if (entry.isDirectory()) { Files.createDirectories(out); } @@ -144,7 +150,7 @@ private void extractRar(ActionDescription aAction, Path aCachedFile, Path aTarge throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - String base = getBase(aCachedFile.getFileName().toString()); + Path base = Paths.get(getBase(aCachedFile.getFileName().toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? 
(int) cfg.get("strip") : 0; @@ -163,7 +169,13 @@ private void extractRar(ActionDescription aAction, Path aCachedFile, Path aTarge } if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name); + Path out = aTarget.resolve(base).resolve(name).toAbsolutePath(); + if (!out.startsWith(base)) { + throw new IOException( + "Archive tries to generate file outside target folder: [" + name + + "]"); + } + if (fh.isDirectory()) { Files.createDirectories(out); } @@ -185,7 +197,7 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea throws IOException { // We always extract archives into a subfolder. Figure out the name of the folder. - String base = getBase(aArchive.getFileName().toString()); + Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -203,7 +215,12 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea } if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name); + Path out = aTarget.resolve(base).resolve(name).toAbsolutePath(); + if (!out.startsWith(base)) { + throw new IOException( + "Archive tries to generate file outside target folder: [" + name + "]"); + } + if (entry.isDirectory()) { Files.createDirectories(out); } @@ -217,6 +234,10 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea private String stripLeadingFolders(String aName, int aLevels) { + if (aName == null) { + return null; + } + if (aLevels > 0) { Path p = Paths.get(aName); if (p.getNameCount() <= aLevels) { From 0d7048e7130ddc901445281b777207d97f74e664 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 12 Feb 2019 23:16:24 +0900 Subject: [PATCH 05/18] #1325 - Avoid datasets being extracted outside their target directory - Fix extraction of base path from archive file location --- 
.../api/datasets/internal/actions/Explode.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java index b3cb3ccb77..53e95fff66 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java +++ b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java @@ -99,11 +99,11 @@ public void apply(ActionDescription aAction, DatasetDescription aDataset, } } - private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget) + private void extract7z(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = Paths.get(getBase(aCachedFile.getFileName().toString())).toAbsolutePath(); + Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? 
(int) cfg.get("strip") : 0; @@ -111,7 +111,7 @@ private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), coerceToList(cfg.get("excludes"))); - try (SevenZFile archive = new SevenZFile(aCachedFile.toFile())) { + try (SevenZFile archive = new SevenZFile(aArchive.toFile())) { SevenZArchiveEntry entry = archive.getNextEntry(); while (entry != null) { String name = stripLeadingFolders(entry.getName(), strip); @@ -146,11 +146,11 @@ private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget } } - private void extractRar(ActionDescription aAction, Path aCachedFile, Path aTarget) + private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = Paths.get(getBase(aCachedFile.getFileName().toString())).toAbsolutePath(); + Path base = Paths.get(getBase(aArchive.toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -158,7 +158,7 @@ private void extractRar(ActionDescription aAction, Path aCachedFile, Path aTarge AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), coerceToList(cfg.get("excludes"))); - try (Archive archive = new Archive(new FileVolumeManager(aCachedFile.toFile()))) { + try (Archive archive = new Archive(new FileVolumeManager(aArchive.toFile()))) { FileHeader fh = archive.nextFileHeader(); while (fh != null) { String name = stripLeadingFolders(fh.getFileNameString(), strip); @@ -197,7 +197,7 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea throws IOException { // We always extract archives into a subfolder. Figure out the name of the folder. 
- Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); + Path base = Paths.get(getBase(aArchive.toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; From 309ee31088b4a4a70e3104a82bc3f628b757b0dc Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 18 Feb 2019 06:11:28 +0100 Subject: [PATCH 06/18] #1327 - Update LIF support - Update LIF dependencies - Support LIF with and without envelope - Option do disable generation of timestamp metadata - Re-structure the code --- dkpro-core-io-lif-asl/pom.xml | 22 +- .../ukp/dkpro/core/io/lif/LifReader.java | 11 +- .../ukp/dkpro/core/io/lif/LifWriter.java | 35 +- .../dkpro/core/io/lif/internal/DKPro2Lif.java | 114 +- .../dkpro/core/io/lif/internal/Lif2DKPro.java | 305 ++- .../core/io/lif/LifReaderWriterTest.java | 40 +- .../ukp/dkpro/core/io/lif/LifWriterTest.java | 9 +- .../src/test/resources/README.txt | 5 + .../conll/2006/{fi-ref.json => fi-ref.lif} | 1 + ...e-ref.json => dependencystructure-ref.lif} | 1 + ...structure.json => dependencystructure.lif} | 0 ...cture-ref.json => phrasestructure-ref.lif} | 1 + ...rasestructure.json => phrasestructure.lif} | 0 .../resources/lif/stanford-pos-massaged.lif | 1 + .../test/resources/lif/stanford-pos-ref.lif | 1785 +++++++++++++ .../src/test/resources/lif/stanford-pos.lif | 2237 +++++++++++++++++ 16 files changed, 4361 insertions(+), 206 deletions(-) rename dkpro-core-io-lif-asl/src/test/resources/conll/2006/{fi-ref.json => fi-ref.lif} (99%) rename dkpro-core-io-lif-asl/src/test/resources/lif/{dependencystructure-ref.json => dependencystructure-ref.lif} (98%) rename dkpro-core-io-lif-asl/src/test/resources/lif/{dependencystructure.json => dependencystructure.lif} (100%) rename dkpro-core-io-lif-asl/src/test/resources/lif/{phrasestructure-ref.json => phrasestructure-ref.lif} (98%) rename dkpro-core-io-lif-asl/src/test/resources/lif/{phrasestructure.json => 
phrasestructure.lif} (100%) create mode 100644 dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif create mode 100644 dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif create mode 100644 dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif diff --git a/dkpro-core-io-lif-asl/pom.xml b/dkpro-core-io-lif-asl/pom.xml index 662c76f25f..e12a8b2fa9 100644 --- a/dkpro-core-io-lif-asl/pom.xml +++ b/dkpro-core-io-lif-asl/pom.xml @@ -73,23 +73,25 @@ org.lappsgrid serialization - 2.3.0 + 2.6.0 org.lappsgrid vocabulary - 2.3.0 + 2.4.1 org.lappsgrid discriminator - 2.2.1 + 2.3.3 + it.unimi.dsi fastutil @@ -103,6 +105,12 @@ junit test + + net.javacrumbs.json-unit + json-unit + 2.4.0 + test + de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.testing-asl @@ -114,6 +122,7 @@ test + org.codehaus.groovy:groovy-all @@ -133,4 +138,5 @@ + --> diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java index 1ecb71e58e..1c25638024 100644 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java @@ -27,6 +27,7 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.lappsgrid.serialization.DataContainer; import org.lappsgrid.serialization.Serializer; import org.lappsgrid.serialization.lif.Container; @@ -72,7 +73,15 @@ public void getNext(JCas aJCas) Container container; try (InputStream is = res.getInputStream()) { String json = IOUtils.toString(res.getInputStream(), sourceEncoding); - container = Serializer.parse(json, Container.class); + try { + // First try parsing without the wire wrapper. 
+ container = Serializer.parse(json, Container.class); + } + catch (Exception e) { + // If that fails, it might be because there is a wire wrapper around the actual + // data, so let's try that. + container = (Container) Serializer.parse(json, DataContainer.class).getPayload(); + } } new Lif2DKPro().convert(container, aJCas); diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java index de11f988ac..b18c69effe 100644 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java @@ -26,8 +26,10 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.lappsgrid.serialization.DataContainer; import org.lappsgrid.serialization.Serializer; import org.lappsgrid.serialization.lif.Container; +import org.lappsgrid.serialization.lif.View; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; @@ -62,14 +64,29 @@ public class LifWriter private String targetEncoding; /** - * Specify the suffix of output files. Default value .json. If the suffix is not + * Specify the suffix of output files. Default value .lif. If the suffix is not * needed, provide an empty string as value. */ public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".json") + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".lif") private String filenameSuffix; + /** + * Write timestamp to view. 
+ */ + public static final String PARAM_WRITE_TIMESTAMP = "writeTimestamp"; + @ConfigurationParameter(name = PARAM_WRITE_TIMESTAMP, mandatory = true, defaultValue = "true") + private boolean writeTimestamp; + + + /** + * Wrap as data object. + */ + public static final String PARAM_ADD_ENVELOPE = "wrapAsDataObject"; + @ConfigurationParameter(name = PARAM_ADD_ENVELOPE, mandatory = true, defaultValue = "false") + private boolean wrapAsDataObject; + @Override public void process(JCas aJCas) throws AnalysisEngineProcessException @@ -79,8 +96,20 @@ public void process(JCas aJCas) new DKPro2Lif().convert(aJCas, container); + // Clear timestamp if requested. + if (!writeTimestamp) { + for (View view : container.getViews()) { + view.setTimestamp(null); + } + } + + Object finalOutputObject = container; + if (wrapAsDataObject) { + finalOutputObject = new DataContainer(container); + } + try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { - String json = Serializer.toPrettyJson(container); + String json = Serializer.toPrettyJson(finalOutputObject); IOUtils.write(json, docOS, targetEncoding); } catch (Exception e) { diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java index 3c4d140c44..6ae657d549 100644 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java @@ -65,69 +65,101 @@ public void convert(JCas aJCas, Container container) // Paragraph for (Paragraph p : select(aJCas, Paragraph.class)) { - view.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(), - p.getEnd()); + convertParagraph(view, p); } // Sentence for (Sentence s : select(aJCas, Sentence.class)) { - view.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(), 
- s.getEnd()); + convertSentence(view, s); } // Token, POS, Lemma for (Token t : select(aJCas, Token.class)) { - Annotation a = view.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN, t.getBegin(), - t.getEnd()); - if (t.getPos() != null) { - a.addFeature(Features.Token.POS, t.getPos().getPosValue()); - } - - if (t.getLemma() != null) { - a.addFeature(Features.Token.LEMMA, t.getLemma().getValue()); - } + convertToken(view, t); } // NamedEntity - for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) { - Annotation ne = view.newAnnotation(id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE, - neAnno.getBegin(), neAnno.getEnd()); - ne.setLabel(neAnno.getValue()); + for (NamedEntity ne : select(aJCas, NamedEntity.class)) { + convertNamedEntity(view, ne); } - // Dependency + // Dependencies for (Sentence s : select(aJCas, Sentence.class)) { - Set depRelIds = new TreeSet<>(); - - for (Dependency dep : selectCovered(Dependency.class, s)) { - String depRelId = id(DEPENDENCY, dep); - // LAPPS dependencies inherit from Relation which has no offsets - Annotation depRel = view.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY); - depRel.setLabel(dep.getDependencyType()); - depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor())); - depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent())); - depRelIds.add(depRelId); - } - - if (!depRelIds.isEmpty()) { - Annotation depStruct = view.newAnnotation(id(DEPENDENCY_STRUCTURE, s), - Discriminators.Uri.DEPENDENCY_STRUCTURE, s.getBegin(), s.getEnd()); - depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds); - } + convertDependencies(view, s); } // Constituents for (ROOT r : select(aJCas, ROOT.class)) { - Set constituents = new LinkedHashSet<>(); - convertConstituent(view, r, constituents); - - Annotation phraseStruct = view.newAnnotation(id(PHRASE_STRUCTURE, r), - Discriminators.Uri.PHRASE_STRUCTURE, r.getBegin(), r.getEnd()); - 
phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents); + convertConstituents(view, r); } } + + private void convertParagraph(View aTarget, Paragraph aParagraph) + { + aTarget.newAnnotation(id(PARAGRAPH, aParagraph), Discriminators.Uri.PARAGRAPH, + aParagraph.getBegin(), aParagraph.getEnd()); + } + + private void convertSentence(View aTarget, Sentence aSentence) + { + aTarget.newAnnotation(id(SENTENCE, aSentence), Discriminators.Uri.SENTENCE, + aSentence.getBegin(), aSentence.getEnd()); + } + + private void convertToken(View aTarget, Token aToken) + { + Annotation a = aTarget.newAnnotation(id(TOKEN, aToken), Discriminators.Uri.TOKEN, + aToken.getBegin(), aToken.getEnd()); + if (aToken.getPos() != null) { + a.addFeature(Features.Token.POS, aToken.getPos().getPosValue()); + } + + if (aToken.getLemma() != null) { + a.addFeature(Features.Token.LEMMA, aToken.getLemma().getValue()); + } + } + + private void convertNamedEntity(View aTarget, NamedEntity aNamedEntity) + { + Annotation ne = aTarget.newAnnotation(id(NAMED_ENTITY, aNamedEntity), Discriminators.Uri.NE, + aNamedEntity.getBegin(), aNamedEntity.getEnd()); + ne.setLabel(aNamedEntity.getValue()); + } + + private void convertDependencies(View aView, Sentence aSentence) + { + Set depRelIds = new TreeSet<>(); + + for (Dependency dep : selectCovered(Dependency.class, aSentence)) { + String depRelId = id(DEPENDENCY, dep); + // LAPPS dependencies inherit from Relation which has no offsets + Annotation depRel = aView.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY); + depRel.setLabel(dep.getDependencyType()); + depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor())); + depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent())); + depRelIds.add(depRelId); + } + if (!depRelIds.isEmpty()) { + Annotation depStruct = aView.newAnnotation(id(DEPENDENCY_STRUCTURE, aSentence), + Discriminators.Uri.DEPENDENCY_STRUCTURE, aSentence.getBegin(), + 
aSentence.getEnd()); + depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds); + } + } + + private void convertConstituents(View aTarget, ROOT aRootConstituent) + { + Set constituents = new LinkedHashSet<>(); + convertConstituent(aTarget, aRootConstituent, constituents); + + Annotation phraseStruct = aTarget.newAnnotation(id(PHRASE_STRUCTURE, aRootConstituent), + Discriminators.Uri.PHRASE_STRUCTURE, aRootConstituent.getBegin(), + aRootConstituent.getEnd()); + phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents); + } + private void convertConstituent(View aView, org.apache.uima.jcas.tcas.Annotation aNode, Set aConstituents) { diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java index f28a759112..821db0cbe7 100644 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java @@ -50,8 +50,12 @@ public class Lif2DKPro { + private Map tokenIdx; + public void convert(Container aContainer, JCas aJCas) { + tokenIdx = new HashMap<>(); + aJCas.setDocumentLanguage(aContainer.getLanguage()); aJCas.setDocumentText(aContainer.getText()); @@ -60,169 +64,192 @@ public void convert(Container aContainer, JCas aJCas) // Paragraph view.getAnnotations().stream() .filter(a -> Discriminators.Uri.PARAGRAPH.equals(a.getAtType())) - .forEach(para -> { - Paragraph paraAnno = new Paragraph(aJCas, para.getStart().intValue(), - para.getEnd().intValue()); - paraAnno.addToIndexes(); - }); + .forEach(para -> convertParagraph(aJCas, para)); // Sentence view.getAnnotations().stream() .filter(a -> Discriminators.Uri.SENTENCE.equals(a.getAtType())) - .forEach(sent -> { - Sentence sentAnno = new Sentence(aJCas, sent.getStart().intValue(), - 
sent.getEnd().intValue()); - sentAnno.addToIndexes(); - }); + .forEach(sent -> convertSentence(aJCas, sent)); - Map tokenIdx = new HashMap<>(); - - // Token, POS, Lemma + // Token, POS, Lemma (builds token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.TOKEN.equals(a.getAtType())) - .forEach(token -> { - Token tokenAnno = new Token(aJCas, token.getStart().intValue(), token - .getEnd().intValue()); - String pos = token.getFeature(Features.Token.POS); - String lemma = token.getFeature(Features.Token.LEMMA); - - if (isNotEmpty(pos)) { - POS posAnno = new POS(aJCas, tokenAnno.getBegin(), tokenAnno.getEnd()); - posAnno.setPosValue(pos != null ? pos.intern() : null); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - tokenAnno.setPos(posAnno); - } - - if (isNotEmpty(lemma)) { - Lemma lemmaAnno = new Lemma(aJCas, tokenAnno.getBegin(), tokenAnno.getEnd()); - lemmaAnno.setValue(lemma); - lemmaAnno.addToIndexes(); - tokenAnno.setLemma(lemmaAnno); - } - - tokenAnno.addToIndexes(); - tokenIdx.put(token.getId(), tokenAnno); - }); + .forEach(token -> convertToken(aJCas, token)); // NamedEntity view.getAnnotations().stream() .filter(a -> Discriminators.Uri.NE.equals(a.getAtType())) - .forEach(ne -> { - NamedEntity neAnno = new NamedEntity(aJCas, ne.getStart().intValue(), - ne.getEnd().intValue()); - neAnno.setValue(ne.getLabel()); - neAnno.addToIndexes(); - }); + .forEach(ne -> convertNamedEntity(aJCas, ne)); - // Dependencies + // Dependencies (requires token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.DEPENDENCY.equals(a.getAtType())) - .forEach(dep -> { - String dependent = dep.getFeature(Features.Dependency.DEPENDENT); - String governor = dep.getFeature(Features.Dependency.GOVERNOR); - - if (isEmpty(governor) || governor.equals(dependent)) { - ROOT depAnno = new ROOT(aJCas); - depAnno.setDependencyType(dep.getLabel()); - depAnno.setDependent(tokenIdx.get(dependent)); - 
depAnno.setGovernor(tokenIdx.get(dependent)); - depAnno.setBegin(depAnno.getDependent().getBegin()); - depAnno.setEnd(depAnno.getDependent().getEnd()); - depAnno.addToIndexes(); - } - else { - Dependency depAnno = new Dependency(aJCas); - depAnno.setDependencyType(dep.getLabel()); - depAnno.setDependent(tokenIdx.get(dependent)); - depAnno.setGovernor(tokenIdx.get(governor)); - depAnno.setBegin(depAnno.getDependent().getBegin()); - depAnno.setEnd(depAnno.getDependent().getEnd()); - depAnno.addToIndexes(); - } - }); + .forEach(dep -> convertDependency(aJCas, dep)); - // Constituents + // Constituents (requires token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.PHRASE_STRUCTURE.equals(a.getAtType())) - .forEach(ps -> { - String rootId = findRoot(view, ps); - // Get the constituent IDs - Set constituentIDs; - constituentIDs = new HashSet<>( - getSetFeature(ps,Features.PhraseStructure.CONSTITUENTS)); - - List constituents = new ArrayList<>(); - Map constituentIdx = new HashMap<>(); + .forEach(ps -> convertConstituents(aJCas, view, ps)); + } + + private Object convertConstituents(JCas aJCas, View view, Annotation ps) + { + String rootId = findRoot(view, ps); + // Get the constituent IDs + Set constituentIDs = new HashSet<>( + getSetFeature(ps, Features.PhraseStructure.CONSTITUENTS)); - // Instantiate all the constituents - view.getAnnotations().stream() - .filter(a -> constituentIDs.contains(a.getId())) - .forEach(con -> { - if (Discriminators.Uri.CONSTITUENT.equals(con.getAtType())) { - Constituent conAnno; - if (rootId.equals(con.getId())) { - conAnno = new de.tudarmstadt.ukp.dkpro.core.api.syntax.type. 
- constituent.ROOT(aJCas); - } - else { - conAnno = new Constituent(aJCas); - } - if (con.getStart() != null) { - conAnno.setBegin(con.getStart().intValue()); - } - if (con.getEnd() != null) { - conAnno.setEnd(con.getEnd().intValue()); - } - conAnno.setConstituentType(con.getLabel()); - constituentIdx.put(con.getId(), conAnno); - constituents.add(con); - } - // If it is not a constituent, it must be a token ID - we already - // have created the tokens and recorded them in the tokenIdx - }); - - // Set parent and children features - constituents.forEach(con -> { - // Check if it is a constituent or token - Constituent conAnno = constituentIdx.get(con.getId()); - Set childIDs = getSetFeature(con, - Features.Constituent.CHILDREN); - - List children = new ArrayList<>(); - childIDs.forEach(childID -> { - Constituent conChild = constituentIdx.get(childID); - Token tokenChild = tokenIdx.get(childID); - if (conChild != null && tokenChild == null) { - conChild.setParent(conAnno); - children.add(conChild); + List constituents = new ArrayList<>(); + Map constituentIdx = new HashMap<>(); + + // Instantiate all the constituents + view.getAnnotations().stream().filter(a -> constituentIDs.contains(a.getId())) + .forEach(con -> { + if (Discriminators.Uri.CONSTITUENT.equals(con.getAtType())) { + Constituent conAnno; + if (rootId.equals(con.getId())) { + conAnno = new + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT( + aJCas); } - else if (conChild == null && tokenChild != null) { - tokenChild.setParent(conAnno); - children.add(tokenChild); + else { + conAnno = new Constituent(aJCas); } - else if (conChild == null && tokenChild == null) { - throw new IllegalStateException("ID [" + con.getId() - + "] not found"); + if (con.getStart() != null) { + conAnno.setBegin(con.getStart().intValue()); } - else { - throw new IllegalStateException("ID [" + con.getId() - + "] is constituent AND token? 
Impossible!"); + if (con.getEnd() != null) { + conAnno.setEnd(con.getEnd().intValue()); } - }); - - conAnno.setChildren(FSCollectionFactory.createFSArray(aJCas, children)); - }); - - // Percolate offsets - they might not have been set on the constituents! - Constituent root = constituentIdx.get(rootId); - percolateOffsets(root); - - // Add to indexes - constituentIdx.values().forEach(conAnno -> { - conAnno.addToIndexes(); + conAnno.setConstituentType(con.getLabel()); + constituentIdx.put(con.getId(), conAnno); + constituents.add(con); + } + // If it is not a constituent, it must be a token ID - we already + // have created the tokens and recorded them in the tokenIdx }); + + // Set parent and children features + constituents.forEach(con -> { + // Check if it is a constituent or token + Constituent conAnno = constituentIdx.get(con.getId()); + Set childIDs = getSetFeature(con, Features.Constituent.CHILDREN); + + List children = new ArrayList<>(); + childIDs.forEach(childID -> { + Constituent conChild = constituentIdx.get(childID); + Token tokenChild = tokenIdx.get(childID); + if (conChild != null && tokenChild == null) { + conChild.setParent(conAnno); + children.add(conChild); + } + else if (conChild == null && tokenChild != null) { + tokenChild.setParent(conAnno); + children.add(tokenChild); + } + else if (conChild == null && tokenChild == null) { + throw new IllegalStateException("ID [" + con.getId() + "] not found"); + } + else { + throw new IllegalStateException( + "ID [" + con.getId() + "] is constituent AND token? Impossible!"); + } }); + + conAnno.setChildren(FSCollectionFactory.createFSArray(aJCas, children)); + }); + + // Percolate offsets - they might not have been set on the constituents! 
+ Constituent root = constituentIdx.get(rootId); + percolateOffsets(root); + + // Add to indexes + constituentIdx.values().forEach(conAnno -> { + conAnno.addToIndexes(); + }); + + return root; + } + + private Paragraph convertParagraph(JCas aTarget, Annotation aParagraph) + { + Paragraph paragraph = new Paragraph(aTarget, aParagraph.getStart().intValue(), + aParagraph.getEnd().intValue()); + paragraph.addToIndexes(); + return paragraph; + } + + private Sentence convertSentence(JCas aTarget, Annotation aSentence) + { + Sentence sentence = new Sentence(aTarget, aSentence.getStart().intValue(), + aSentence.getEnd().intValue()); + sentence.addToIndexes(); + return sentence; + } + + private Token convertToken(JCas aTarget, Annotation aToken) + { + Token token = new Token(aTarget, aToken.getStart().intValue(), aToken + .getEnd().intValue()); + String pos = aToken.getFeature(Features.Token.POS); + String lemma = aToken.getFeature(Features.Token.LEMMA); + + if (isNotEmpty(pos)) { + POS posAnno = new POS(aTarget, token.getBegin(), token.getEnd()); + posAnno.setPosValue(pos != null ? 
pos.intern() : null); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + token.setPos(posAnno); + } + + if (isNotEmpty(lemma)) { + Lemma lemmaAnno = new Lemma(aTarget, token.getBegin(), token.getEnd()); + lemmaAnno.setValue(lemma); + lemmaAnno.addToIndexes(); + token.setLemma(lemmaAnno); + } + + token.addToIndexes(); + tokenIdx.put(token.getId(), token); + + return token; + } + + private NamedEntity convertNamedEntity(JCas aTarget, Annotation aNamedEntity) + { + NamedEntity neAnno = new NamedEntity(aTarget, aNamedEntity.getStart().intValue(), + aNamedEntity.getEnd().intValue()); + neAnno.setValue(aNamedEntity.getLabel()); + neAnno.addToIndexes(); + return neAnno; + } + + private Dependency convertDependency(JCas aTarget, Annotation aDependency) + { + String dependent = aDependency.getFeature(Features.Dependency.DEPENDENT); + String governor = aDependency.getFeature(Features.Dependency.GOVERNOR); + + Dependency depAnno; + if (isEmpty(governor) || governor.equals(dependent)) { + depAnno = new ROOT(aTarget); + depAnno.setDependencyType(aDependency.getLabel()); + depAnno.setDependent(tokenIdx.get(dependent)); + depAnno.setGovernor(tokenIdx.get(dependent)); + depAnno.setBegin(depAnno.getDependent().getBegin()); + depAnno.setEnd(depAnno.getDependent().getEnd()); + depAnno.addToIndexes(); + } + else { + depAnno = new Dependency(aTarget); + depAnno.setDependencyType(aDependency.getLabel()); + depAnno.setDependent(tokenIdx.get(dependent)); + depAnno.setGovernor(tokenIdx.get(governor)); + depAnno.setBegin(depAnno.getDependent().getBegin()); + depAnno.setEnd(depAnno.getDependent().getEnd()); + depAnno.addToIndexes(); + } + + return depAnno; } @SuppressWarnings("unchecked") diff --git a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java b/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java index 7cb7e4372a..a416af5935 100644 --- 
a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java +++ b/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java @@ -19,6 +19,8 @@ import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import org.junit.Rule; import org.junit.Test; @@ -32,9 +34,23 @@ public void roundTrip() throws Exception { testRoundTrip( - LifReader.class, // the reader - LifWriter.class, // the writer - "conll/2006/fi-ref.json"); // the input also used as output reference + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "conll/2006/fi-ref.lif"); // the input also used as output reference + } + + @Test + public void authenticPosLifFileWithWrapper() + throws Exception + { + testOneWay( + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false, + LifWriter.PARAM_ADD_ENVELOPE, true), + "lif/stanford-pos-ref.lif", // the reference file for the output + "lif/stanford-pos.lif"); // the input file for the test } @Test @@ -42,10 +58,11 @@ public void oneDependencyStructure() throws Exception { testOneWay( - LifReader.class, // the reader - LifWriter.class, // the writer - "lif/dependencystructure-ref.json", // the reference file for the output - "lif/dependencystructure.json"); // the input file for the test + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "lif/dependencystructure-ref.lif", // the reference file for the 
output + "lif/dependencystructure.lif"); // the input file for the test } @Test @@ -53,10 +70,11 @@ public void onePhraseStructure() throws Exception { testOneWay( - LifReader.class, // the reader - LifWriter.class, // the writer - "lif/phrasestructure-ref.json", // the reference file for the output - "lif/phrasestructure.json"); // the input file for the test + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "lif/phrasestructure-ref.lif", // the reference file for the output + "lif/phrasestructure.lif"); // the input file for the test } @Rule diff --git a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java b/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java index 186ab21a04..24e0e2e7dd 100644 --- a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java +++ b/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java @@ -18,6 +18,8 @@ package de.tudarmstadt.ukp.dkpro.core.io.lif; import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import org.junit.Rule; import org.junit.Test; @@ -32,9 +34,10 @@ public void oneWay() throws Exception { testOneWay( - Conll2006Reader.class, // the reader - LifWriter.class, // the writer - "conll/2006/fi-ref.json", // the reference file for the output + createReaderDescription(Conll2006Reader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "conll/2006/fi-ref.lif", // the reference file for the output "conll/2006/fi-orig.conll"); // the input file for the test } diff --git 
a/dkpro-core-io-lif-asl/src/test/resources/README.txt b/dkpro-core-io-lif-asl/src/test/resources/README.txt index 0044e01920..44144dd22c 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/README.txt +++ b/dkpro-core-io-lif-asl/src/test/resources/README.txt @@ -9,4 +9,9 @@ src/test/resources/conll/2006/fi-orig.conll http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/ftb3.1.conllx.gz Creative Commons Attribution 3.0 License + +src/test/resources/lif/stanford-pos.lif + + Obtained from LAPPSGrid using Stanford Parser component + Text is title and abstract from: https://www.ncbi.nlm.nih.gov/pubmed/10025748 \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.json b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif similarity index 99% rename from dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.json rename to dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif index c71fc08d60..ba17bcc7d1 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.json +++ b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif @@ -6,6 +6,7 @@ "@language" : "x-unspecified" }, "views" : [ { + "id" : "v1", "metadata" : { }, "annotations" : [ { "id" : "sent-0", diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.json b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif similarity index 98% rename from dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.json rename to dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif index bfaa9d31f8..6aef011486 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.json +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif @@ -6,6 +6,7 @@ "@language" : "en" }, "views" : [ { + "id" : "v1", "metadata" : { }, "annotations" : [ { "id" : "sent-0", diff --git 
a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.json b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.lif similarity index 100% rename from dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.json rename to dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.lif diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.json b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif similarity index 98% rename from dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.json rename to dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif index 72f83c2067..348c2a6fdd 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.json +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif @@ -6,6 +6,7 @@ "@language" : "en" }, "views" : [ { + "id" : "v1", "metadata" : { }, "annotations" : [ { "id" : "sent-0", diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.json b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.lif similarity index 100% rename from dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.json rename to dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.lif diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif new file mode 100644 index 0000000000..68a37fb082 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif @@ -0,0 +1 @@ +{"@context":"http://vocab.lappsgrid.org/context-1.0.0.jsonld","metadata":{"sourceid":"10025748","sourcedb":"PubMed"},"text":{"@value":"Actinically degenerate elastic tissue is the likely antigenic basis of actinic granuloma of the skin and of temporal arteritis.\nStaining technique is paramount for detecting and assessing the severe degeneration that occurs in the elastic tissues of the skin 
and its arteries in response to prolonged exposure to actinic radiation. With a selective \"controlled\" hematoxylin-and-eosin stain, actinically damaged (\"elastotic\") elastic tissue stains blue, as Unna described, and contrasts with normal and simply hyperplastic elastic tissue, which stains red. \"Special\" elastic stains such as Orcein and Verhoeff do not demonstrate this difference. When resorptive (elastolytic) giant cell reactions develop in relation to actinically degenerate elastic tissue of the skin, the papules that arise tend to form expanding, annular rings. A previously used and appropriate name for these autoimmune lesions in the skin is actinic granuloma because this name highlights the likely actinic origin and pathogenesis of many such lesions. Granulomatous inflammation in connection with actinically degenerate internal elastic lamina appears to be the basis of temporal arteritis. Actinic granulomas may occur in the skin concurrently with temporal arteritis. A recent study of temporal arteritis strongly relates its elastic tissue changes to those of \"accelerated\" 
atherosclerosis."},"views":[{"metadata":{"contains":{"http://vocab.lappsgrid.org/Token#pos":{"producer":"edu.brandeis.cs.lappsgrid.stanford.corenlp.POSTagger:2.0.4","type":"tagger:stanford"}}},"annotations":[{"id":"tk_0_0","start":0,"end":11,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Actinically"}},{"id":"tk_1_1","start":12,"end":22,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"degenerate"}},{"id":"tk_2_2","start":23,"end":30,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_3_3","start":31,"end":37,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_4_4","start":38,"end":40,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"is"}},{"id":"tk_5_5","start":41,"end":44,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_6_6","start":45,"end":51,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"likely"}},{"id":"tk_7_7","start":52,"end":61,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"antigenic"}},{"id":"tk_8_8","start":62,"end":67,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"basis"}},{"id":"tk_9_9","start":68,"end":70,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_10_10","start":71,"end":78,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_11_11","start":79,"end":88,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"granuloma"}},{"id":"tk_12_12","start":89,"end":91,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_13_13","start":92,"end":95,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_14_14","start":96,"end":100,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}}
,{"id":"tk_15_15","start":101,"end":104,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_16_16","start":105,"end":107,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_17_17","start":108,"end":116,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_18_18","start":117,"end":126,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_19_19","start":126,"end":127,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_20_0","start":128,"end":136,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"Staining"}},{"id":"tk_21_1","start":137,"end":146,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"technique"}},{"id":"tk_22_2","start":147,"end":149,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"is"}},{"id":"tk_23_3","start":150,"end":159,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"paramount"}},{"id":"tk_24_4","start":160,"end":163,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"for"}},{"id":"tk_25_5","start":164,"end":173,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"detecting"}},{"id":"tk_26_6","start":174,"end":177,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_27_7","start":178,"end":187,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"assessing"}},{"id":"tk_28_8","start":188,"end":191,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_29_9","start":192,"end":198,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"severe"}},{"id":"tk_30_10","start":199,"end":211,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"degeneration"}},{"id":"tk_31_11","start":212,"end":216
,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WDT","word":"that"}},{"id":"tk_32_12","start":217,"end":223,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"occurs"}},{"id":"tk_33_13","start":224,"end":226,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_34_14","start":227,"end":230,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_35_15","start":231,"end":238,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_36_16","start":239,"end":246,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"tissues"}},{"id":"tk_37_17","start":247,"end":249,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_38_18","start":250,"end":253,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_39_19","start":254,"end":258,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_40_20","start":259,"end":262,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_41_21","start":263,"end":266,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"PRP$","word":"its"}},{"id":"tk_42_22","start":267,"end":275,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"arteries"}},{"id":"tk_43_23","start":276,"end":278,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_44_24","start":279,"end":287,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"response"}},{"id":"tk_45_25","start":288,"end":290,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_46_26","start":291,"end":300,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"prolonged"}},{"id":"tk_47_27","start":301,"end":309,"@type":"http://vocab.lappsgrid.org/Token","feat
ures":{"pos":"NN","word":"exposure"}},{"id":"tk_48_28","start":310,"end":312,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_49_29","start":313,"end":320,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_50_30","start":321,"end":330,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"radiation"}},{"id":"tk_51_31","start":330,"end":331,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_52_0","start":332,"end":336,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"With"}},{"id":"tk_53_1","start":337,"end":338,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"a"}},{"id":"tk_54_2","start":339,"end":348,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"selective"}},{"id":"tk_55_3","start":349,"end":350,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_56_4","start":350,"end":360,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"controlled"}},{"id":"tk_57_5","start":360,"end":361,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_58_6","start":362,"end":383,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"hematoxylin-and-eosin"}},{"id":"tk_59_7","start":384,"end":389,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"stain"}},{"id":"tk_60_8","start":389,"end":390,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_61_9","start":391,"end":402,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"actinically"}},{"id":"tk_62_10","start":403,"end":410,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"damaged"}},{"id":"tk_63_11","start":411,"end":412,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-LRB-","word":"-LRB-"}}
,{"id":"tk_64_12","start":412,"end":413,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_65_13","start":413,"end":422,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastotic"}},{"id":"tk_66_14","start":422,"end":423,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_67_15","start":423,"end":424,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-RRB-","word":"-RRB-"}},{"id":"tk_68_16","start":425,"end":432,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_69_17","start":433,"end":439,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_70_18","start":440,"end":446,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"stains"}},{"id":"tk_71_19","start":447,"end":451,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"blue"}},{"id":"tk_72_20","start":451,"end":452,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_73_21","start":453,"end":455,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"as"}},{"id":"tk_74_22","start":456,"end":460,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Unna"}},{"id":"tk_75_23","start":461,"end":470,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBD","word":"described"}},{"id":"tk_76_24","start":470,"end":471,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_77_25","start":472,"end":475,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_78_26","start":476,"end":485,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"contrasts"}},{"id":"tk_79_27","start":486,"end":490,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"with"}},{"id":"tk_80_28","start":491,"end":497,"@type":"
http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"normal"}},{"id":"tk_81_29","start":498,"end":501,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_82_30","start":502,"end":508,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"simply"}},{"id":"tk_83_31","start":509,"end":521,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"hyperplastic"}},{"id":"tk_84_32","start":522,"end":529,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_85_33","start":530,"end":536,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_86_34","start":536,"end":537,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_87_35","start":538,"end":543,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WDT","word":"which"}},{"id":"tk_88_36","start":544,"end":550,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"stains"}},{"id":"tk_89_37","start":551,"end":554,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"red"}},{"id":"tk_90_38","start":554,"end":555,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_91_0","start":556,"end":557,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_92_1","start":557,"end":564,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"Special"}},{"id":"tk_93_2","start":564,"end":565,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_94_3","start":566,"end":573,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_95_4","start":574,"end":580,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"stains"}},{"id":"tk_96_5","start":581,"end":585,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"
JJ","word":"such"}},{"id":"tk_97_6","start":586,"end":588,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"as"}},{"id":"tk_98_7","start":589,"end":595,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Orcein"}},{"id":"tk_99_8","start":596,"end":599,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_100_9","start":600,"end":608,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Verhoeff"}},{"id":"tk_101_10","start":609,"end":611,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"do"}},{"id":"tk_102_11","start":612,"end":615,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"not"}},{"id":"tk_103_12","start":616,"end":627,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"demonstrate"}},{"id":"tk_104_13","start":628,"end":632,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"this"}},{"id":"tk_105_14","start":633,"end":643,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"difference"}},{"id":"tk_106_15","start":643,"end":644,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_107_0","start":645,"end":649,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WRB","word":"When"}},{"id":"tk_108_1","start":650,"end":660,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"resorptive"}},{"id":"tk_109_2","start":661,"end":662,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-LRB-","word":"-LRB-"}},{"id":"tk_110_3","start":662,"end":673,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastolytic"}},{"id":"tk_111_4","start":673,"end":674,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-RRB-","word":"-RRB-"}},{"id":"tk_112_5","start":675,"end":680,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"giant"}},{"id":
"tk_113_6","start":681,"end":685,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"cell"}},{"id":"tk_114_7","start":686,"end":695,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"reactions"}},{"id":"tk_115_8","start":696,"end":703,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"develop"}},{"id":"tk_116_9","start":704,"end":706,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_117_10","start":707,"end":715,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"relation"}},{"id":"tk_118_11","start":716,"end":718,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_119_12","start":719,"end":730,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"actinically"}},{"id":"tk_120_13","start":731,"end":741,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"degenerate"}},{"id":"tk_121_14","start":742,"end":749,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_122_15","start":750,"end":756,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_123_16","start":757,"end":759,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_124_17","start":760,"end":763,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_125_18","start":764,"end":768,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_126_19","start":768,"end":769,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_127_20","start":770,"end":773,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_128_21","start":774,"end":781,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"papules"}},{"id":"tk_129_22","start":782,"end
":786,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WDT","word":"that"}},{"id":"tk_130_23","start":787,"end":792,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"arise"}},{"id":"tk_131_24","start":793,"end":797,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"tend"}},{"id":"tk_132_25","start":798,"end":800,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_133_26","start":801,"end":805,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"form"}},{"id":"tk_134_27","start":806,"end":815,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"expanding"}},{"id":"tk_135_28","start":815,"end":816,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_136_29","start":817,"end":824,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"annular"}},{"id":"tk_137_30","start":825,"end":830,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"rings"}},{"id":"tk_138_31","start":830,"end":831,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_139_0","start":832,"end":833,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"A"}},{"id":"tk_140_1","start":834,"end":844,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"previously"}},{"id":"tk_141_2","start":845,"end":849,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"used"}},{"id":"tk_142_3","start":850,"end":853,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_143_4","start":854,"end":865,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"appropriate"}},{"id":"tk_144_5","start":866,"end":870,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"name"}},{"id":"tk_145_6","start":871,"end":874,"@type":"http://vocab.lappsgrid.org
/Token","features":{"pos":"IN","word":"for"}},{"id":"tk_146_7","start":875,"end":880,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"these"}},{"id":"tk_147_8","start":881,"end":891,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"autoimmune"}},{"id":"tk_148_9","start":892,"end":899,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"lesions"}},{"id":"tk_149_10","start":900,"end":902,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_150_11","start":903,"end":906,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_151_12","start":907,"end":911,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_152_13","start":912,"end":914,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"is"}},{"id":"tk_153_14","start":915,"end":922,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_154_15","start":923,"end":932,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"granuloma"}},{"id":"tk_155_16","start":933,"end":940,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"because"}},{"id":"tk_156_17","start":941,"end":945,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"this"}},{"id":"tk_157_18","start":946,"end":950,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"name"}},{"id":"tk_158_19","start":951,"end":961,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"highlights"}},{"id":"tk_159_20","start":962,"end":965,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_160_21","start":966,"end":972,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"likely"}},{"id":"tk_161_22","start":973,"end":980,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":
"JJ","word":"actinic"}},{"id":"tk_162_23","start":981,"end":987,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"origin"}},{"id":"tk_163_24","start":988,"end":991,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_164_25","start":992,"end":1004,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"pathogenesis"}},{"id":"tk_165_26","start":1005,"end":1007,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_166_27","start":1008,"end":1012,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"many"}},{"id":"tk_167_28","start":1013,"end":1017,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"such"}},{"id":"tk_168_29","start":1018,"end":1025,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"lesions"}},{"id":"tk_169_30","start":1025,"end":1026,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_170_0","start":1027,"end":1040,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"Granulomatous"}},{"id":"tk_171_1","start":1041,"end":1053,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"inflammation"}},{"id":"tk_172_2","start":1054,"end":1056,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_173_3","start":1057,"end":1067,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"connection"}},{"id":"tk_174_4","start":1068,"end":1072,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"with"}},{"id":"tk_175_5","start":1073,"end":1084,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"actinically"}},{"id":"tk_176_6","start":1085,"end":1095,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"degenerate"}},{"id":"tk_177_7","start":1096,"end":1104,"@type":"http://vocab.lappsgrid.org/Token","fe
atures":{"pos":"JJ","word":"internal"}},{"id":"tk_178_8","start":1105,"end":1112,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_179_9","start":1113,"end":1119,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"lamina"}},{"id":"tk_180_10","start":1120,"end":1127,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"appears"}},{"id":"tk_181_11","start":1128,"end":1130,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_182_12","start":1131,"end":1133,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"be"}},{"id":"tk_183_13","start":1134,"end":1137,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_184_14","start":1138,"end":1143,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"basis"}},{"id":"tk_185_15","start":1144,"end":1146,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_186_16","start":1147,"end":1155,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_187_17","start":1156,"end":1165,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_188_18","start":1165,"end":1166,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_189_0","start":1167,"end":1174,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"Actinic"}},{"id":"tk_190_1","start":1175,"end":1185,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"granulomas"}},{"id":"tk_191_2","start":1186,"end":1189,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"MD","word":"may"}},{"id":"tk_192_3","start":1190,"end":1195,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"occur"}},{"id":"tk_193_4","start":1196,"end":1198,"@type":"http://vocab.lappsgrid.org/Token","fea
tures":{"pos":"IN","word":"in"}},{"id":"tk_194_5","start":1199,"end":1202,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_195_6","start":1203,"end":1207,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_196_7","start":1208,"end":1220,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"concurrently"}},{"id":"tk_197_8","start":1221,"end":1225,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"with"}},{"id":"tk_198_9","start":1226,"end":1234,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_199_10","start":1235,"end":1244,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_200_11","start":1244,"end":1245,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_201_0","start":1246,"end":1247,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"A"}},{"id":"tk_202_1","start":1248,"end":1254,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"recent"}},{"id":"tk_203_2","start":1255,"end":1260,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"study"}},{"id":"tk_204_3","start":1261,"end":1263,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_205_4","start":1264,"end":1272,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_206_5","start":1273,"end":1282,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_207_6","start":1283,"end":1291,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"strongly"}},{"id":"tk_208_7","start":1292,"end":1299,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"relates"}},{"id":"tk_209_8","start":1300,"end":1303,"@type":"http://vocab.lappsgrid.org/Token","features
":{"pos":"PRP$","word":"its"}},{"id":"tk_210_9","start":1304,"end":1311,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_211_10","start":1312,"end":1318,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_212_11","start":1319,"end":1326,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"changes"}},{"id":"tk_213_12","start":1327,"end":1329,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_214_13","start":1330,"end":1335,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"those"}},{"id":"tk_215_14","start":1336,"end":1338,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_216_15","start":1339,"end":1340,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_217_16","start":1340,"end":1351,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"accelerated"}},{"id":"tk_218_17","start":1351,"end":1352,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_219_18","start":1353,"end":1368,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"atherosclerosis"}},{"id":"tk_220_19","start":1368,"end":1369,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}}]}]} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif new file mode 100644 index 0000000000..744e069664 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif @@ -0,0 +1,1785 @@ +{ + "discriminator" : "http://vocab.lappsgrid.org/ns/media/jsonld#lapps", + "payload" : { + "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata" : { }, + "text" : { + "@value" : "Actinically degenerate elastic tissue is the 
likely antigenic basis of actinic granuloma of the skin and of temporal arteritis.\nStaining technique is paramount for detecting and assessing the severe degeneration that occurs in the elastic tissues of the skin and its arteries in response to prolonged exposure to actinic radiation. With a selective \"controlled\" hematoxylin-and-eosin stain, actinically damaged (\"elastotic\") elastic tissue stains blue, as Unna described, and contrasts with normal and simply hyperplastic elastic tissue, which stains red. \"Special\" elastic stains such as Orcein and Verhoeff do not demonstrate this difference. When resorptive (elastolytic) giant cell reactions develop in relation to actinically degenerate elastic tissue of the skin, the papules that arise tend to form expanding, annular rings. A previously used and appropriate name for these autoimmune lesions in the skin is actinic granuloma because this name highlights the likely actinic origin and pathogenesis of many such lesions. Granulomatous inflammation in connection with actinically degenerate internal elastic lamina appears to be the basis of temporal arteritis. Actinic granulomas may occur in the skin concurrently with temporal arteritis. 
A recent study of temporal arteritis strongly relates its elastic tissue changes to those of \"accelerated\" atherosclerosis.", + "@language" : "x-unspecified" + }, + "views" : [ { + "id" : "v1", + "metadata" : { }, + "annotations" : [ { + "id" : "tok-0", + "start" : 0, + "end" : 11, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-1", + "start" : 12, + "end" : 22, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-2", + "start" : 23, + "end" : 30, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-3", + "start" : 31, + "end" : 37, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-4", + "start" : 38, + "end" : 40, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-5", + "start" : 41, + "end" : 44, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-6", + "start" : 45, + "end" : 51, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-7", + "start" : 52, + "end" : 61, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-8", + "start" : 62, + "end" : 67, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-9", + "start" : 68, + "end" : 70, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-10", + "start" : 71, + "end" : 78, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-11", + "start" : 79, + "end" : 88, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-12", + "start" : 89, + "end" : 91, + "@type" : 
"http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-13", + "start" : 92, + "end" : 95, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-14", + "start" : 96, + "end" : 100, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-15", + "start" : 101, + "end" : 104, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-16", + "start" : 105, + "end" : 107, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-17", + "start" : 108, + "end" : 116, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-18", + "start" : 117, + "end" : 126, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-19", + "start" : 126, + "end" : 127, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." 
+ } + }, { + "id" : "tok-20", + "start" : 128, + "end" : 136, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-21", + "start" : 137, + "end" : 146, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-22", + "start" : 147, + "end" : 149, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-23", + "start" : 150, + "end" : 159, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-24", + "start" : 160, + "end" : 163, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-25", + "start" : 164, + "end" : 173, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-26", + "start" : 174, + "end" : 177, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-27", + "start" : 178, + "end" : 187, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-28", + "start" : 188, + "end" : 191, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-29", + "start" : 192, + "end" : 198, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-30", + "start" : 199, + "end" : 211, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-31", + "start" : 212, + "end" : 216, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WDT" + } + }, { + "id" : "tok-32", + "start" : 217, + "end" : 223, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-33", + "start" : 224, + "end" : 226, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + 
}, { + "id" : "tok-34", + "start" : 227, + "end" : 230, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-35", + "start" : 231, + "end" : 238, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-36", + "start" : 239, + "end" : 246, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-37", + "start" : 247, + "end" : 249, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-38", + "start" : 250, + "end" : 253, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-39", + "start" : 254, + "end" : 258, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-40", + "start" : 259, + "end" : 262, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-41", + "start" : 263, + "end" : 266, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "PRP$" + } + }, { + "id" : "tok-42", + "start" : 267, + "end" : 275, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-43", + "start" : 276, + "end" : 278, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-44", + "start" : 279, + "end" : 287, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-45", + "start" : 288, + "end" : 290, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-46", + "start" : 291, + "end" : 300, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-47", + "start" : 301, + "end" : 309, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + 
"id" : "tok-48", + "start" : 310, + "end" : 312, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-49", + "start" : 313, + "end" : 320, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-50", + "start" : 321, + "end" : 330, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-51", + "start" : 330, + "end" : 331, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-52", + "start" : 332, + "end" : 336, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-53", + "start" : 337, + "end" : 338, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-54", + "start" : 339, + "end" : 348, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-55", + "start" : 349, + "end" : 350, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-56", + "start" : 350, + "end" : 360, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-57", + "start" : 360, + "end" : 361, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-58", + "start" : 362, + "end" : 383, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-59", + "start" : 384, + "end" : 389, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-60", + "start" : 389, + "end" : 390, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-61", + "start" : 391, + "end" : 402, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : 
"tok-62", + "start" : 403, + "end" : 410, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-63", + "start" : 411, + "end" : 412, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-LRB-" + } + }, { + "id" : "tok-64", + "start" : 412, + "end" : 413, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-65", + "start" : 413, + "end" : 422, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-66", + "start" : 422, + "end" : 423, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-67", + "start" : 423, + "end" : 424, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-RRB-" + } + }, { + "id" : "tok-68", + "start" : 425, + "end" : 432, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-69", + "start" : 433, + "end" : 439, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-70", + "start" : 440, + "end" : 446, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-71", + "start" : 447, + "end" : 451, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-72", + "start" : 451, + "end" : 452, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-73", + "start" : 453, + "end" : 455, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-74", + "start" : 456, + "end" : 460, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-75", + "start" : 461, + "end" : 470, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBD" + } + }, { + "id" : 
"tok-76", + "start" : 470, + "end" : 471, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-77", + "start" : 472, + "end" : 475, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-78", + "start" : 476, + "end" : 485, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-79", + "start" : 486, + "end" : 490, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-80", + "start" : 491, + "end" : 497, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-81", + "start" : 498, + "end" : 501, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-82", + "start" : 502, + "end" : 508, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-83", + "start" : 509, + "end" : 521, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-84", + "start" : 522, + "end" : 529, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-85", + "start" : 530, + "end" : 536, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-86", + "start" : 536, + "end" : 537, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-87", + "start" : 538, + "end" : 543, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WDT" + } + }, { + "id" : "tok-88", + "start" : 544, + "end" : 550, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-89", + "start" : 551, + "end" : 554, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-90", 
+ "start" : 554, + "end" : 555, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-91", + "start" : 556, + "end" : 557, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-92", + "start" : 557, + "end" : 564, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-93", + "start" : 564, + "end" : 565, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-94", + "start" : 566, + "end" : 573, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-95", + "start" : 574, + "end" : 580, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-96", + "start" : 581, + "end" : 585, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-97", + "start" : 586, + "end" : 588, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-98", + "start" : 589, + "end" : 595, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-99", + "start" : 596, + "end" : 599, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-100", + "start" : 600, + "end" : 608, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-101", + "start" : 609, + "end" : 611, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-102", + "start" : 612, + "end" : 615, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-103", + "start" : 616, + "end" : 627, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-104", + 
"start" : 628, + "end" : 632, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-105", + "start" : 633, + "end" : 643, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-106", + "start" : 643, + "end" : 644, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-107", + "start" : 645, + "end" : 649, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WRB" + } + }, { + "id" : "tok-108", + "start" : 650, + "end" : 660, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-109", + "start" : 661, + "end" : 662, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-LRB-" + } + }, { + "id" : "tok-110", + "start" : 662, + "end" : 673, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-111", + "start" : 673, + "end" : 674, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-RRB-" + } + }, { + "id" : "tok-112", + "start" : 675, + "end" : 680, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-113", + "start" : 681, + "end" : 685, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-114", + "start" : 686, + "end" : 695, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-115", + "start" : 696, + "end" : 703, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-116", + "start" : 704, + "end" : 706, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-117", + "start" : 707, + "end" : 715, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : 
"tok-118", + "start" : 716, + "end" : 718, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-119", + "start" : 719, + "end" : 730, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-120", + "start" : 731, + "end" : 741, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-121", + "start" : 742, + "end" : 749, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-122", + "start" : 750, + "end" : 756, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-123", + "start" : 757, + "end" : 759, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-124", + "start" : 760, + "end" : 763, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-125", + "start" : 764, + "end" : 768, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-126", + "start" : 768, + "end" : 769, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-127", + "start" : 770, + "end" : 773, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-128", + "start" : 774, + "end" : 781, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-129", + "start" : 782, + "end" : 786, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WDT" + } + }, { + "id" : "tok-130", + "start" : 787, + "end" : 792, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-131", + "start" : 793, + "end" : 797, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + 
"id" : "tok-132", + "start" : 798, + "end" : 800, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-133", + "start" : 801, + "end" : 805, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-134", + "start" : 806, + "end" : 815, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-135", + "start" : 815, + "end" : 816, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-136", + "start" : 817, + "end" : 824, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-137", + "start" : 825, + "end" : 830, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-138", + "start" : 830, + "end" : 831, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-139", + "start" : 832, + "end" : 833, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-140", + "start" : 834, + "end" : 844, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-141", + "start" : 845, + "end" : 849, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-142", + "start" : 850, + "end" : 853, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-143", + "start" : 854, + "end" : 865, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-144", + "start" : 866, + "end" : 870, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-145", + "start" : 871, + "end" : 874, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, 
{ + "id" : "tok-146", + "start" : 875, + "end" : 880, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-147", + "start" : 881, + "end" : 891, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-148", + "start" : 892, + "end" : 899, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-149", + "start" : 900, + "end" : 902, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-150", + "start" : 903, + "end" : 906, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-151", + "start" : 907, + "end" : 911, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-152", + "start" : 912, + "end" : 914, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-153", + "start" : 915, + "end" : 922, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-154", + "start" : 923, + "end" : 932, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-155", + "start" : 933, + "end" : 940, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-156", + "start" : 941, + "end" : 945, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-157", + "start" : 946, + "end" : 950, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-158", + "start" : 951, + "end" : 961, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-159", + "start" : 962, + "end" : 965, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } 
+ }, { + "id" : "tok-160", + "start" : 966, + "end" : 972, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-161", + "start" : 973, + "end" : 980, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-162", + "start" : 981, + "end" : 987, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-163", + "start" : 988, + "end" : 991, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-164", + "start" : 992, + "end" : 1004, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-165", + "start" : 1005, + "end" : 1007, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-166", + "start" : 1008, + "end" : 1012, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-167", + "start" : 1013, + "end" : 1017, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-168", + "start" : 1018, + "end" : 1025, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-169", + "start" : 1025, + "end" : 1026, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." 
+ } + }, { + "id" : "tok-170", + "start" : 1027, + "end" : 1040, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-171", + "start" : 1041, + "end" : 1053, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-172", + "start" : 1054, + "end" : 1056, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-173", + "start" : 1057, + "end" : 1067, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-174", + "start" : 1068, + "end" : 1072, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-175", + "start" : 1073, + "end" : 1084, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-176", + "start" : 1085, + "end" : 1095, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-177", + "start" : 1096, + "end" : 1104, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-178", + "start" : 1105, + "end" : 1112, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-179", + "start" : 1113, + "end" : 1119, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-180", + "start" : 1120, + "end" : 1127, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-181", + "start" : 1128, + "end" : 1130, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-182", + "start" : 1131, + "end" : 1133, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-183", + "start" : 1134, + "end" : 1137, + "@type" : "http://vocab.lappsgrid.org/Token", 
+ "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-184", + "start" : 1138, + "end" : 1143, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-185", + "start" : 1144, + "end" : 1146, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-186", + "start" : 1147, + "end" : 1155, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-187", + "start" : 1156, + "end" : 1165, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-188", + "start" : 1165, + "end" : 1166, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-189", + "start" : 1167, + "end" : 1174, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-190", + "start" : 1175, + "end" : 1185, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-191", + "start" : 1186, + "end" : 1189, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "MD" + } + }, { + "id" : "tok-192", + "start" : 1190, + "end" : 1195, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-193", + "start" : 1196, + "end" : 1198, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-194", + "start" : 1199, + "end" : 1202, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-195", + "start" : 1203, + "end" : 1207, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-196", + "start" : 1208, + "end" : 1220, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-197", + "start" : 1221, + "end" : 1225, + "@type" : 
"http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-198", + "start" : 1226, + "end" : 1234, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-199", + "start" : 1235, + "end" : 1244, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-200", + "start" : 1244, + "end" : 1245, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-201", + "start" : 1246, + "end" : 1247, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-202", + "start" : 1248, + "end" : 1254, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-203", + "start" : 1255, + "end" : 1260, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-204", + "start" : 1261, + "end" : 1263, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-205", + "start" : 1264, + "end" : 1272, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-206", + "start" : 1273, + "end" : 1282, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-207", + "start" : 1283, + "end" : 1291, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-208", + "start" : 1292, + "end" : 1299, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-209", + "start" : 1300, + "end" : 1303, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "PRP$" + } + }, { + "id" : "tok-210", + "start" : 1304, + "end" : 1311, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-211", + "start" : 
1312, + "end" : 1318, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-212", + "start" : 1319, + "end" : 1326, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-213", + "start" : 1327, + "end" : 1329, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-214", + "start" : 1330, + "end" : 1335, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-215", + "start" : 1336, + "end" : 1338, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-216", + "start" : 1339, + "end" : 1340, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-217", + "start" : 1340, + "end" : 1351, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-218", + "start" : 1351, + "end" : 1352, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-219", + "start" : 1353, + "end" : 1368, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-220", + "start" : 1368, + "end" : 1369, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." 
+ } + } ] + } ] + }, + "parameters" : { } +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif new file mode 100644 index 0000000000..438d91cd9a --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif @@ -0,0 +1,2237 @@ +{ + "discriminator": "http://vocab.lappsgrid.org/ns/media/jsonld#lif", + "payload": { + "@context": "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata": { + "sourceid": "10025748", + "sourcedb": "PubMed" + }, + "text": { + "@value": "Actinically degenerate elastic tissue is the likely antigenic basis of actinic granuloma of the skin and of temporal arteritis.\nStaining technique is paramount for detecting and assessing the severe degeneration that occurs in the elastic tissues of the skin and its arteries in response to prolonged exposure to actinic radiation. With a selective \"controlled\" hematoxylin-and-eosin stain, actinically damaged (\"elastotic\") elastic tissue stains blue, as Unna described, and contrasts with normal and simply hyperplastic elastic tissue, which stains red. \"Special\" elastic stains such as Orcein and Verhoeff do not demonstrate this difference. When resorptive (elastolytic) giant cell reactions develop in relation to actinically degenerate elastic tissue of the skin, the papules that arise tend to form expanding, annular rings. A previously used and appropriate name for these autoimmune lesions in the skin is actinic granuloma because this name highlights the likely actinic origin and pathogenesis of many such lesions. Granulomatous inflammation in connection with actinically degenerate internal elastic lamina appears to be the basis of temporal arteritis. Actinic granulomas may occur in the skin concurrently with temporal arteritis. A recent study of temporal arteritis strongly relates its elastic tissue changes to those of \"accelerated\" atherosclerosis." 
+ }, + "views": [ + { + "metadata": { + "contains": { + "http://vocab.lappsgrid.org/Token#pos": { + "producer": "edu.brandeis.cs.lappsgrid.stanford.corenlp.POSTagger:2.0.4", + "type": "tagger:stanford" + } + } + }, + "annotations": [ + { + "id": "tk_0_0", + "start": 0, + "end": 11, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Actinically" + } + }, + { + "id": "tk_1_1", + "start": 12, + "end": 22, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "degenerate" + } + }, + { + "id": "tk_2_2", + "start": 23, + "end": 30, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_3_3", + "start": 31, + "end": 37, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_4_4", + "start": 38, + "end": 40, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "is" + } + }, + { + "id": "tk_5_5", + "start": 41, + "end": 44, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_6_6", + "start": 45, + "end": 51, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "likely" + } + }, + { + "id": "tk_7_7", + "start": 52, + "end": 61, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "antigenic" + } + }, + { + "id": "tk_8_8", + "start": 62, + "end": 67, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "basis" + } + }, + { + "id": "tk_9_9", + "start": 68, + "end": 70, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_10_10", + "start": 71, + "end": 78, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_11_11", + "start": 79, + 
"end": 88, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "granuloma" + } + }, + { + "id": "tk_12_12", + "start": 89, + "end": 91, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_13_13", + "start": 92, + "end": 95, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_14_14", + "start": 96, + "end": 100, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_15_15", + "start": 101, + "end": 104, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_16_16", + "start": 105, + "end": 107, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_17_17", + "start": 108, + "end": 116, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_18_18", + "start": 117, + "end": 126, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_19_19", + "start": 126, + "end": 127, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." 
+ } + }, + { + "id": "tk_20_0", + "start": 128, + "end": 136, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "Staining" + } + }, + { + "id": "tk_21_1", + "start": 137, + "end": 146, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "technique" + } + }, + { + "id": "tk_22_2", + "start": 147, + "end": 149, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "is" + } + }, + { + "id": "tk_23_3", + "start": 150, + "end": 159, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "paramount" + } + }, + { + "id": "tk_24_4", + "start": 160, + "end": 163, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "for" + } + }, + { + "id": "tk_25_5", + "start": 164, + "end": 173, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "detecting" + } + }, + { + "id": "tk_26_6", + "start": 174, + "end": 177, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_27_7", + "start": 178, + "end": 187, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "assessing" + } + }, + { + "id": "tk_28_8", + "start": 188, + "end": 191, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_29_9", + "start": 192, + "end": 198, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "severe" + } + }, + { + "id": "tk_30_10", + "start": 199, + "end": 211, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "degeneration" + } + }, + { + "id": "tk_31_11", + "start": 212, + "end": 216, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WDT", + "word": "that" + } + }, + { + "id": "tk_32_12", + "start": 217, + "end": 223, + "@type": 
"http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "occurs" + } + }, + { + "id": "tk_33_13", + "start": 224, + "end": 226, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_34_14", + "start": 227, + "end": 230, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_35_15", + "start": 231, + "end": 238, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_36_16", + "start": 239, + "end": 246, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "tissues" + } + }, + { + "id": "tk_37_17", + "start": 247, + "end": 249, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_38_18", + "start": 250, + "end": 253, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_39_19", + "start": 254, + "end": 258, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_40_20", + "start": 259, + "end": 262, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_41_21", + "start": 263, + "end": 266, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "PRP$", + "word": "its" + } + }, + { + "id": "tk_42_22", + "start": 267, + "end": 275, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "arteries" + } + }, + { + "id": "tk_43_23", + "start": 276, + "end": 278, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_44_24", + "start": 279, + "end": 287, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "response" + } + }, 
+ { + "id": "tk_45_25", + "start": 288, + "end": 290, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_46_26", + "start": 291, + "end": 300, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "prolonged" + } + }, + { + "id": "tk_47_27", + "start": 301, + "end": 309, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "exposure" + } + }, + { + "id": "tk_48_28", + "start": 310, + "end": 312, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_49_29", + "start": 313, + "end": 320, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_50_30", + "start": 321, + "end": 330, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "radiation" + } + }, + { + "id": "tk_51_31", + "start": 330, + "end": 331, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." 
+ } + }, + { + "id": "tk_52_0", + "start": 332, + "end": 336, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "With" + } + }, + { + "id": "tk_53_1", + "start": 337, + "end": 338, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "a" + } + }, + { + "id": "tk_54_2", + "start": 339, + "end": 348, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "selective" + } + }, + { + "id": "tk_55_3", + "start": 349, + "end": 350, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_56_4", + "start": 350, + "end": 360, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "controlled" + } + }, + { + "id": "tk_57_5", + "start": 360, + "end": 361, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_58_6", + "start": 362, + "end": 383, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "hematoxylin-and-eosin" + } + }, + { + "id": "tk_59_7", + "start": 384, + "end": 389, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "stain" + } + }, + { + "id": "tk_60_8", + "start": 389, + "end": 390, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_61_9", + "start": 391, + "end": 402, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "actinically" + } + }, + { + "id": "tk_62_10", + "start": 403, + "end": 410, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "damaged" + } + }, + { + "id": "tk_63_11", + "start": 411, + "end": 412, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-LRB-", + "word": "-LRB-" + } + }, + { + "id": "tk_64_12", + "start": 412, + "end": 413, + "@type": 
"http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_65_13", + "start": 413, + "end": 422, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastotic" + } + }, + { + "id": "tk_66_14", + "start": 422, + "end": 423, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_67_15", + "start": 423, + "end": 424, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-RRB-", + "word": "-RRB-" + } + }, + { + "id": "tk_68_16", + "start": 425, + "end": 432, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_69_17", + "start": 433, + "end": 439, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_70_18", + "start": 440, + "end": 446, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "stains" + } + }, + { + "id": "tk_71_19", + "start": 447, + "end": 451, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "blue" + } + }, + { + "id": "tk_72_20", + "start": 451, + "end": 452, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_73_21", + "start": 453, + "end": 455, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "as" + } + }, + { + "id": "tk_74_22", + "start": 456, + "end": 460, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Unna" + } + }, + { + "id": "tk_75_23", + "start": 461, + "end": 470, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBD", + "word": "described" + } + }, + { + "id": "tk_76_24", + "start": 470, + "end": 471, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, 
+ { + "id": "tk_77_25", + "start": 472, + "end": 475, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_78_26", + "start": 476, + "end": 485, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "contrasts" + } + }, + { + "id": "tk_79_27", + "start": 486, + "end": 490, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "with" + } + }, + { + "id": "tk_80_28", + "start": 491, + "end": 497, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "normal" + } + }, + { + "id": "tk_81_29", + "start": 498, + "end": 501, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_82_30", + "start": 502, + "end": 508, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "simply" + } + }, + { + "id": "tk_83_31", + "start": 509, + "end": 521, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "hyperplastic" + } + }, + { + "id": "tk_84_32", + "start": 522, + "end": 529, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_85_33", + "start": 530, + "end": 536, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_86_34", + "start": 536, + "end": 537, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_87_35", + "start": 538, + "end": 543, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WDT", + "word": "which" + } + }, + { + "id": "tk_88_36", + "start": 544, + "end": 550, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "stains" + } + }, + { + "id": "tk_89_37", + "start": 551, + "end": 554, + "@type": 
"http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "red" + } + }, + { + "id": "tk_90_38", + "start": 554, + "end": 555, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_91_0", + "start": 556, + "end": 557, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_92_1", + "start": 557, + "end": 564, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "Special" + } + }, + { + "id": "tk_93_2", + "start": 564, + "end": 565, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_94_3", + "start": 566, + "end": 573, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_95_4", + "start": 574, + "end": 580, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "stains" + } + }, + { + "id": "tk_96_5", + "start": 581, + "end": 585, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "such" + } + }, + { + "id": "tk_97_6", + "start": 586, + "end": 588, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "as" + } + }, + { + "id": "tk_98_7", + "start": 589, + "end": 595, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Orcein" + } + }, + { + "id": "tk_99_8", + "start": 596, + "end": 599, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_100_9", + "start": 600, + "end": 608, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Verhoeff" + } + }, + { + "id": "tk_101_10", + "start": 609, + "end": 611, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "do" + } + }, + { + "id": 
"tk_102_11", + "start": 612, + "end": 615, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "not" + } + }, + { + "id": "tk_103_12", + "start": 616, + "end": 627, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "demonstrate" + } + }, + { + "id": "tk_104_13", + "start": 628, + "end": 632, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "this" + } + }, + { + "id": "tk_105_14", + "start": 633, + "end": 643, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "difference" + } + }, + { + "id": "tk_106_15", + "start": 643, + "end": 644, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_107_0", + "start": 645, + "end": 649, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WRB", + "word": "When" + } + }, + { + "id": "tk_108_1", + "start": 650, + "end": 660, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "resorptive" + } + }, + { + "id": "tk_109_2", + "start": 661, + "end": 662, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-LRB-", + "word": "-LRB-" + } + }, + { + "id": "tk_110_3", + "start": 662, + "end": 673, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastolytic" + } + }, + { + "id": "tk_111_4", + "start": 673, + "end": 674, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-RRB-", + "word": "-RRB-" + } + }, + { + "id": "tk_112_5", + "start": 675, + "end": 680, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "giant" + } + }, + { + "id": "tk_113_6", + "start": 681, + "end": 685, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "cell" + } + }, + { + "id": "tk_114_7", + "start": 686, + "end": 695, + "@type": 
"http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "reactions" + } + }, + { + "id": "tk_115_8", + "start": 696, + "end": 703, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "develop" + } + }, + { + "id": "tk_116_9", + "start": 704, + "end": 706, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_117_10", + "start": 707, + "end": 715, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "relation" + } + }, + { + "id": "tk_118_11", + "start": 716, + "end": 718, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_119_12", + "start": 719, + "end": 730, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "actinically" + } + }, + { + "id": "tk_120_13", + "start": 731, + "end": 741, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "degenerate" + } + }, + { + "id": "tk_121_14", + "start": 742, + "end": 749, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_122_15", + "start": 750, + "end": 756, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_123_16", + "start": 757, + "end": 759, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_124_17", + "start": 760, + "end": 763, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_125_18", + "start": 764, + "end": 768, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_126_19", + "start": 768, + "end": 769, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + 
"word": "," + } + }, + { + "id": "tk_127_20", + "start": 770, + "end": 773, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_128_21", + "start": 774, + "end": 781, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "papules" + } + }, + { + "id": "tk_129_22", + "start": 782, + "end": 786, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WDT", + "word": "that" + } + }, + { + "id": "tk_130_23", + "start": 787, + "end": 792, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "arise" + } + }, + { + "id": "tk_131_24", + "start": 793, + "end": 797, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "tend" + } + }, + { + "id": "tk_132_25", + "start": 798, + "end": 800, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_133_26", + "start": 801, + "end": 805, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "form" + } + }, + { + "id": "tk_134_27", + "start": 806, + "end": 815, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "expanding" + } + }, + { + "id": "tk_135_28", + "start": 815, + "end": 816, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_136_29", + "start": 817, + "end": 824, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "annular" + } + }, + { + "id": "tk_137_30", + "start": 825, + "end": 830, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "rings" + } + }, + { + "id": "tk_138_31", + "start": 830, + "end": 831, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." 
+ } + }, + { + "id": "tk_139_0", + "start": 832, + "end": 833, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "A" + } + }, + { + "id": "tk_140_1", + "start": 834, + "end": 844, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "previously" + } + }, + { + "id": "tk_141_2", + "start": 845, + "end": 849, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "used" + } + }, + { + "id": "tk_142_3", + "start": 850, + "end": 853, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_143_4", + "start": 854, + "end": 865, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "appropriate" + } + }, + { + "id": "tk_144_5", + "start": 866, + "end": 870, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "name" + } + }, + { + "id": "tk_145_6", + "start": 871, + "end": 874, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "for" + } + }, + { + "id": "tk_146_7", + "start": 875, + "end": 880, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "these" + } + }, + { + "id": "tk_147_8", + "start": 881, + "end": 891, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "autoimmune" + } + }, + { + "id": "tk_148_9", + "start": 892, + "end": 899, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "lesions" + } + }, + { + "id": "tk_149_10", + "start": 900, + "end": 902, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_150_11", + "start": 903, + "end": 906, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_151_12", + "start": 907, + "end": 911, + "@type": 
"http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_152_13", + "start": 912, + "end": 914, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "is" + } + }, + { + "id": "tk_153_14", + "start": 915, + "end": 922, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_154_15", + "start": 923, + "end": 932, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "granuloma" + } + }, + { + "id": "tk_155_16", + "start": 933, + "end": 940, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "because" + } + }, + { + "id": "tk_156_17", + "start": 941, + "end": 945, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "this" + } + }, + { + "id": "tk_157_18", + "start": 946, + "end": 950, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "name" + } + }, + { + "id": "tk_158_19", + "start": 951, + "end": 961, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "highlights" + } + }, + { + "id": "tk_159_20", + "start": 962, + "end": 965, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_160_21", + "start": 966, + "end": 972, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "likely" + } + }, + { + "id": "tk_161_22", + "start": 973, + "end": 980, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_162_23", + "start": 981, + "end": 987, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "origin" + } + }, + { + "id": "tk_163_24", + "start": 988, + "end": 991, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + 
"word": "and" + } + }, + { + "id": "tk_164_25", + "start": 992, + "end": 1004, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "pathogenesis" + } + }, + { + "id": "tk_165_26", + "start": 1005, + "end": 1007, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_166_27", + "start": 1008, + "end": 1012, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "many" + } + }, + { + "id": "tk_167_28", + "start": 1013, + "end": 1017, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "such" + } + }, + { + "id": "tk_168_29", + "start": 1018, + "end": 1025, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "lesions" + } + }, + { + "id": "tk_169_30", + "start": 1025, + "end": 1026, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_170_0", + "start": 1027, + "end": 1040, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "Granulomatous" + } + }, + { + "id": "tk_171_1", + "start": 1041, + "end": 1053, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "inflammation" + } + }, + { + "id": "tk_172_2", + "start": 1054, + "end": 1056, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_173_3", + "start": 1057, + "end": 1067, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "connection" + } + }, + { + "id": "tk_174_4", + "start": 1068, + "end": 1072, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "with" + } + }, + { + "id": "tk_175_5", + "start": 1073, + "end": 1084, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "actinically" + } + }, + { + "id": 
"tk_176_6", + "start": 1085, + "end": 1095, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "degenerate" + } + }, + { + "id": "tk_177_7", + "start": 1096, + "end": 1104, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "internal" + } + }, + { + "id": "tk_178_8", + "start": 1105, + "end": 1112, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_179_9", + "start": 1113, + "end": 1119, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "lamina" + } + }, + { + "id": "tk_180_10", + "start": 1120, + "end": 1127, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "appears" + } + }, + { + "id": "tk_181_11", + "start": 1128, + "end": 1130, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_182_12", + "start": 1131, + "end": 1133, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "be" + } + }, + { + "id": "tk_183_13", + "start": 1134, + "end": 1137, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_184_14", + "start": 1138, + "end": 1143, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "basis" + } + }, + { + "id": "tk_185_15", + "start": 1144, + "end": 1146, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_186_16", + "start": 1147, + "end": 1155, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_187_17", + "start": 1156, + "end": 1165, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_188_18", + "start": 1165, + "end": 1166, + 
"@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_189_0", + "start": 1167, + "end": 1174, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "Actinic" + } + }, + { + "id": "tk_190_1", + "start": 1175, + "end": 1185, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "granulomas" + } + }, + { + "id": "tk_191_2", + "start": 1186, + "end": 1189, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "MD", + "word": "may" + } + }, + { + "id": "tk_192_3", + "start": 1190, + "end": 1195, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "occur" + } + }, + { + "id": "tk_193_4", + "start": 1196, + "end": 1198, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_194_5", + "start": 1199, + "end": 1202, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_195_6", + "start": 1203, + "end": 1207, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_196_7", + "start": 1208, + "end": 1220, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "concurrently" + } + }, + { + "id": "tk_197_8", + "start": 1221, + "end": 1225, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "with" + } + }, + { + "id": "tk_198_9", + "start": 1226, + "end": 1234, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_199_10", + "start": 1235, + "end": 1244, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_200_11", + "start": 1244, + "end": 1245, + "@type": "http://vocab.lappsgrid.org/Token", + 
"features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_201_0", + "start": 1246, + "end": 1247, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "A" + } + }, + { + "id": "tk_202_1", + "start": 1248, + "end": 1254, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "recent" + } + }, + { + "id": "tk_203_2", + "start": 1255, + "end": 1260, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "study" + } + }, + { + "id": "tk_204_3", + "start": 1261, + "end": 1263, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_205_4", + "start": 1264, + "end": 1272, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_206_5", + "start": 1273, + "end": 1282, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_207_6", + "start": 1283, + "end": 1291, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "strongly" + } + }, + { + "id": "tk_208_7", + "start": 1292, + "end": 1299, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "relates" + } + }, + { + "id": "tk_209_8", + "start": 1300, + "end": 1303, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "PRP$", + "word": "its" + } + }, + { + "id": "tk_210_9", + "start": 1304, + "end": 1311, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_211_10", + "start": 1312, + "end": 1318, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_212_11", + "start": 1319, + "end": 1326, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "changes" + } + }, + 
{ + "id": "tk_213_12", + "start": 1327, + "end": 1329, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_214_13", + "start": 1330, + "end": 1335, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "those" + } + }, + { + "id": "tk_215_14", + "start": 1336, + "end": 1338, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_216_15", + "start": 1339, + "end": 1340, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_217_16", + "start": 1340, + "end": 1351, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "accelerated" + } + }, + { + "id": "tk_218_17", + "start": 1351, + "end": 1352, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_219_18", + "start": 1353, + "end": 1368, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "atherosclerosis" + } + }, + { + "id": "tk_220_19", + "start": 1368, + "end": 1369, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." 
+ } + } + ] + } + ] + } +} \ No newline at end of file From e8a992f2ac3f11d810ce18575300059c49c9db76 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 18 Feb 2019 06:27:06 +0100 Subject: [PATCH 07/18] #1329 - Span annotations with slot features may disappear from WebAnno TSV - Add a placeholder in the TARGET column for slot features if there are no slots at all - Updated existing unit tests with the placeholder - Added an additional unit test - Did not update the TSV version since the change is not necessarily incompatible --- .../tsv3x/Tsv3XCasDocumentBuilder.java | 14 ++++---- .../tsv/internal/tsv3x/Tsv3XDeserializer.java | 5 +-- .../tsv/internal/tsv3x/Tsv3XSerializer.java | 4 ++- .../tsv/WebAnnoTsv3WriterTestBase.java | 35 +++++++++++++++++-- .../resources/desc/type/webannoTestTypes.xml | 31 ++++++++++++++++ .../sampleSlotAnnotation2/reference.tsv | 2 +- .../reference.tsv | 2 +- .../testUnsetSlotFeature/reference.tsv | 11 ++++++ .../testUnsetSlotFeature/reference.xmi | 15 ++++++++ 9 files changed, 106 insertions(+), 13 deletions(-) create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java index 48111a9960..ccb8295974 100644 --- a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java @@ -300,13 +300,15 @@ private static void scanUnitForAmbiguousSlotReferences(TsvUnit aUnit) for (AnnotationFS aFS : annotationsForColumn) { FeatureStructure[] links = getFeature(aFS, 
col.uimaFeature, FeatureStructure[].class); - for (FeatureStructure link : links) { - AnnotationFS targetFS = getFeature(link, TsvSchema.FEAT_SLOT_TARGET, - AnnotationFS.class); - if (targetFS == null) { - throw new IllegalStateException("Slot link has no target: " + link); + if (links != null) { + for (FeatureStructure link : links) { + AnnotationFS targetFS = getFeature(link, TsvSchema.FEAT_SLOT_TARGET, + AnnotationFS.class); + if (targetFS == null) { + throw new IllegalStateException("Slot link has no target: " + link); + } + aUnit.getDocument().addDisambiguationId(targetFS); } - aUnit.getDocument().addDisambiguationId(targetFS); } } } diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java index 5326ed63f5..2d2d13c4cb 100644 --- a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java @@ -833,12 +833,13 @@ private void setFeatures(TsvColumn aCol, TsvUnit aUnit, AnnotationFS aAnnotation FeatureStructure[] links = getFeature(aAnnotation, aCol.uimaFeature.getShortName(), FeatureStructure[].class); - assert values.length == links.length; + assert (links.length == 0 && values.length == 1 && NULL_VALUE.equals(values[0])) + || (values.length == links.length); for (int i = 0; i < values.length; i++) { String value = values[i]; - if (NULL_COLUMN.equals(value)) { + if (NULL_VALUE.equals(value) || NULL_COLUMN.equals(value)) { continue; } diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java index 1455d69052..5db5cb89c1 100644 --- 
a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java @@ -361,7 +361,9 @@ private static void writeSlotTarget(PrintWriter aOut, TsvDocument aDoc, TsvColum } } else { - aOut.print(NULL_COLUMN); + // If the slot hosts has no slots, we use this column as a placeholder so we know + // the span of the slot host + aOut.print(NULL_VALUE); } } diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java index dd4af76f9d..a6364e316d 100644 --- a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java @@ -1120,6 +1120,35 @@ WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); } + @Test + public void testUnsetSlotFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + makeLinkHostFS(jcas, "webanno.custom.FlexLinkHost", t1.getBegin(), t1.getEnd(), + (FeatureStructure[]) null); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.FlexLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, 
asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.FlexLinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + @Test public void testSimpleSlotFeatureWithoutValues() throws Exception { @@ -1752,8 +1781,10 @@ private static AnnotationFS makeLinkHostFS(JCas aJCas, String aType, int aBegin, { Type hostType = aJCas.getTypeSystem().getType(aType); AnnotationFS hostA1 = aJCas.getCas().createAnnotation(hostType, aBegin, aEnd); - hostA1.setFeatureValue(hostType.getFeatureByBaseName("links"), - FSCollectionFactory.createFSArray(aJCas, asList(aLinks))); + if (aLinks != null) { + hostA1.setFeatureValue(hostType.getFeatureByBaseName("links"), + FSCollectionFactory.createFSArray(aJCas, asList(aLinks))); + } aJCas.getCas().addFsToIndexes(hostA1); return hostA1; } diff --git a/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml b/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml index 395315fa51..ea78fb510c 100644 --- a/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml +++ b/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml @@ -124,6 +124,37 @@ + + webanno.custom.FlexLinkHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.FlexLinkType + false + + + + + webanno.custom.FlexLinkType + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + uima.tcas.Annotation + + + webanno.custom.SimpleChain diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv index 94ba4bf9a7..904eeac4a9 100644 --- a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv @@ -3,7 +3,7 
@@ #Text=This is a test . -1-1 0-4 This _[1]|pr1[2] _|1-1[1] +1-1 0-4 This _[1]|pr1[2] *|1-1[1] 1-2 5-7 is _ _ 1-3 8-9 a _ _ 1-4 10-14 test _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv index ecac5c0f84..475132155d 100644 --- a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv @@ -4,7 +4,7 @@ #Text=This is a test . -1-1 0-4 This _ _[1]|_[2] _|_ val1[1]|val2[2] +1-1 0-4 This _ _[1]|_[2] *|* val1[1]|val2[2] 1-2 5-7 is * _ _ _ 1-3 8-9 a * _ _ _ 1-4 10-14 test _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv new file mode 100644 index 0000000000..31ca959d59 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.FlexLinkHost|ROLE_webanno.custom.FlexLinkHost:links_webanno.custom.FlexLinkType|uima.tcas.Annotation + + +#Text=This is a test . +1-1 0-4 This _ _ * +1-2 5-7 is * _ _ +1-3 8-9 a * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . 
_ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi new file mode 100644 index 0000000000..2bccf68653 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + From ef09c065477db21971f3f0a4ceb9967c20022cf6 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 18 Feb 2019 06:41:19 +0100 Subject: [PATCH 08/18] #1329 - Span annotations with slot features may disappear from WebAnno TSV - Added component metadata --- .../ukp/dkpro/core/api/parameter/MimeTypes.java | 5 +++-- .../dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java | 5 +++++ .../dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java | 11 +++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java b/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java index 2f91d7d278..9266d539b4 100644 --- a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java +++ b/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java @@ -28,7 +28,7 @@ public final class MimeTypes public final static String APPLICATION_VND_XMI_XML = "application/vnd.xmi+xml"; - // DKPro application types + // DKPro Core application types public final static String APPLICATION_X_ANCORA_XML = "application/x.org.dkpro.ancora+xml"; public final static String APPLICATION_X_BNC = "application/x.org.dkpro.bnc+xml"; public final static String APPLICATION_X_BRAT = "application/x.org.dkpro.brat"; @@ -67,7 +67,7 @@ public final class MimeTypes // Non-standard text types public final static String TEXT_TCF = "text/tcf+xml"; - // DKPro text types + // 
DKPro Core text types public final static String TEXT_X_CONLL_2000 = "text/x.org.dkpro.conll-2000"; public final static String TEXT_X_CONLL_2002 = "text/x.org.dkpro.conll-2002"; public final static String TEXT_X_CONLL_2003 = "text/x.org.dkpro.conll-2003"; @@ -84,6 +84,7 @@ public final class MimeTypes public final static String TEXT_X_PTB_CHUNKED = "text/x.org.dkpro.ptb-chunked"; public final static String TEXT_X_PTB_COMBINED = "text/x.org.dkpro.ptb-combined"; public final static String TEXT_X_REUTERS21578 = "text/x.org.dkpro.reuters21578"; + public final static String TEXT_X_WEBANNO_TSV3 = "text/x.org.dkpro.webanno-tsv3"; // OpenNLP model types public final static String APPLICATION_X_OPENNLP_CHUNK = "application/x.org.dkpro.core.opennlp.chunk"; diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java index 2005fedd7b..d89428a20c 100644 --- a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java @@ -23,15 +23,20 @@ import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XDeserializer; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; +import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; /** * Reads the WebAnno TSV v3.x format. 
*/ +@ResourceMetaData(name = "PubAnnotation Reader") +@MimeTypeCapability({MimeTypes.TEXT_X_WEBANNO_TSV3}) public class WebannoTsv3XReader extends JCasResourceCollectionReader_ImplBase { diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java index b6a0847bc8..dda5223bf7 100644 --- a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java @@ -23,6 +23,9 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasDocumentBuilder; import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasSchemaAnalyzer; @@ -32,10 +35,18 @@ import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; +import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; /** * Writes the WebAnno TSV v3.x format. 
*/ +@ResourceMetaData(name = "WebAnno TSV v3.x Writer") +@MimeTypeCapability({MimeTypes.TEXT_X_WEBANNO_TSV3}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"}) public class WebannoTsv3XWriter extends JCasFileWriter_ImplBase { From 2de414c31e024a8bef442f7c27488597340634d7 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 18 Feb 2019 08:37:01 +0100 Subject: [PATCH 09/18] #1329 - Span annotations with slot features may disappear from WebAnno TSV - Added component metadata --- .../java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java index d89428a20c..a04efd3d99 100644 --- a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java @@ -35,7 +35,7 @@ /** * Reads the WebAnno TSV v3.x format. 
*/ -@ResourceMetaData(name = "PubAnnotation Reader") +@ResourceMetaData(name = "WebAnno TSV v3.x Reader") @MimeTypeCapability({MimeTypes.TEXT_X_WEBANNO_TSV3}) public class WebannoTsv3XReader extends JCasResourceCollectionReader_ImplBase From a3f5dbeaa8f9bbd732c4e31d5df9f11efca736a0 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 19 Feb 2019 13:48:54 +0100 Subject: [PATCH 10/18] #1327 - Update LIF support - Add metadata about converted layers - Consider deprecated NE types - Add correct token to token map - Remove unnecessary entries from the POM --- dkpro-core-io-lif-asl/pom.xml | 30 --------------- .../dkpro/core/io/lif/internal/DKPro2Lif.java | 12 +++++- .../dkpro/core/io/lif/internal/Lif2DKPro.java | 21 ++++++++++- .../src/test/resources/conll/2006/fi-ref.lif | 37 ++++++++++++++++++- .../resources/lif/dependencystructure-ref.lif | 37 ++++++++++++++++++- .../resources/lif/phrasestructure-ref.lif | 37 ++++++++++++++++++- .../test/resources/lif/stanford-pos-ref.lif | 37 ++++++++++++++++++- 7 files changed, 174 insertions(+), 37 deletions(-) diff --git a/dkpro-core-io-lif-asl/pom.xml b/dkpro-core-io-lif-asl/pom.xml index e12a8b2fa9..d8830ba729 100644 --- a/dkpro-core-io-lif-asl/pom.xml +++ b/dkpro-core-io-lif-asl/pom.xml @@ -85,13 +85,6 @@ discriminator 2.3.3 - it.unimi.dsi fastutil @@ -105,12 +98,6 @@ junit test - - net.javacrumbs.json-unit - json-unit - 2.4.0 - test - de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.testing-asl @@ -122,21 +109,4 @@ test - diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java index 6ae657d549..ba4a80c236 100644 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java @@ -44,6 +44,8 @@ public class 
DKPro2Lif { + private static final String DKPRO_CORE_LIF_CONVERTER = "DKPro Core LIF Converter"; + private static final String PHRASE_STRUCTURE = "phrasestruct"; private static final String CONSTITUENT = "const"; private static final String DEPENDENCY_STRUCTURE = "depstruct"; @@ -67,32 +69,40 @@ public void convert(JCas aJCas, Container container) for (Paragraph p : select(aJCas, Paragraph.class)) { convertParagraph(view, p); } + view.addContains(Discriminators.Uri.PARAGRAPH, DKPRO_CORE_LIF_CONVERTER, "Paragraph"); // Sentence for (Sentence s : select(aJCas, Sentence.class)) { convertSentence(view, s); } + view.addContains(Discriminators.Uri.SENTENCE, DKPRO_CORE_LIF_CONVERTER, "Sentence"); // Token, POS, Lemma for (Token t : select(aJCas, Token.class)) { convertToken(view, t); } + view.addContains(Discriminators.Uri.TOKEN, DKPRO_CORE_LIF_CONVERTER, "Token"); + view.addContains(Discriminators.Uri.LEMMA, DKPRO_CORE_LIF_CONVERTER, "Lemma"); + view.addContains(Discriminators.Uri.POS, DKPRO_CORE_LIF_CONVERTER, "POS"); // NamedEntity for (NamedEntity ne : select(aJCas, NamedEntity.class)) { convertNamedEntity(view, ne); } + view.addContains(Discriminators.Uri.NE, DKPRO_CORE_LIF_CONVERTER, "Named entity"); // Dependencies for (Sentence s : select(aJCas, Sentence.class)) { convertDependencies(view, s); } + view.addContains(Discriminators.Uri.DEPENDENCY, DKPRO_CORE_LIF_CONVERTER, "Dependencies"); // Constituents for (ROOT r : select(aJCas, ROOT.class)) { convertConstituents(view, r); } - + view.addContains(Discriminators.Uri.PHRASE_STRUCTURE, DKPRO_CORE_LIF_CONVERTER, + "Constituents"); } private void convertParagraph(View aTarget, Paragraph aParagraph) diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java index 821db0cbe7..2a90f4dce6 100644 --- 
a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java @@ -50,6 +50,7 @@ public class Lif2DKPro { + private static final String DKPRO_CORE_LIF_CONVERTER = "DKPro Core LIF Converter"; private Map tokenIdx; public void convert(Container aContainer, JCas aJCas) @@ -65,31 +66,38 @@ public void convert(Container aContainer, JCas aJCas) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.PARAGRAPH.equals(a.getAtType())) .forEach(para -> convertParagraph(aJCas, para)); + view.addContains(Discriminators.Uri.PARAGRAPH, DKPRO_CORE_LIF_CONVERTER, "Paragraph"); // Sentence view.getAnnotations().stream() .filter(a -> Discriminators.Uri.SENTENCE.equals(a.getAtType())) .forEach(sent -> convertSentence(aJCas, sent)); + view.addContains(Discriminators.Uri.SENTENCE, DKPRO_CORE_LIF_CONVERTER, "Sentence"); // Token, POS, Lemma (builds token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.TOKEN.equals(a.getAtType())) .forEach(token -> convertToken(aJCas, token)); + view.addContains(Discriminators.Uri.TOKEN, DKPRO_CORE_LIF_CONVERTER, "Token"); // NamedEntity view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.NE.equals(a.getAtType())) + .filter(a -> isNamedEntity(a.getAtType())) .forEach(ne -> convertNamedEntity(aJCas, ne)); + view.addContains(Discriminators.Uri.NE, DKPRO_CORE_LIF_CONVERTER, "Named entity"); // Dependencies (requires token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.DEPENDENCY.equals(a.getAtType())) .forEach(dep -> convertDependency(aJCas, dep)); + view.addContains(Discriminators.Uri.DEPENDENCY, DKPRO_CORE_LIF_CONVERTER, "Dependencies"); // Constituents (requires token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.PHRASE_STRUCTURE.equals(a.getAtType())) .forEach(ps -> convertConstituents(aJCas, view, ps)); + 
view.addContains(Discriminators.Uri.PHRASE_STRUCTURE, DKPRO_CORE_LIF_CONVERTER, + "Constituents"); } private Object convertConstituents(JCas aJCas, View view, Annotation ps) @@ -210,7 +218,8 @@ private Token convertToken(JCas aTarget, Annotation aToken) } token.addToIndexes(); - tokenIdx.put(token.getId(), token); + + tokenIdx.put(aToken.getId(), token); return token; } @@ -311,4 +320,12 @@ private String findRoot(View aView, Annotation aPS) // Return the ID of the root constituent return constituents.iterator().next(); } + + private boolean isNamedEntity(String aTypeName) + { + return Discriminators.Uri.NE.equals(aTypeName) || Discriminators.Uri.DATE.equals(aTypeName) + || Discriminators.Uri.LOCATION.equals(aTypeName) + || Discriminators.Uri.ORGANIZATION.equals(aTypeName) + || Discriminators.Uri.PERSON.equals(aTypeName); + } } diff --git a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif index ba17bcc7d1..422704e70c 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif +++ b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif @@ -7,7 +7,42 @@ }, "views" : [ { "id" : "v1", - "metadata" : { }, + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : 
"DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, "annotations" : [ { "id" : "sent-0", "start" : 0, diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif index 6aef011486..9afd0d2302 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif @@ -7,7 +7,42 @@ }, "views" : [ { "id" : "v1", - "metadata" : { }, + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, "annotations" : [ { "id" : "sent-0", "start" : 0, diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif index 348c2a6fdd..fcf6b83f43 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif +++ 
b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif @@ -7,7 +7,42 @@ }, "views" : [ { "id" : "v1", - "metadata" : { }, + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, "annotations" : [ { "id" : "sent-0", "start" : 0, diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif index 744e069664..e1cfce0e1a 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif @@ -9,7 +9,42 @@ }, "views" : [ { "id" : "v1", - "metadata" : { }, + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core 
LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, "annotations" : [ { "id" : "tok-0", "start" : 0, From 8c0037ede6deb3d872951587c88cc3426abcc9c5 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 19 Feb 2019 17:08:58 +0100 Subject: [PATCH 11/18] #1327 - Update LIF support - Don't add metadata to view on reading LIF --- .../ukp/dkpro/core/io/lif/internal/Lif2DKPro.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java index 2a90f4dce6..ed8c27ae06 100644 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java +++ b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java @@ -50,7 +50,6 @@ public class Lif2DKPro { - private static final String DKPRO_CORE_LIF_CONVERTER = "DKPro Core LIF Converter"; private Map tokenIdx; public void convert(Container aContainer, JCas aJCas) @@ -66,38 +65,31 @@ public void convert(Container aContainer, JCas aJCas) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.PARAGRAPH.equals(a.getAtType())) .forEach(para -> convertParagraph(aJCas, para)); - view.addContains(Discriminators.Uri.PARAGRAPH, DKPRO_CORE_LIF_CONVERTER, "Paragraph"); // Sentence view.getAnnotations().stream() .filter(a -> Discriminators.Uri.SENTENCE.equals(a.getAtType())) .forEach(sent -> 
convertSentence(aJCas, sent)); - view.addContains(Discriminators.Uri.SENTENCE, DKPRO_CORE_LIF_CONVERTER, "Sentence"); // Token, POS, Lemma (builds token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.TOKEN.equals(a.getAtType())) .forEach(token -> convertToken(aJCas, token)); - view.addContains(Discriminators.Uri.TOKEN, DKPRO_CORE_LIF_CONVERTER, "Token"); // NamedEntity view.getAnnotations().stream() .filter(a -> isNamedEntity(a.getAtType())) .forEach(ne -> convertNamedEntity(aJCas, ne)); - view.addContains(Discriminators.Uri.NE, DKPRO_CORE_LIF_CONVERTER, "Named entity"); // Dependencies (requires token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.DEPENDENCY.equals(a.getAtType())) .forEach(dep -> convertDependency(aJCas, dep)); - view.addContains(Discriminators.Uri.DEPENDENCY, DKPRO_CORE_LIF_CONVERTER, "Dependencies"); // Constituents (requires token index) view.getAnnotations().stream() .filter(a -> Discriminators.Uri.PHRASE_STRUCTURE.equals(a.getAtType())) .forEach(ps -> convertConstituents(aJCas, view, ps)); - view.addContains(Discriminators.Uri.PHRASE_STRUCTURE, DKPRO_CORE_LIF_CONVERTER, - "Constituents"); } private Object convertConstituents(JCas aJCas, View view, Annotation ps) From 06d6da29b32d04863258ed7b3e0d2c47940aa80f Mon Sep 17 00:00:00 2001 From: Tobias Horsmann Date: Mon, 4 Mar 2019 14:12:01 +0100 Subject: [PATCH 12/18] #1308 - integrate mystem --- dkpro-core-asl/pom.xml | 6 + dkpro-core-mystem-asl/LICENSE.txt | 202 ++++++++++++ dkpro-core-mystem-asl/pom.xml | 115 +++++++ .../org/dkpro/core/mystem/MyStemStemmer.java | 291 ++++++++++++++++++ dkpro-core-mystem-asl/src/scripts/build.xml | 130 ++++++++ .../dkpro/core/mystem/MyStemStemmerTest.java | 54 ++++ 6 files changed, 798 insertions(+) create mode 100644 dkpro-core-mystem-asl/LICENSE.txt create mode 100644 dkpro-core-mystem-asl/pom.xml create mode 100644 dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java create mode 100644 
dkpro-core-mystem-asl/src/scripts/build.xml create mode 100644 dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java diff --git a/dkpro-core-asl/pom.xml b/dkpro-core-asl/pom.xml index 727dc0cc63..81d222fe3d 100644 --- a/dkpro-core-asl/pom.xml +++ b/dkpro-core-asl/pom.xml @@ -441,6 +441,11 @@ de.tudarmstadt.ukp.dkpro.core.mecab-asl 1.11.0-SNAPSHOT + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.mystem-asl + 1.11.0-SNAPSHOT + de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.ngrams-asl @@ -616,6 +621,7 @@ ../dkpro-core-mecab-asl ../dkpro-core-morpha-asl ../dkpro-core-mstparser-asl + ../dkpro-core-mystem-asl ../dkpro-core-ngrams-asl ../dkpro-core-nlp4j-asl ../dkpro-core-norvig-asl diff --git a/dkpro-core-mystem-asl/LICENSE.txt b/dkpro-core-mystem-asl/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/dkpro-core-mystem-asl/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-mystem-asl/pom.xml b/dkpro-core-mystem-asl/pom.xml new file mode 100644 index 0000000000..949f26673a --- /dev/null +++ b/dkpro-core-mystem-asl/pom.xml @@ -0,0 +1,115 @@ + + + 4.0.0 + org.dkpro.core + dkpro-core-mystem-asl + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core-asl + 1.11.0-SNAPSHOT + ../dkpro-core-asl + + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + commons-io + + commons-io + + + + org.apache.commons + + commons-lang3 + + + + org.dkpro.core + dkpro-core-mystem-bin + + + de.tudarmstadt.ukp.dkpro.core + + de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl + + + + de.tudarmstadt.ukp.dkpro.core + + de.tudarmstadt.ukp.dkpro.core.api.resources-asl + + + + de.tudarmstadt.ukp.dkpro.core + + de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + + + + junit + junit + + + de.tudarmstadt.ukp.dkpro.core + + de.tudarmstadt.ukp.dkpro.core.testing-asl + + + + + + + org.dkpro.core + dkpro-core-mystem-bin + 20180116.0 + + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + org.dkpro.core:dkpro-core-mystem-bin + + + + + + + https://dkpro.github.io/dkpro-core/ + DKPro Core ASL - MyStem + \ No newline at end of file diff --git a/dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java 
b/dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java new file mode 100644 index 0000000000..08b82a58c1 --- /dev/null +++ b/dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.dkpro.core.mystem; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.output.FileWriterWithEncoding; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.pear.util.FileUtil; +import org.apache.uima.resource.ResourceInitializationException; + +import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; +import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; +import de.tudarmstadt.ukp.dkpro.core.api.resources.PlatformDetector; +import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * This MyStem stemmer implementation only works with the Russian language. 
+ */ +@Component(OperationType.STEMMER) +@ResourceMetaData(name = "MyStem Stemmer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("ru") +@TypeCapability(inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" }) +public class MyStemStemmer + extends FeaturePathAnnotatorBase +{ + + private static final String MESSAGE_DIGEST = MyStemStemmer.class.getName() + "_Messages"; + + private RuntimeProvider runtimeProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + catch32BitOperatinSystemUsers(); + + runtimeProvider = new RuntimeProvider("classpath:/org/dkpro/core/mystem/bin/"); + } + + private void catch32BitOperatinSystemUsers() + { + PlatformDetector detector = new PlatformDetector(); + if (detector.getArch().equals(PlatformDetector.ARCH_X86_32)) { + throw new UnsupportedOperationException("Only 64bit operating systems supported"); + } + } + + @Override + protected Set getDefaultPaths() + { + return Collections.singleton(Token.class.getName()); + } + + @Override + protected void generateAnnotations(JCas aJCas) + throws FeaturePathException, AnalysisEngineProcessException + { + + // CAS is necessary to retrieve values + CAS currCAS = aJCas.getCas(); + + // Try language set in CAS. 
+ String lang = aJCas.getDocumentLanguage(); + + if (StringUtils.isBlank(lang)) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); + } + + lang = lang.toLowerCase(Locale.US); + + if (!"ru".equals(lang)) { // Only specified language is supported + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "unsupported_language_error", + new Object[] { lang }); + } + + for (String path : paths) { + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + String typeName = segments[0]; + + // Try to get the type from the typesystem of the CAS + Type t = CasUtil.getType(currCAS, typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // get an fpi object and initialize it + // initialize the FeaturePathInfo with the corresponding part + initializeFeaturePathInfoFrom(fp, segments); + + // get the annotations + AnnotationIndex idx = currCAS.getAnnotationIndex(t); + FSIterator iterator = idx.iterator(); + + List afs = new ArrayList<>(); + iterator.forEachRemaining(x -> afs.add((AnnotationFS) x)); + + // get the stems + PlatformDetector pd = new PlatformDetector(); + String platform = pd.getPlatformId(); + getLogger().info("Load binary for platform: [" + platform + "]"); + + File executableFile = getExecutable(); + + File inputFile = prepareInputfile(aJCas); + File outputFile = prepareOutputFile(); + + List cmd = new ArrayList<>(); + cmd.add(executableFile.getAbsolutePath()); + cmd.add("-n"); // one word per line output + cmd.add("-l"); // suppress input token form and output only stem + cmd.add(inputFile.getAbsolutePath()); + cmd.add(outputFile.getAbsolutePath()); + + runProcess(cmd); + + List l = readStemmerOutput(outputFile); + + if (afs.size() != l.size()) { + throw new AnalysisEngineProcessException(new IllegalStateException( + "Number of [" + t.getName() + "] annotations [" + afs.size() + + "] does not match with number of stems [" + 
l.size() + "]")); + } + + for (int i = 0; i < l.size(); i++) { + + AnnotationFS fs = afs.get(i); + String stem = l.get(i); + + if (this.filterFeaturePath != null) { + // check annotation filter condition + if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { + createStemAnnotation(aJCas, fs, stem); + } + } + else { // no annotation filter specified + createStemAnnotation(aJCas, fs, stem); + } + } + } + } + + private void createStemAnnotation(JCas aJCas, AnnotationFS fs, String stem) + throws AnalysisEngineProcessException + { + + // Check for blank text, it makes no sense to add a stem then (and raised an + // exception) + String value = fp.getValue(fs); + if (!StringUtils.isBlank(value)) { + Stem stemAnnot = new Stem(aJCas, fs.getBegin(), fs.getEnd()); + + stemAnnot.setValue(stem); + stemAnnot.addToIndexes(aJCas); + + // Try setting the "stem" feature on Tokens. + Feature feat = fs.getType().getFeatureByBaseName("stem"); + if (feat != null && feat.getRange() != null + && aJCas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { + fs.setFeatureValue(feat, stemAnnot); + } + } + } + + private List readStemmerOutput(File outputFile) throws AnalysisEngineProcessException + { + List readLines; + try { + readLines = FileUtils.readLines(outputFile, "utf-8"); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + return readLines; + } + + private void runProcess(List cmd) throws AnalysisEngineProcessException + { + try { + ProcessBuilder pb = new ProcessBuilder(); + pb.inheritIO(); + pb.command(cmd); + Process p = pb.start(); + p.waitFor(); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private File prepareOutputFile() throws AnalysisEngineProcessException + { + try { + File file = FileUtil.createTempFile("mystemOutput" + System.currentTimeMillis(), + ".txt"); + file.deleteOnExit(); + return file; + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + 
} + + private File prepareInputfile(JCas aJCas) throws AnalysisEngineProcessException + { + File inputTmp = null; + try { + inputTmp = FileUtil.createTempFile("mystemInput" + System.currentTimeMillis(), ".txt"); + + try (BufferedWriter wrt = new BufferedWriter( + new FileWriterWithEncoding(inputTmp, "utf-8"))) { + Iterator iterator = JCasUtil.select(aJCas, Token.class).iterator(); + while (iterator.hasNext()) { + Token next = iterator.next(); + wrt.write(next.getCoveredText()); + if (iterator.hasNext()) { + wrt.write(" "); + } + } + } + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + + if (inputTmp != null) { + inputTmp.deleteOnExit(); + } + return inputTmp; + } + + private File getExecutable() throws AnalysisEngineProcessException + { + File exec = null; + try { + exec = runtimeProvider.getFile("mystem"); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + return exec; + } +} diff --git a/dkpro-core-mystem-asl/src/scripts/build.xml b/dkpro-core-mystem-asl/src/scripts/build.xml new file mode 100644 index 0000000000..71b1c769ed --- /dev/null +++ b/dkpro-core-mystem-asl/src/scripts/build.xml @@ -0,0 +1,130 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MyStem version 3.1 + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java b/dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java new file mode 100644 index 0000000000..becb60c302 --- /dev/null +++ b/dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java @@ -0,0 +1,54 @@ +package org.dkpro.core.mystem; +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.jcas.JCas; +import org.dkpro.core.mystem.MyStemStemmer; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; +import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; +import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; + +public class MyStemStemmerTest +{ + @Test + public void testRussian() throws Exception + { + runTest("ru", "Не печатать исходные словоформы, только леммы и граммемы.", new String[] { + "не", "печатать", "исходный", "словоформа", "только", "лемма", "и", "граммема" }); + } + + private JCas runTest(String aLanguage, String aText, String[] aStems, Object... 
aParams) + throws Exception + { + JCas result = TestRunner.runTest(createEngineDescription(MyStemStemmer.class, aParams), + aLanguage, aText); + AssertAnnotations.assertStem(aStems, select(result, Stem.class)); + + return result; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} From 5f80ef4d3f0e80e8195a831be79816bef554bf44 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 4 Mar 2019 14:31:57 +0100 Subject: [PATCH 13/18] #1322 - Upgrade to OpenNLP 1.9.1 - Bumped dependency version to 1.9.1 --- dkpro-core-opennlp-asl/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dkpro-core-opennlp-asl/pom.xml b/dkpro-core-opennlp-asl/pom.xml index 0c06ed6b7d..5bfc020c15 100644 --- a/dkpro-core-opennlp-asl/pom.xml +++ b/dkpro-core-opennlp-asl/pom.xml @@ -30,7 +30,7 @@ DKPro Core ASL - OpenNLP (v ${opennlp.version}) (ASL) https://dkpro.github.io/dkpro-core/ - 1.9.0 + 1.9.1 From 8bdecb1a34b3b127d48dfc63beb77cf83801182d Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 19 Mar 2019 13:47:30 +0100 Subject: [PATCH 14/18] #1338 - Factor CAS <-> brat conversion code into Pojos - Moved conversion code from BratWriter to DKPro2Brat --- .../ukp/dkpro/core/io/brat/BratWriter.java | 469 +------------- .../ukp/dkpro/core/io/brat/DKPro2Brat.java | 593 ++++++++++++++++++ 2 files changed, 609 insertions(+), 453 deletions(-) create mode 100644 dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/DKPro2Brat.java diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java index 46f47b2df7..3446277da7 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java +++ b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java @@ -17,39 +17,22 @@ */ package 
de.tudarmstadt.ukp.dkpro.core.io.brat; -import static org.apache.uima.fit.util.JCasUtil.selectAll; - import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.StringWriter; import java.io.Writer; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; +import java.util.Collection; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.FSUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; @@ -59,19 +42,8 @@ import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotation; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotationDocument; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAttributeDecl; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratConfiguration; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratConstants; -import 
de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotationDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgument; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgumentDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratRelationAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotationDrawingDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.Offsets; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.RelationParam; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TypeMapping; import eu.openminted.share.annotations.api.DocumentationResource; @@ -147,7 +119,6 @@ public class BratWriter extends JCasFileWriter_ImplBase @ConfigurationParameter(name = PARAM_RELATION_TYPES, mandatory = true, defaultValue = { "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent" }) private Set relationTypes; - private Map parsedRelationTypes; // /** // * Types that are events. Optionally, multiple slot features can be specified. 
@@ -176,7 +147,6 @@ public class BratWriter extends JCasFileWriter_ImplBase "de.tudarmstadt.ukp.dkpro.core.api.ner.type.(\\w+) -> $1" }) private String[] typeMappings; - private TypeMapping typeMapping; /** * The brat web application can currently not handle attributes on relations, thus they are @@ -209,41 +179,33 @@ public class BratWriter extends JCasFileWriter_ImplBase @ConfigurationParameter(name = PARAM_SHORT_ATTRIBUTE_NAMES, mandatory = true, defaultValue = "false") private boolean shortAttributeNames; - private int nextEventAnnotationId; - private int nextTextAnnotationId; - private int nextRelationAnnotationId; - private int nextAttributeId; - private int nextPaletteIndex; - private Map spanIdMap; - private BratConfiguration conf; - private final static Pattern NEWLINE_EXTRACT_PATTERN = Pattern.compile("(.+?)(?:\\R|$)+"); - - private Set warnings; + private DKPro2Brat converter; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); - conf = new BratConfiguration(); - - warnings = new LinkedHashSet(); - parsedRelationTypes = new HashMap<>(); - for (String rel : relationTypes) { - RelationParam p = RelationParam.parse(rel); - parsedRelationTypes.put(p.getType(), p); - } - // parsedEventTypes = new HashMap<>(); // for (String rel : eventTypes) { // EventParam p = EventParam.parse(rel); // parsedEventTypes.put(p.getType(), p); // } + conf = new BratConfiguration(); + converter = new DKPro2Brat(conf); + converter.setWriteNullAttributes(writeNullAttributes); + converter.setWriteRelationAttributes(writeRelationAttributes); + converter.setShortAttributeNames(shortAttributeNames); + converter.setPalette(palette); + converter.setExcludeTypes(excludeTypes); + converter.setSpanTypes(spanTypes); + converter.setRelationTypes( + relationTypes.stream().map(RelationParam::parse).collect(Collectors.toList())); if (enableTypeMappings) { - typeMapping = new TypeMapping(typeMappings); + 
converter.setTypeMapping(new TypeMapping(typeMappings)); } } @@ -251,13 +213,6 @@ public void initialize(UimaContext aContext) public void process(JCas aJCas) throws AnalysisEngineProcessException { - nextEventAnnotationId = 1; - nextTextAnnotationId = 1; - nextRelationAnnotationId = 1; - nextAttributeId = 1; - nextPaletteIndex = 0; - spanIdMap = new HashMap<>(); - try { if (".ann".equals(filenameSuffix)) { writeText(aJCas); @@ -284,10 +239,6 @@ public void collectionProcessComplete() catch (IOException e) { throw new AnalysisEngineProcessException(e); } - - for (String warning : warnings) { - getLogger().warn(warning); - } } private void writeAnnotationConfiguration() @@ -311,52 +262,10 @@ private void writeAnnotations(JCas aJCas) { BratAnnotationDocument doc = new BratAnnotationDocument(); - List relationFS = new ArrayList<>(); + Collection warnings = converter.convert(aJCas, doc); - Map eventFS = new LinkedHashMap<>(); - - // Go through all the annotations but only handle the ones that have no references to - // other annotations. 
- for (FeatureStructure fs : selectAll(aJCas)) { - // Skip document annotation - if (fs == aJCas.getDocumentAnnotationFs()) { - continue; - } - - // Skip excluded types - if (excludeTypes.contains(fs.getType().getName())) { - getLogger().debug("Excluding [" + fs.getType().getName() + "]"); - continue; - } - - if (spanTypes.contains(fs.getType().getName())) { - writeTextAnnotation(doc, (AnnotationFS) fs); - } - else if (parsedRelationTypes.containsKey(fs.getType().getName())) { - relationFS.add(fs); - } - else if (hasNonPrimitiveFeatures(fs) && (fs instanceof AnnotationFS)) { -// else if (parsedEventTypes.containsKey(fs.getType().getName())) { - BratEventAnnotation event = writeEventAnnotation(doc, (AnnotationFS) fs); - eventFS.put(event, fs); - } - else if (fs instanceof AnnotationFS) { - warnings.add("Assuming annotation type [" + fs.getType().getName() + "] is span"); - writeTextAnnotation(doc, (AnnotationFS) fs); - } - else { - warnings.add("Skipping annotation with type [" + fs.getType().getName() + "]"); - } - } - - // Handle relations now since now we can resolve their targets to IDs. - for (FeatureStructure fs : relationFS) { - writeRelationAnnotation(doc, fs); - } - - // Handle event slots now since now we can resolve their targets to IDs. - for (Entry e : eventFS.entrySet()) { - writeSlots(doc, e.getKey(), e.getValue()); + for (String warning : warnings) { + getLogger().warn(warning); } switch (filenameSuffix) { @@ -410,352 +319,6 @@ else if (fs instanceof AnnotationFS) { } } - /** - * Checks if the feature structure has non-default non-primitive properties. 
- */ - private boolean hasNonPrimitiveFeatures(FeatureStructure aFS) - { - for (Feature f : aFS.getType().getFeatures()) { - if (CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { - continue; - } - - if (!f.getRange().isPrimitive()) { - return true; - } - } - - return false; - } - - private String getBratType(Type aType) - { - if (enableTypeMappings) { - return typeMapping.getBratType(aType); - } - else { - return aType.getName().replace('.', '-'); - } - } - - private BratEventAnnotation writeEventAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) - { - - // Write trigger annotation - BratTextAnnotation trigger = splitNewline(aFS); - - nextTextAnnotationId++; - - // Write event annotation - BratEventAnnotation event = new BratEventAnnotation(nextEventAnnotationId, - getBratType(aFS.getType()), trigger.getId()); - spanIdMap.put(aFS, event.getId()); - nextEventAnnotationId++; - - // We do not add the trigger annotations to the document - they are owned by the event - //aDoc.addAnnotation(trigger); - event.setTriggerAnnotation(trigger); - - // Write attributes - writeAttributes(event, aFS); - - // Slots are written later after we know all the span/event IDs - - conf.addLabelDecl(event.getType(), aFS.getType().getShortName(), aFS.getType() - .getShortName().substring(0, 1)); - - if (!conf.hasDrawingDecl(event.getType())) { - conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(event.getType(), "black", - palette[nextPaletteIndex % palette.length])); - nextPaletteIndex++; - } - - aDoc.addAnnotation(event); - return event; - } - - private void writeSlots(BratAnnotationDocument aDoc, BratEventAnnotation aEvent, - FeatureStructure aFS) - { - String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); - String type = getBratType(aFS.getType()); - - assert type.equals(aEvent.getType()); - - BratEventAnnotationDecl decl = conf.getEventDecl(type); - if (decl == null) { - decl = new BratEventAnnotationDecl(superType, type); - 
conf.addEventDecl(decl); - } - - Map> slots = new LinkedHashMap<>(); - for (Feature feat : aFS.getType().getFeatures()) { - if (!isSlotFeature(aFS, feat)) { - continue; - } - String slot = feat.getShortName(); - - List args = slots.get(slot); - if (args == null) { - args = new ArrayList<>(); - slots.put(slot, args); - } - - if ( - FSUtil.isMultiValuedFeature(aFS, feat) - // this can only be true for array types - && feat.getRange().getComponentType() != null - // Avoid calling getParent on TOP - && !CAS.TYPE_NAME_TOP.equals(feat.getRange().getComponentType().getName()) - && CAS.TYPE_NAME_TOP.equals(aFS.getCAS().getTypeSystem() - .getParent(feat.getRange().getComponentType()).getName()) - && (feat.getRange().getComponentType().getFeatureByBaseName("target") != null) - && (feat.getRange().getComponentType().getFeatureByBaseName("role") != null) - ) { - // Handle WebAnno-style slot links - // FIXME It would be better if the link type could be configured, e.g. what - // is the name of the link feature and what is the name of the role feature... 
- // but right now we just keep it hard-coded to the values that are used - // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno - BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, - BratConstants.CARD_ZERO_OR_MORE); - decl.addSlot(slotDecl); - - FeatureStructure[] links = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); - if (links != null) { - for (FeatureStructure link : links) { - FeatureStructure target = FSUtil.getFeature(link, "target", - FeatureStructure.class); - Feature roleFeat = link.getType().getFeatureByBaseName("role"); - BratEventArgument arg = new BratEventArgument(slot, args.size(), - spanIdMap.get(target)); - args.add(arg); - - // Attach the role attribute to the target span - BratAnnotation targetAnno = aDoc.getAnnotation(spanIdMap.get(target)); - writePrimitiveAttribute(targetAnno, link, roleFeat); - } - } - } - else if (FSUtil.isMultiValuedFeature(aFS, feat)) { - // Handle normal multi-valued features - BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, - BratConstants.CARD_ZERO_OR_MORE); - decl.addSlot(slotDecl); - - FeatureStructure[] targets = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); - if (targets != null) { - for (FeatureStructure target : targets) { - BratEventArgument arg = new BratEventArgument(slot, args.size(), - spanIdMap.get(target)); - args.add(arg); - } - } - } - else { - // Handle normal single-valued features - BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, - BratConstants.CARD_OPTIONAL); - decl.addSlot(slotDecl); - - FeatureStructure target = FSUtil.getFeature(aFS, feat, FeatureStructure.class); - if (target != null) { - BratEventArgument arg = new BratEventArgument(slot, args.size(), - spanIdMap.get(target)); - args.add(arg); - } - } - } - - aEvent.setArguments(slots.values().stream().flatMap(args -> args.stream()) - .collect(Collectors.toList())); - } - - private boolean isSlotFeature(FeatureStructure aFS, Feature aFeature) - { - return 
!isInternalFeature(aFeature) - && (FSUtil.isMultiValuedFeature(aFS, aFeature) || !aFeature.getRange() - .isPrimitive()); - } - - private void writeRelationAnnotation(BratAnnotationDocument aDoc, FeatureStructure aFS) - { - RelationParam rel = parsedRelationTypes.get(aFS.getType().getName()); - - FeatureStructure arg1 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( - rel.getArg1())); - FeatureStructure arg2 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( - rel.getArg2())); - - if (arg1 == null || arg2 == null) { - throw new IllegalArgumentException("Dangling relation"); - } - - String arg1Id = spanIdMap.get(arg1); - String arg2Id = spanIdMap.get(arg2); - - if (arg1Id == null || arg2Id == null) { - throw new IllegalArgumentException("Unknown targets!"); - } - - String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); - String type = getBratType(aFS.getType()); - - BratRelationAnnotation anno = new BratRelationAnnotation(nextRelationAnnotationId, - type, rel.getArg1(), arg1Id, rel.getArg2(), arg2Id); - nextRelationAnnotationId++; - - conf.addRelationDecl(superType, type, rel.getArg1(), rel.getArg2()); - - conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() - .getShortName().substring(0, 1)); - - aDoc.addAnnotation(anno); - - // brat doesn't support attributes on relations - // https://github.com/nlplab/brat/issues/791 - if (writeRelationAttributes) { - writeAttributes(anno, aFS); - } - } - - - - private BratTextAnnotation splitNewline(AnnotationFS aFS) - { - - // extract all but newlines as groups - Matcher m = NEWLINE_EXTRACT_PATTERN.matcher(aFS.getCoveredText()); - List offsets = new ArrayList<>(); - while (m.find()) { - Offsets offset = new Offsets(m.start(1) + aFS.getBegin(), m.end(1) + aFS.getBegin() ); - offsets.add(offset); - } - // replaces any group of newline by one space - String[] texts = new String[] { aFS.getCoveredText().replaceAll("\\R+", " ") }; - return new 
BratTextAnnotation(nextTextAnnotationId, getBratType(aFS.getType()), offsets, - texts); - } - - private void writeTextAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) - { - String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); - String type = getBratType(aFS.getType()); - BratTextAnnotation anno = splitNewline(aFS); - - nextTextAnnotationId++; - - conf.addEntityDecl(superType, type); - - conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() - .getShortName().substring(0, 1)); - - if (!conf.hasDrawingDecl(anno.getType())) { - conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(anno.getType(), "black", - palette[nextPaletteIndex % palette.length])); - nextPaletteIndex++; - } - - aDoc.addAnnotation(anno); - - writeAttributes(anno, aFS); - - spanIdMap.put(aFS, anno.getId()); - } - - private boolean isInternalFeature(Feature aFeature) - { - // https://issues.apache.org/jira/browse/UIMA-4565 - return "uima.cas.AnnotationBase:sofa".equals(aFeature.getName()); - // return CAS.FEATURE_FULL_NAME_SOFA.equals(aFeature.getName()); - } - - private void writeAttributes(BratAnnotation aAnno, FeatureStructure aFS) - { - for (Feature feat : aFS.getType().getFeatures()) { - // Skip Sofa feature - if (isInternalFeature(feat)) { - continue; - } - - // No need to write begin / end, they are already on the text annotation - if (CAS.FEATURE_FULL_NAME_BEGIN.equals(feat.getName()) || - CAS.FEATURE_FULL_NAME_END.equals(feat.getName())) { - continue; - } - - // No need to write link endpoints again, they are already on the relation annotation - RelationParam relParam = parsedRelationTypes.get(aFS.getType().getName()); - if (relParam != null) { - if (relParam.getArg1().equals(feat.getShortName()) - || relParam.getArg2().equals(feat.getShortName())) { - continue; - } - } - - if (feat.getRange().isPrimitive()) { - writePrimitiveAttribute(aAnno, aFS, feat); - } - // The following warning is not relevant for event annotations 
because these render such - // features as slots. - else if (!(aAnno instanceof BratEventAnnotation)) { - warnings.add( - "Unable to render feature [" + feat.getName() + "] with range [" - + feat.getRange().getName() + "] as attribute"); - } - } - } - - private void writePrimitiveAttribute(BratAnnotation aAnno, FeatureStructure aFS, Feature feat) - { - String featureValue = aFS.getFeatureValueAsString(feat); - - // Do not write attributes with null values unless this is explicitly enabled - if (featureValue == null && !writeNullAttributes) { - return; - } - - String attributeName = shortAttributeNames ? feat.getShortName() - : aAnno.getType() + '_' + feat.getShortName(); - - aAnno.addAttribute(nextAttributeId, attributeName, featureValue); - nextAttributeId++; - - // Do not write certain values to the visual/annotation configuration because - // they are not compatible with the brat annotation file format. The values are - // still maintained in the ann file. - if (isValidFeatureValue(featureValue)) { - // Features are inherited to subtypes in UIMA. By storing the attribute under - // the name of the type that declares the feature (domain) instead of the name - // of the actual instance we are processing, we make sure not to maintain - // multiple value sets for the same feature. - BratAttributeDecl attrDecl = conf.addAttributeDecl( - aAnno.getType(), - getAllSubtypes(aFS.getCAS().getTypeSystem(), feat.getDomain()), - attributeName, featureValue); - conf.addDrawingDecl(attrDecl); - } - } - - // This generates lots of types as well that we may not otherwise have in declared in the - // brat configuration files, but brat doesn't seem to mind. - private Set getAllSubtypes(TypeSystem aTS, Type aType) - { - Set types = new LinkedHashSet<>(); - aTS.getProperlySubsumedTypes(aType).stream().forEach(t -> types.add(getBratType(t))); - return types; - } - - /** - * Some feature values do not need to be registered or cannot be registered because brat does - * not support them. 
- */ - private boolean isValidFeatureValue(String aFeatureValue) - { - // https://github.com/nlplab/brat/issues/1149 - return !(aFeatureValue == null || aFeatureValue.length() == 0 || aFeatureValue.equals(",")); - } - private void writeText(JCas aJCas) throws IOException { diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/DKPro2Brat.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/DKPro2Brat.java new file mode 100644 index 0000000000..9f1d2ab486 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/DKPro2Brat.java @@ -0,0 +1,593 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.tudarmstadt.ukp.dkpro.core.io.brat; + +import static org.apache.uima.fit.util.JCasUtil.selectAll; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.util.FSUtil; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotation; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotationDocument; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAttributeDecl; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratConfiguration; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratConstants; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotation; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotationDecl; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgument; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgumentDecl; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratRelationAnnotation; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotation; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotationDrawingDecl; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.Offsets; +import 
de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.RelationParam; +import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TypeMapping; + +public class DKPro2Brat +{ + private final Log log = LogFactory.getLog(getClass()); + + private final static Pattern NEWLINE_EXTRACT_PATTERN = Pattern.compile("(.+?)(?:\\R|$)+"); + + private final BratConfiguration conf; + + private int nextEventAnnotationId; + private int nextTextAnnotationId; + private int nextRelationAnnotationId; + private int nextAttributeId; + private int nextPaletteIndex; + private Map spanIdMap; + + private Set warnings; + + private String[] palette = new String[] { "#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", + "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd", "#ccebc5", "#ffed6f" }; + private Set excludeTypes = Collections + .singleton("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"); + private Set spanTypes = new HashSet<>(); + private Map parsedRelationTypes = new HashMap<>(); + private TypeMapping typeMapping; + + private boolean writeRelationAttributes; + private boolean writeNullAttributes; + private boolean shortAttributeNames; + + public DKPro2Brat(BratConfiguration aConf) + { + super(); + conf = aConf; + } + + public boolean isWriteRelationAttributes() + { + return writeRelationAttributes; + } + + public void setWriteRelationAttributes(boolean aWriteRelationAttributes) + { + writeRelationAttributes = aWriteRelationAttributes; + } + + public boolean isWriteNullAttributes() + { + return writeNullAttributes; + } + + public void setWriteNullAttributes(boolean aWriteNullAttributes) + { + writeNullAttributes = aWriteNullAttributes; + } + + public boolean isShortAttributeNames() + { + return shortAttributeNames; + } + + public void setShortAttributeNames(boolean aShortAttributeNames) + { + shortAttributeNames = aShortAttributeNames; + } + + + + public String[] getPalette() + { + return palette; + } + + public void setPalette(String[] aPalette) + { + palette = 
aPalette; + } + + public Set getExcludeTypes() + { + return excludeTypes; + } + + public void setExcludeTypes(Set aExcludeTypes) + { + excludeTypes = aExcludeTypes; + } + + public Map getRelationTypes() + { + return parsedRelationTypes; + } + + public void setRelationTypes(Collection aRelationTypes) + { + aRelationTypes.stream().forEachOrdered(p -> parsedRelationTypes.put(p.getType(), p)); + } + + public Set getSpanTypes() + { + return spanTypes; + } + + public void setSpanTypes(Set aSpanTypes) + { + spanTypes = aSpanTypes; + } + + public TypeMapping getTypeMapping() + { + return typeMapping; + } + + public void setTypeMapping(TypeMapping aTypeMapping) + { + typeMapping = aTypeMapping; + } + + private void init() + { + nextEventAnnotationId = 1; + nextTextAnnotationId = 1; + nextRelationAnnotationId = 1; + nextAttributeId = 1; + nextPaletteIndex = 0; + spanIdMap = new HashMap<>(); + warnings = new LinkedHashSet<>(); + } + + public Set convert(JCas aJCas, BratAnnotationDocument doc) + { + init(); + + List relationFS = new ArrayList<>(); + + Map eventFS = new LinkedHashMap<>(); + + // Go through all the annotations but only handle the ones that have no references to + // other annotations. 
+ for (FeatureStructure fs : selectAll(aJCas)) { + // Skip document annotation + if (fs == aJCas.getDocumentAnnotationFs()) { + continue; + } + + // Skip excluded types + if (excludeTypes.contains(fs.getType().getName())) { + log.debug("Excluding [" + fs.getType().getName() + "]"); + continue; + } + + if (spanTypes.contains(fs.getType().getName())) { + writeTextAnnotation(doc, (AnnotationFS) fs); + } + else if (parsedRelationTypes.containsKey(fs.getType().getName())) { + relationFS.add(fs); + } + else if (hasNonPrimitiveFeatures(fs) && (fs instanceof AnnotationFS)) { +// else if (parsedEventTypes.containsKey(fs.getType().getName())) { + BratEventAnnotation event = writeEventAnnotation(doc, (AnnotationFS) fs); + eventFS.put(event, fs); + } + else if (fs instanceof AnnotationFS) { + warnings.add("Assuming annotation type [" + fs.getType().getName() + "] is span"); + writeTextAnnotation(doc, (AnnotationFS) fs); + } + else { + warnings.add("Skipping annotation with type [" + fs.getType().getName() + "]"); + } + } + + // Handle relations now since now we can resolve their targets to IDs. + for (FeatureStructure fs : relationFS) { + writeRelationAnnotation(doc, fs); + } + + // Handle event slots now since now we can resolve their targets to IDs. + for (Entry e : eventFS.entrySet()) { + writeSlots(doc, e.getKey(), e.getValue()); + } + + return warnings; + } + + /** + * Checks if the feature structure has non-default non-primitive properties. 
+ */ + private boolean hasNonPrimitiveFeatures(FeatureStructure aFS) + { + for (Feature f : aFS.getType().getFeatures()) { + if (CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { + continue; + } + + if (!f.getRange().isPrimitive()) { + return true; + } + } + + return false; + } + + private BratEventAnnotation writeEventAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) + { + + // Write trigger annotation + BratTextAnnotation trigger = splitNewline(aFS); + + nextTextAnnotationId++; + + // Write event annotation + BratEventAnnotation event = new BratEventAnnotation(nextEventAnnotationId, + getBratType(aFS.getType()), trigger.getId()); + spanIdMap.put(aFS, event.getId()); + nextEventAnnotationId++; + + // We do not add the trigger annotations to the document - they are owned by the event + //aDoc.addAnnotation(trigger); + event.setTriggerAnnotation(trigger); + + // Write attributes + writeAttributes(event, aFS); + + // Slots are written later after we know all the span/event IDs + + conf.addLabelDecl(event.getType(), aFS.getType().getShortName(), aFS.getType() + .getShortName().substring(0, 1)); + + if (!conf.hasDrawingDecl(event.getType())) { + conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(event.getType(), "black", + palette[nextPaletteIndex % palette.length])); + nextPaletteIndex++; + } + + aDoc.addAnnotation(event); + return event; + } + + private void writeTextAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) + { + String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); + String type = getBratType(aFS.getType()); + BratTextAnnotation anno = splitNewline(aFS); + + nextTextAnnotationId++; + + conf.addEntityDecl(superType, type); + + conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() + .getShortName().substring(0, 1)); + + if (!conf.hasDrawingDecl(anno.getType())) { + conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(anno.getType(), "black", + palette[nextPaletteIndex % 
palette.length])); + nextPaletteIndex++; + } + + aDoc.addAnnotation(anno); + + writeAttributes(anno, aFS); + + spanIdMap.put(aFS, anno.getId()); + } + + private void writeRelationAnnotation(BratAnnotationDocument aDoc, FeatureStructure aFS) + { + RelationParam rel = parsedRelationTypes.get(aFS.getType().getName()); + + FeatureStructure arg1 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( + rel.getArg1())); + FeatureStructure arg2 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( + rel.getArg2())); + + if (arg1 == null || arg2 == null) { + throw new IllegalArgumentException("Dangling relation"); + } + + String arg1Id = spanIdMap.get(arg1); + String arg2Id = spanIdMap.get(arg2); + + if (arg1Id == null || arg2Id == null) { + throw new IllegalArgumentException("Unknown targets!"); + } + + String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); + String type = getBratType(aFS.getType()); + + BratRelationAnnotation anno = new BratRelationAnnotation(nextRelationAnnotationId, + type, rel.getArg1(), arg1Id, rel.getArg2(), arg2Id); + nextRelationAnnotationId++; + + conf.addRelationDecl(superType, type, rel.getArg1(), rel.getArg2()); + + conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() + .getShortName().substring(0, 1)); + + aDoc.addAnnotation(anno); + + // brat doesn't support attributes on relations + // https://github.com/nlplab/brat/issues/791 + if (writeRelationAttributes) { + writeAttributes(anno, aFS); + } + } + + private void writeAttributes(BratAnnotation aAnno, FeatureStructure aFS) + { + for (Feature feat : aFS.getType().getFeatures()) { + // Skip Sofa feature + if (isInternalFeature(feat)) { + continue; + } + + // No need to write begin / end, they are already on the text annotation + if (CAS.FEATURE_FULL_NAME_BEGIN.equals(feat.getName()) || + CAS.FEATURE_FULL_NAME_END.equals(feat.getName())) { + continue; + } + + // No need to write link endpoints again, they are already on the relation 
annotation + RelationParam relParam = parsedRelationTypes.get(aFS.getType().getName()); + if (relParam != null) { + if (relParam.getArg1().equals(feat.getShortName()) + || relParam.getArg2().equals(feat.getShortName())) { + continue; + } + } + + if (feat.getRange().isPrimitive()) { + writePrimitiveAttribute(aAnno, aFS, feat); + } + // The following warning is not relevant for event annotations because these render such + // features as slots. + else if (!(aAnno instanceof BratEventAnnotation)) { + warnings.add( + "Unable to render feature [" + feat.getName() + "] with range [" + + feat.getRange().getName() + "] as attribute"); + } + } + } + + private void writeSlots(BratAnnotationDocument aDoc, BratEventAnnotation aEvent, + FeatureStructure aFS) + { + String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); + String type = getBratType(aFS.getType()); + + assert type.equals(aEvent.getType()); + + BratEventAnnotationDecl decl = conf.getEventDecl(type); + if (decl == null) { + decl = new BratEventAnnotationDecl(superType, type); + conf.addEventDecl(decl); + } + + Map> slots = new LinkedHashMap<>(); + for (Feature feat : aFS.getType().getFeatures()) { + if (!isSlotFeature(aFS, feat)) { + continue; + } + String slot = feat.getShortName(); + + List args = slots.get(slot); + if (args == null) { + args = new ArrayList<>(); + slots.put(slot, args); + } + + if ( + FSUtil.isMultiValuedFeature(aFS, feat) + // this can only be true for array types + && feat.getRange().getComponentType() != null + // Avoid calling getParent on TOP + && !CAS.TYPE_NAME_TOP.equals(feat.getRange().getComponentType().getName()) + && CAS.TYPE_NAME_TOP.equals(aFS.getCAS().getTypeSystem() + .getParent(feat.getRange().getComponentType()).getName()) + && (feat.getRange().getComponentType().getFeatureByBaseName("target") != null) + && (feat.getRange().getComponentType().getFeatureByBaseName("role") != null) + ) { + // Handle WebAnno-style slot links + // FIXME It would be 
better if the link type could be configured, e.g. what + // is the name of the link feature and what is the name of the role feature... + // but right now we just keep it hard-coded to the values that are used + // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno + BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, + BratConstants.CARD_ZERO_OR_MORE); + decl.addSlot(slotDecl); + + FeatureStructure[] links = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); + if (links != null) { + for (FeatureStructure link : links) { + FeatureStructure target = FSUtil.getFeature(link, "target", + FeatureStructure.class); + Feature roleFeat = link.getType().getFeatureByBaseName("role"); + BratEventArgument arg = new BratEventArgument(slot, args.size(), + spanIdMap.get(target)); + args.add(arg); + + // Attach the role attribute to the target span + BratAnnotation targetAnno = aDoc.getAnnotation(spanIdMap.get(target)); + writePrimitiveAttribute(targetAnno, link, roleFeat); + } + } + } + else if (FSUtil.isMultiValuedFeature(aFS, feat)) { + // Handle normal multi-valued features + BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, + BratConstants.CARD_ZERO_OR_MORE); + decl.addSlot(slotDecl); + + FeatureStructure[] targets = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); + if (targets != null) { + for (FeatureStructure target : targets) { + BratEventArgument arg = new BratEventArgument(slot, args.size(), + spanIdMap.get(target)); + args.add(arg); + } + } + } + else { + // Handle normal single-valued features + BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, + BratConstants.CARD_OPTIONAL); + decl.addSlot(slotDecl); + + FeatureStructure target = FSUtil.getFeature(aFS, feat, FeatureStructure.class); + if (target != null) { + BratEventArgument arg = new BratEventArgument(slot, args.size(), + spanIdMap.get(target)); + args.add(arg); + } + } + } + + aEvent.setArguments(slots.values().stream().flatMap(args -> 
args.stream()) + .collect(Collectors.toList())); + } + + private boolean isSlotFeature(FeatureStructure aFS, Feature aFeature) + { + return !isInternalFeature(aFeature) + && (FSUtil.isMultiValuedFeature(aFS, aFeature) || !aFeature.getRange() + .isPrimitive()); + } + + + + + private boolean isInternalFeature(Feature aFeature) + { + // https://issues.apache.org/jira/browse/UIMA-4565 + return "uima.cas.AnnotationBase:sofa".equals(aFeature.getName()); + // return CAS.FEATURE_FULL_NAME_SOFA.equals(aFeature.getName()); + } + + private void writePrimitiveAttribute(BratAnnotation aAnno, FeatureStructure aFS, Feature feat) + { + String featureValue = aFS.getFeatureValueAsString(feat); + + // Do not write attributes with null values unless this is explicitly enabled + if (featureValue == null && !writeNullAttributes) { + return; + } + + String attributeName = shortAttributeNames ? feat.getShortName() + : aAnno.getType() + '_' + feat.getShortName(); + + aAnno.addAttribute(nextAttributeId, attributeName, featureValue); + nextAttributeId++; + + // Do not write certain values to the visual/annotation configuration because + // they are not compatible with the brat annotation file format. The values are + // still maintained in the ann file. + if (isValidFeatureValue(featureValue)) { + // Features are inherited to subtypes in UIMA. By storing the attribute under + // the name of the type that declares the feature (domain) instead of the name + // of the actual instance we are processing, we make sure not to maintain + // multiple value sets for the same feature. + BratAttributeDecl attrDecl = conf.addAttributeDecl( + aAnno.getType(), + getAllSubtypes(aFS.getCAS().getTypeSystem(), feat.getDomain()), + attributeName, featureValue); + conf.addDrawingDecl(attrDecl); + } + } + + // This generates lots of types as well that we may not otherwise have in declared in the + // brat configuration files, but brat doesn't seem to mind. 
+ private Set getAllSubtypes(TypeSystem aTS, Type aType) + { + Set types = new LinkedHashSet<>(); + aTS.getProperlySubsumedTypes(aType).stream().forEach(t -> types.add(getBratType(t))); + return types; + } + + /** + * Some feature values do not need to be registered or cannot be registered because brat does + * not support them. + */ + private boolean isValidFeatureValue(String aFeatureValue) + { + // https://github.com/nlplab/brat/issues/1149 + return !(aFeatureValue == null || aFeatureValue.length() == 0 || aFeatureValue.equals(",")); + } + + private BratTextAnnotation splitNewline(AnnotationFS aFS) + { + + // extract all but newlines as groups + Matcher m = NEWLINE_EXTRACT_PATTERN.matcher(aFS.getCoveredText()); + List offsets = new ArrayList<>(); + while (m.find()) { + Offsets offset = new Offsets(m.start(1) + aFS.getBegin(), m.end(1) + aFS.getBegin() ); + offsets.add(offset); + } + // replaces any group of newline by one space + String[] texts = new String[] { aFS.getCoveredText().replaceAll("\\R+", " ") }; + return new BratTextAnnotation(nextTextAnnotationId, getBratType(aFS.getType()), offsets, + texts); + } + + private String getBratType(Type aType) + { + if (typeMapping != null) { + return typeMapping.getBratType(aType); + } + else { + return aType.getName().replace('.', '-'); + } + } +} From 6672b6c73a2467eb5821781f958f6a742124073a Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 19 Mar 2019 17:07:06 +0100 Subject: [PATCH 15/18] #1338 - Factor CAS <-> brat conversion code into Pojos - Adjusted dependencies --- dkpro-core-io-brat-asl/pom.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dkpro-core-io-brat-asl/pom.xml b/dkpro-core-io-brat-asl/pom.xml index b58d182682..a19e49701f 100644 --- a/dkpro-core-io-brat-asl/pom.xml +++ b/dkpro-core-io-brat-asl/pom.xml @@ -46,6 +46,14 @@ org.apache.commons commons-lang3 + + commons-logging + commons-logging-api + + + de.tudarmstadt.ukp.dkpro.core + 
de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.springframework spring-core From b7ab1ed62466fbbb852f8c97bdb888e2be10c2eb Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 10 Apr 2019 20:39:42 +0200 Subject: [PATCH 16/18] #1325 - Avoid datasets being extracted outside their target directory - Fixing endless loop when extracting --- .../ukp/dkpro/core/api/datasets/internal/actions/Explode.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java index 53e95fff66..ea12f9aea4 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java +++ b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java @@ -150,7 +150,7 @@ private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = Paths.get(getBase(aArchive.toString())).toAbsolutePath(); + Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -197,7 +197,7 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea throws IOException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = Paths.get(getBase(aArchive.toString())).toAbsolutePath(); + Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? 
(int) cfg.get("strip") : 0; From 0371975bab12f8f94e2f3c655922a360c76b7519 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 10 Apr 2019 21:17:10 +0200 Subject: [PATCH 17/18] #1325 - Avoid datasets being extracted outside their target directory - Fix getting the archive base (i.e. the path without the extension) --- .../datasets/internal/actions/Explode.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java index ea12f9aea4..57628253d1 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java +++ b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java @@ -39,6 +39,8 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import com.github.junrar.Archive; import com.github.junrar.exception.RarException; @@ -54,6 +56,8 @@ public class Explode extends Action_ImplBase { + private final Log LOG = LogFactory.getLog(getClass()); + @Override public void apply(ActionDescription aAction, DatasetDescription aDataset, ArtifactDescription aPack, Path aCachedFile) @@ -103,7 +107,7 @@ private void extract7z(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. 
- Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); + Path base = getPathWithoutFileExtension(aArchive).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -150,7 +154,7 @@ private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); + Path base = getPathWithoutFileExtension(aArchive).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -158,6 +162,9 @@ private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), coerceToList(cfg.get("excludes"))); + LOG.info("Extracting files of [" + aArchive.getFileName() + "] to [" + aTarget.resolve(base) + + "]"); + try (Archive archive = new Archive(new FileVolumeManager(aArchive.toFile()))) { FileHeader fh = archive.nextFileHeader(); while (fh != null) { @@ -197,7 +204,7 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea throws IOException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = Paths.get(getBase(aArchive.getFileName().toString())).toAbsolutePath(); + Path base = getPathWithoutFileExtension(aArchive).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? 
(int) cfg.get("strip") : 0; @@ -205,6 +212,9 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), coerceToList(cfg.get("excludes"))); + LOG.info("Extracting files of [" + aArchive.getFileName() + "] to [" + aTarget.resolve(base) + + "]"); + ArchiveEntry entry = null; while ((entry = aAStream.getNextEntry()) != null) { String name = stripLeadingFolders(entry.getName(), strip); @@ -254,14 +264,14 @@ private String stripLeadingFolders(String aName, int aLevels) } } - public static String getBase(String aFilename) + public static Path getPathWithoutFileExtension(Path aFilename) { // We always extract archives into a subfolder. Figure out the name of the folder. - String base = aFilename; + String base = aFilename.getFileName().toString(); while (base.contains(".")) { base = FilenameUtils.removeExtension(base); } - return base; + return aFilename.getParent().resolve(base); } @SuppressWarnings("unchecked") From 6bdca97de39983083e400cb8715c019fe508b912 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 10 Apr 2019 22:14:54 +0200 Subject: [PATCH 18/18] #1325 - Avoid datasets being extracted outside their target directory - Fixing extraction path ... 
again --- .../datasets/internal/actions/Explode.java | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java index 57628253d1..a07fff7ad3 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java +++ b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java @@ -107,7 +107,7 @@ private void extract7z(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = getPathWithoutFileExtension(aArchive).toAbsolutePath(); + Path base = aTarget.resolve(getPathWithoutFileExtension(aArchive)).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? 
(int) cfg.get("strip") : 0; @@ -115,6 +115,9 @@ private void extract7z(ActionDescription aAction, Path aArchive, Path aTarget) AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), coerceToList(cfg.get("excludes"))); + LOG.info("Extracting files of [" + aArchive.getFileName() + "] to [" + aTarget.resolve(base) + + "]"); + try (SevenZFile archive = new SevenZFile(aArchive.toFile())) { SevenZArchiveEntry entry = archive.getNextEntry(); while (entry != null) { @@ -126,7 +129,7 @@ private void extract7z(ActionDescription aAction, Path aArchive, Path aTarget) } if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name).toAbsolutePath(); + Path out = base.resolve(name).toAbsolutePath(); if (!out.startsWith(base)) { throw new IOException( "Archive tries to generate file outside target folder: [" + name @@ -154,7 +157,7 @@ private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) throws IOException, RarException { // We always extract archives into a subfolder. Figure out the name of the folder. - Path base = getPathWithoutFileExtension(aArchive).toAbsolutePath(); + Path base = aTarget.resolve(getPathWithoutFileExtension(aArchive)).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -176,7 +179,7 @@ private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) } if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name).toAbsolutePath(); + Path out = base.resolve(name).toAbsolutePath(); if (!out.startsWith(base)) { throw new IOException( "Archive tries to generate file outside target folder: [" + name @@ -204,7 +207,7 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea throws IOException { // We always extract archives into a subfolder. Figure out the name of the folder. 
- Path base = getPathWithoutFileExtension(aArchive).toAbsolutePath(); + Path base = aTarget.resolve(getPathWithoutFileExtension(aArchive)).toAbsolutePath(); Map cfg = aAction.getConfiguration(); int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; @@ -225,7 +228,7 @@ private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStrea } if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name).toAbsolutePath(); + Path out = base.resolve(name).toAbsolutePath(); if (!out.startsWith(base)) { throw new IOException( "Archive tries to generate file outside target folder: [" + name + "]"); @@ -264,14 +267,20 @@ private String stripLeadingFolders(String aName, int aLevels) } } - public static Path getPathWithoutFileExtension(Path aFilename) + /** + * The name of the archive without any extensions (e.g. in the case of multiple extensions + * such as .tar.gz). + */ + public static String getPathWithoutFileExtension(Path aFilename) { + + // We always extract archives into a subfolder. Figure out the name of the folder. String base = aFilename.getFileName().toString(); while (base.contains(".")) { base = FilenameUtils.removeExtension(base); } - return aFilename.getParent().resolve(base); + return base; } @SuppressWarnings("unchecked")