diff --git a/dkpro-core-asl/pom.xml b/dkpro-core-asl/pom.xml index 0e6585510f..c3ee3d645c 100644 --- a/dkpro-core-asl/pom.xml +++ b/dkpro-core-asl/pom.xml @@ -364,6 +364,11 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.jtok-asl 1.9.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-kuromoji-asl + 1.9.0-SNAPSHOT de.tudarmstadt.ukp.dkpro.core @@ -556,6 +561,7 @@ ../dkpro-core-ixa-asl ../dkpro-core-jazzy-asl ../dkpro-core-jtok-asl + ../dkpro-core-kuromoji-asl ../dkpro-core-languagetool-asl ../dkpro-core-langdetect-asl ../dkpro-core-ldweb1t-asl diff --git a/dkpro-core-kuromoji-asl/LICENSE.txt b/dkpro-core-kuromoji-asl/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/dkpro-core-kuromoji-asl/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-kuromoji-asl/pom.xml b/dkpro-core-kuromoji-asl/pom.xml new file mode 100644 index 0000000000..e44f3f15e9 --- /dev/null +++ b/dkpro-core-kuromoji-asl/pom.xml @@ -0,0 +1,60 @@ + + + 4.0.0 + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core-asl + 1.9.0-SNAPSHOT + ../dkpro-core-asl + + org.dkpro.core + dkpro-core-kuromoji-asl + jar + DKPro Core ASL - Kuromoji + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + com.atilika.kuromoji + kuromoji-ipadic + 0.9.0 + + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + + + junit + junit + test + + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.testing-asl + test + + + \ No newline at end of file diff --git a/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java b/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java new file mode 100644 index 0000000000..070bf56bfe --- /dev/null +++ b/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.kuromoji; + +import java.util.List; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; + +import com.atilika.kuromoji.ipadic.Token; +import com.atilika.kuromoji.ipadic.Tokenizer; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +
+/**
+ * Sentence and token segmenter for Japanese text that delegates tokenization to the Kuromoji
+ * tokenizer (IPADIC dictionary). A sentence boundary is placed after every ideographic full
+ * stop ("。"); text remaining after the last full stop forms a final sentence.
+ */
+@TypeCapability(
+        outputs = {
+            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
+            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
+public class KuromojiSegmenter
+    extends SegmenterBase
+{
+    /** Character sequence that terminates a sentence. */
+    private static final String SENTENCE_TERMINATOR = "。";
+
+    /** Created once in {@link #initialize(UimaContext)} and reused for every sentence. */
+    private Tokenizer tokenizer;
+
+    /**
+     * Initializes the base segmenter and creates the Kuromoji tokenizer a single time, instead
+     * of constructing a new tokenizer for each sentence.
+     *
+     * @param aContext
+     *            the UIMA context passed on to the base class.
+     * @throws ResourceInitializationException
+     *             if the base class initialization fails.
+     */
+    @Override
+    public void initialize(UimaContext aContext)
+        throws ResourceInitializationException
+    {
+        super.initialize(aContext);
+        tokenizer = new Tokenizer();
+    }
+
+    /**
+     * Splits the given zone text into sentences at each "。" and segments every sentence.
+     *
+     * @param aJCas
+     *            the CAS receiving the annotations.
+     * @param text
+     *            the text of the zone to segment.
+     * @param zoneBegin
+     *            offset added to positions within {@code text} to obtain annotation offsets.
+     * @throws AnalysisEngineProcessException
+     *             declared by the overridden method; not thrown by this implementation itself.
+     */
+    @Override
+    protected void process(JCas aJCas, String text, int zoneBegin)
+        throws AnalysisEngineProcessException
+    {
+        int sentenceBegin = 0;
+        int sentenceEnd = text.indexOf(SENTENCE_TERMINATOR);
+
+        // indexOf(str, from) yields either -1 or an index >= from, so "found" is the correct
+        // loop condition. The former check "sentenceEnd > sentenceBegin" stopped splitting as
+        // soon as a terminator directly followed the previous one (or opened the zone), and all
+        // remaining text was then lumped into one sentence.
+        while (sentenceEnd >= 0) {
+            String stext = text.substring(sentenceBegin, sentenceEnd + 1);
+
+            processSentence(aJCas, stext, zoneBegin + sentenceBegin);
+
+            sentenceBegin = sentenceEnd + 1;
+            sentenceEnd = text.indexOf(SENTENCE_TERMINATOR, sentenceBegin);
+        }
+
+        // Trailing text without a terminating full stop.
+        if (sentenceBegin < text.length()) {
+            processSentence(aJCas, text.substring(sentenceBegin), zoneBegin + sentenceBegin);
+        }
+    }
+
+    /**
+     * Tokenizes one sentence and creates the token and sentence annotations for it.
+     *
+     * @param aJCas
+     *            the CAS receiving the annotations.
+     * @param text
+     *            the sentence text, optionally ending with "。".
+     * @param zoneBegin
+     *            offset added to positions within {@code text} to obtain annotation offsets.
+     * @return the created sentence, or {@code null} if no token was created for the text.
+     */
+    private Sentence processSentence(JCas aJCas, String text, int zoneBegin)
+    {
+        // The terminating full stop is cut off before tokenization and added back as a token of
+        // its own below.
+        String innerText = text;
+        boolean addFinalToken = false;
+        if (innerText.endsWith(SENTENCE_TERMINATOR)) {
+            innerText = text.substring(0, text.length() - 1);
+            addFinalToken = true;
+        }
+
+        Annotation firstToken = null;
+        Annotation lastToken = null;
+
+        // Nothing to tokenize if the sentence consists of the terminator only.
+        if (!innerText.isEmpty()) {
+            List<Token> tokens = tokenizer.tokenize(innerText);
+
+            for (Token t : tokens) {
+                int begin = t.getPosition() + zoneBegin;
+                Annotation ut = createToken(aJCas, begin, begin + t.getSurface().length());
+
+                // Tokenizer reports whitespace as tokens - we don't add whitespace-only tokens.
+                if (ut == null) {
+                    continue;
+                }
+
+                if (firstToken == null) {
+                    firstToken = ut;
+                }
+
+                lastToken = ut;
+            }
+        }
+
+        if (addFinalToken) {
+            lastToken = createToken(aJCas, zoneBegin + text.length() - 1,
+                    zoneBegin + text.length());
+            // A sentence made up of the terminator alone still gets a sentence annotation, so
+            // that its token is not left outside of any sentence.
+            if (firstToken == null) {
+                firstToken = lastToken;
+            }
+        }
+
+        if (firstToken != null && lastToken != null) {
+            return createSentence(aJCas, firstToken.getBegin(), lastToken.getEnd());
+        }
+        else {
+            return null;
+        }
+    }
+}
diff --git a/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java b/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java new file mode 100644 index 0000000000..096a51a4cc --- /dev/null +++ b/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.dkpro.core.kuromoji; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.kuromoji.KuromojiSegmenter; +import org.junit.Rule; +import org.junit.Test; + +import com.atilika.kuromoji.ipadic.Tokenizer; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; +import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; +
+/**
+ * Tests for {@link KuromojiSegmenter}.
+ */
+public class KuromojiSegmenterTest
+{
+    /**
+     * Diagnostic only: prints the surface form and features Kuromoji reports for a short input
+     * ending in "」[". It makes no assertions and therefore cannot fail on tokenizer output.
+     */
+    @Test
+    public void bug() throws Exception {
+        Tokenizer tokenizer = new Tokenizer();
+        // The element type must be spelled out: iterating a raw List as Kuromoji tokens does
+        // not compile. Fully qualified because the DKPro Token type is imported in this file.
+        List<com.atilika.kuromoji.ipadic.Token> tokens = tokenizer.tokenize("「国宝五城」[");
+        for (com.atilika.kuromoji.ipadic.Token token : tokens) {
+            System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
+        }
+    }
+
+    /**
+     * Checks the tokens produced for a whitespace-separated text without a sentence terminator.
+     */
+    @Test
+    public void testJapanese() throws Exception
+    {
+        JCas jcas = JCasFactory.createText("滧の べ滦榥榜ぶ 廤ま楺獣お 䨣みゅ騪", "ja");
+
+        AnalysisEngine aed = createEngine(KuromojiSegmenter.class);
+        aed.process(jcas);
+
+        String[] tokens = { "滧", "の", "べ", "滦", "榥", "榜", "ぶ", "廤", "ま", "楺", "獣", "お", "䨣", "み",
+                "ゅ", "騪" };
+
+        AssertAnnotations.assertToken(tokens, select(jcas, Token.class));
+    }
+
+    /**
+     * Checks both sentences and tokens for a text consisting of two sentences ending in "。".
+     */
+    @Test
+    public void testJapanese2() throws Exception
+    {
+        JCas jcas = JCasFactory.createText("1993年(平成5年)12月にはユネスコの世界遺産(文化遺産)"
+                + "に登録された[13]。この他、「国宝五城」[注釈 1]や「三名城」、"
+                + "「三大平山城・三大連立式平山城」の一つにも数えられている。", "ja");
+
+        AnalysisEngine aed = createEngine(KuromojiSegmenter.class);
+        aed.process(jcas);
+
+        String[] sentences = {
+                "1993年(平成5年)12月にはユネスコの世界遺産(文化遺産)に登録された[13]。",
+                "この他、「国宝五城」[注釈 1]や「三名城」、「三大平山城・三大連立式平山城」の一つにも数えられている。" };
+
+        // Note the combined token "」[" in the expected output below.
+        String[] tokens = { "1993", "年", "(", "平成", "5", "年", ")", "12", "月", "に", "は", "ユネスコ", "の",
+                "世界", "遺産", "(", "文化", "遺産", ")", "に", "登録", "さ", "れ", "た", "[", "13", "]", "。",
+                "この", "他", "、", "「", "国宝", "五", "城", "」[", "注釈", "1", "]", "や", "「", "三", "名城", "」",
+                "、", "「", "三", "大平山", "城", "・", "三", "大", "連立", "式", "平山", "城", "」", "の", "一つ", "に",
+                "も", "数え", "られ", "て", "いる", "。" };
+
+        AssertAnnotations.assertSentence(sentences, select(jcas, Sentence.class));
+        AssertAnnotations.assertToken(tokens, select(jcas, Token.class));
+    }
+
+    @Rule
+    public DkproTestContext testContext = new DkproTestContext();
+
+}
diff --git a/dkpro-core-kuromoji-asl/src/test/resources/log4j.properties b/dkpro-core-kuromoji-asl/src/test/resources/log4j.properties new file mode 100644 index 0000000000..9f0bdd6149 --- /dev/null +++ b/dkpro-core-kuromoji-asl/src/test/resources/log4j.properties @@ -0,0 +1,12 @@ +log4j.rootLogger=WARN,development + +log4j.appender.development=org.apache.log4j.ConsoleAppender +log4j.appender.development.layout=org.apache.log4j.PatternLayout +log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n + +log4j.logger.de.tudarmstadt.ukp = DEBUG +log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO + +log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader = WARN +log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter = WARN +log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase = WARN \ No newline at end of file