From 4dfffa9d83588b636588978b6a09b842d18f271a Mon Sep 17 00:00:00 2001 From: Duy Do Date: Mon, 20 Feb 2017 13:10:56 +0700 Subject: [PATCH 1/2] upgrade to Elasticsearch 5.2.1 --- pom.xml | 149 +++++++----------- src/main/assemblies/plugin.xml | 12 +- .../analysis/vi/VietnameseAnalyzer.java | 9 +- .../VietnameseAnalysisBinderProcessor.java | 30 ---- .../analysis/VietnameseAnalyzerProvider.java | 16 +- .../analysis/VietnameseTokenizerFactory.java | 12 +- .../analysis/VietnameseIndicesAnalysis.java | 58 ------- .../VietnameseIndicesAnalysisModule.java | 27 ---- .../analysis/vi/AnalysisVietnamesePlugin.java | 34 ++-- src/main/resources/plugin-security.policy | 2 + .../VietnameseAnalysisIntegrationTest.java | 73 --------- .../analysis/VietnameseAnalysisTest.java | 83 ++++++++++ .../analysis/VietnameseAnalysisTests.java | 102 ------------ src/test/resources/log4j.xml | 15 ++ .../index/analysis/vi_analysis.json | 16 +- 15 files changed, 193 insertions(+), 445 deletions(-) delete mode 100644 src/main/java/org/elasticsearch/index/analysis/VietnameseAnalysisBinderProcessor.java delete mode 100644 src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysis.java delete mode 100644 src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysisModule.java delete mode 100644 src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java delete mode 100644 src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTests.java create mode 100644 src/test/resources/log4j.xml diff --git a/pom.xml b/pom.xml index d3471c2..010340f 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-vietnamese - 2.4.1 + 5.0.0 jar elasticsearch-analysis-vietnamese https://github.com/duydo/elasticsearch-analysis-vietnamese/ @@ -16,6 +16,13 @@ repo + + + duydo + Duy Do + http://duydo.me + + 
scm:git:git@github.com:duydo/elasticsearch-analysis-vietnamese.git scm:git:git@github.com:duydo/elasticsearch-analysis-vietnamese.git @@ -23,45 +30,18 @@ UTF-8 - 1.7 - 2.4.1 - 5.5.2 - 1 - true - onerror - INFO + 1.8 + 5.2.1 + 6.4.1 + 4.1.0 + 2.7 - - org.apache.lucene - lucene-test-framework - ${lucene.version} - test - - - org.hamcrest - hamcrest-all - 1.3 - test - - - com.carrotsearch.randomizedtesting - randomizedtesting-runner - 2.1.14 - test - org.elasticsearch elasticsearch ${elasticsearch.version} - test-jar - test - - - org.elasticsearch - elasticsearch - ${elasticsearch.version} - compile + provided commons-io @@ -79,15 +59,28 @@ 4.1.1 - log4j - log4j - 1.2.17 - runtime + org.elasticsearch.test + framework + ${elasticsearch.version} + test - org.elasticsearch - elasticsearch - ${elasticsearch.version} + net.java.dev.jna + jna + ${jna.version} + test + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + test + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + test @@ -106,10 +99,10 @@ org.apache.maven.plugins maven-compiler-plugin - 2.3.2 + 3.3 - 1.6 - 1.6 + 1.8 + 1.8 @@ -127,7 +120,7 @@ maven-assembly-plugin - 2.3 + 2.6 false ${project.build.directory}/releases/ @@ -144,61 +137,35 @@ - + + + org.apache.maven.plugins + maven-surefire-plugin + 2.9 + + true + + com.carrotsearch.randomizedtesting junit4-maven-plugin - 2.0.12 + 2.3.3 + + + + + + + + + - tests + unit-tests test junit4 - - 20 - pipe,warn - true - - - - - - - - - ${tests.jvms} - - - - - - - **/*Tests.class - **/*Test.class - - - **/Abstract*.class - **/*StressTest.class - - - -Xmx512m - -XX:MaxDirectMemorySize=512m - -Des.logger.prefix= - - ${tests.shuffle} - diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index 11fbc4a..54d9df7 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -8,26 +8,24 @@ ${project.basedir}/src/main/resources/plugin-descriptor.properties - + elasticsearch true 
${project.basedir}/src/main/resources/plugin-security.policy - + elasticsearch true + - / + elasticsearch true true - - org.elasticsearch:elasticsearch - - / + elasticsearch true true diff --git a/src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java b/src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java index 9b4dec0..f66c823 100644 --- a/src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java +++ b/src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java @@ -14,15 +14,8 @@ package org.apache.lucene.analysis.vi; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.StopwordAnalyzerBase; -import org.apache.lucene.util.Version; +import org.apache.lucene.analysis.*; -import java.io.Reader; import java.util.Arrays; import java.util.List; diff --git a/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalysisBinderProcessor.java deleted file mode 100644 index 61eab15..0000000 --- a/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalysisBinderProcessor.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ - -package org.elasticsearch.index.analysis; - -/** - * @author duydo - */ -public class VietnameseAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { - @Override - public void processAnalyzers(AnalyzersBindings analyzersBindings) { - analyzersBindings.processAnalyzer("vi_analyzer", VietnameseAnalyzerProvider.class); - } - - @Override - public void processTokenizers(TokenizersBindings tokenizersBindings) { - tokenizersBindings.processTokenizer("vi_tokenizer", VietnameseTokenizerFactory.class); - } -} diff --git a/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java index 60e3bf8..9524389 100644 --- a/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java @@ -15,12 +15,9 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.vi.VietnameseAnalyzer; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.settings.IndexSettingsService; +import org.elasticsearch.index.IndexSettings; /** * @author duydo @@ -28,14 +25,9 @@ public class VietnameseAnalyzerProvider extends AbstractIndexAnalyzerProvider { private final VietnameseAnalyzer analyzer; - @Inject - public VietnameseAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, - @Assisted String name, - @Assisted Settings settings) { - super(index, indexSettingsService.getSettings(), name, settings); - analyzer = new VietnameseAnalyzer( - Analysis.parseStopWords(env, settings, VietnameseAnalyzer.getDefaultStopSet(), true) - ); + public VietnameseAnalyzerProvider(IndexSettings indexSettings, Environment 
environment, String name, Settings settings) { + super(indexSettings, name, settings); + analyzer = new VietnameseAnalyzer(Analysis.parseStopWords(environment, settings, VietnameseAnalyzer.getDefaultStopSet(), true)); } @Override diff --git a/src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java index 1d56e76..c55b208 100644 --- a/src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java @@ -16,12 +16,9 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.vi.VietnameseTokenizer; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.settings.IndexSettingsService; - +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; /** * @author duydo @@ -31,9 +28,8 @@ public class VietnameseTokenizerFactory extends AbstractTokenizerFactory { private final boolean sentenceDetectorEnabled; private final boolean ambiguitiesResolved; - @Inject - public VietnameseTokenizerFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) { - super(index, indexSettingsService.getSettings(), name, settings); + public VietnameseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); sentenceDetectorEnabled = settings.getAsBoolean("sentence_detector", Boolean.FALSE); ambiguitiesResolved = settings.getAsBoolean("ambiguities_resolved", Boolean.FALSE); } diff --git a/src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysis.java 
b/src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysis.java deleted file mode 100644 index 1d88c3d..0000000 --- a/src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysis.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package org.elasticsearch.indices.analysis; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.vi.VietnameseAnalyzer; -import org.apache.lucene.analysis.vi.VietnameseTokenizer; -import org.elasticsearch.common.component.AbstractComponent; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.analysis.AnalyzerScope; -import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; -import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; -import org.elasticsearch.index.analysis.TokenizerFactory; - -import java.io.Reader; - -/** - * Registers indices level analysis components so, if not explicitly configured, will be shared among all indices. 
- * - * @author duydo - */ -public class VietnameseIndicesAnalysis extends AbstractComponent { - @Inject - public VietnameseIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) { - super(settings); - indicesAnalysisService.analyzerProviderFactories().put("vi_analyzer", - new PreBuiltAnalyzerProviderFactory("vi_analyzer", - AnalyzerScope.INDICES, new VietnameseAnalyzer() - ) - ); - indicesAnalysisService.tokenizerFactories().put("vi_tokenizer", - new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "vi_tokenizer"; - } - - @Override - public Tokenizer create() { - return new VietnameseTokenizer(); - } - }) - ); - } -} diff --git a/src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysisModule.java b/src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysisModule.java deleted file mode 100644 index 30ae8cd..0000000 --- a/src/main/java/org/elasticsearch/indices/analysis/VietnameseIndicesAnalysisModule.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ - -package org.elasticsearch.indices.analysis; - -import org.elasticsearch.common.inject.AbstractModule; - -/** - * @author duydo - */ -public class VietnameseIndicesAnalysisModule extends AbstractModule { - @Override - protected void configure() { - bind(VietnameseIndicesAnalysis.class).asEagerSingleton(); - } -} diff --git a/src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisVietnamesePlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisVietnamesePlugin.java index 8488e9c..f077765 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisVietnamesePlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisVietnamesePlugin.java @@ -15,38 +15,30 @@ package org.elasticsearch.plugin.analysis.vi; -import com.google.common.collect.ImmutableList; -import org.elasticsearch.common.inject.Module; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.analysis.AnalysisModule; -import org.elasticsearch.index.analysis.VietnameseAnalysisBinderProcessor; -import org.elasticsearch.indices.analysis.VietnameseIndicesAnalysisModule; +import org.apache.lucene.analysis.Analyzer; +import org.elasticsearch.index.analysis.AnalyzerProvider; +import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.index.analysis.VietnameseAnalyzerProvider; +import org.elasticsearch.index.analysis.VietnameseTokenizerFactory; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; -import java.util.Collection; import java.util.Collections; +import java.util.Map; /** * @author duydo */ -public class AnalysisVietnamesePlugin extends Plugin { +public class AnalysisVietnamesePlugin extends Plugin implements AnalysisPlugin { @Override - public String name() { - return "elasticsearch-analysis-vietnamese"; + public Map> getTokenizers() { + return Collections.singletonMap("vi_tokenizer", 
VietnameseTokenizerFactory::new); } @Override - public String description() { - return "Elasticsearch Vietnamese Analysis Plugin"; - } - - @Override - public Collection nodeModules() { - return Collections.singletonList(new VietnameseIndicesAnalysisModule()); - } - - public void onModule(AnalysisModule module) { - module.addProcessor(new VietnameseAnalysisBinderProcessor()); + public Map>> getAnalyzers() { + return Collections.singletonMap("vi_analyzer", VietnameseAnalyzerProvider::new); } } diff --git a/src/main/resources/plugin-security.policy b/src/main/resources/plugin-security.policy index f611d26..9a5310a 100644 --- a/src/main/resources/plugin-security.policy +++ b/src/main/resources/plugin-security.policy @@ -1,3 +1,5 @@ grant { permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; + permission java.lang.RuntimePermission "accessDeclaredMembers"; + permission java.io.FilePermission "*", "read,write"; }; diff --git a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java deleted file mode 100644 index fd2c5ee..0000000 --- a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ - -package org.elasticsearch.index.analysis; - -/** - * @author duydo - */ - -import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.test.ESIntegTestCase; -import org.junit.Test; - -import java.io.IOException; -import java.util.concurrent.ExecutionException; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.CoreMatchers.notNullValue; - -@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.SUITE) -public class VietnameseAnalysisIntegrationTest extends ESIntegTestCase { - - @Test - public void testVietnameseAnalyzer() throws ExecutionException, InterruptedException { - AnalyzeResponse response = client().admin().indices() - .prepareAnalyze("công nghệ thông tin Việt Nam").setAnalyzer("vi_analyzer") - .execute().get(); - String[] expected = {"công nghệ thông tin", "việt nam"}; - assertThat(response, notNullValue()); - assertThat(response.getTokens().size(), is(2)); - for (int i = 0; i < expected.length; i++) { - assertThat(response.getTokens().get(i).getTerm(), is(expected[i])); - } - } - - @Test - public void testVietnameseAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException { - createIndex("test"); - ensureGreen("test"); - final XContentBuilder mapping = jsonBuilder().startObject() - .startObject("type") - .startObject("properties") - .startObject("foo") - .field("type", "string") - .field("analyzer", "vi_analyzer") - .endObject() - .endObject() - .endObject() - .endObject(); - client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get(); - index("test", "type", "1", "foo", "công nghệ thông tin Việt Nam"); - refresh(); - SearchResponse response = client().prepareSearch("test").setQuery( - 
QueryBuilders.matchQuery("foo", "Việt Nam") - ).execute().actionGet(); - assertThat(response.getHits().getTotalHits(), is(1L)); - } -} \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java new file mode 100644 index 0000000..c4683ff --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java @@ -0,0 +1,83 @@ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.vi.VietnameseAnalyzer; +import org.apache.lucene.analysis.vi.VietnameseTokenizer; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.plugin.analysis.vi.AnalysisVietnamesePlugin; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + +import static org.hamcrest.Matchers.*; +import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents; + +/** + * Created by duydo on 2/19/17. 
+ */ +public class VietnameseAnalysisTest extends ESTestCase { + + public void testSimpleVietnameseAnalysis() throws IOException { + TestAnalysis analysis = createTestAnalysis(); + assertNotNull(analysis); + + TokenizerFactory tokenizerFactory = analysis.tokenizer.get("vi_tokenizer"); + assertNotNull(tokenizerFactory); + assertThat(tokenizerFactory, instanceOf(VietnameseTokenizerFactory.class)); + + NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer"); + assertNotNull(analyzer); + assertThat(analyzer.analyzer(), instanceOf(VietnameseAnalyzer.class)); + + analyzer = analysis.indexAnalyzers.get("my_analyzer"); + assertNotNull(analyzer); + assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class)); + assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(VietnameseTokenizer.class)); + + } + + + public void testVietnameseTokenizer() throws IOException { + TestAnalysis analysis = createTestAnalysis(); + TokenizerFactory tokenizerFactory = analysis.tokenizer.get("vi_tokenizer"); + assertNotNull(tokenizerFactory); + + Tokenizer tokenizer = tokenizerFactory.create(); + assertNotNull(tokenizer); + + tokenizer.setReader(new StringReader("Công nghệ thông tin Việt Nam")); + assertTokenStreamContents(tokenizer, new String[]{"Công nghệ thông tin", "Việt Nam"}); + } + + public void testVietnameseAnalyzer() throws IOException { + TestAnalysis analysis = createTestAnalysis(); + NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer"); + assertNotNull(analyzer); + + TokenStream ts = analyzer.analyzer().tokenStream("test", "Công nghệ thông tin Việt Nam"); + CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + for (String expected : new String[]{"công nghệ thông tin", "việt nam"}) { + assertThat(ts.incrementToken(), equalTo(true)); + assertThat(term.toString(), equalTo(expected)); + } + assertThat(ts.incrementToken(), equalTo(false)); + } + + public TestAnalysis createTestAnalysis() throws 
IOException { + String json = "/org/elasticsearch/index/analysis/vi_analysis.json"; + Settings settings = Settings.builder() + .loadFromStream(json, VietnameseAnalysisTest.class.getResourceAsStream(json)) + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .build(); + Settings nodeSettings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build(); + return createTestAnalysis(new Index("test", "_na_"), nodeSettings, settings, new AnalysisVietnamesePlugin()); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTests.java deleted file mode 100644 index de6a5ae..0000000 --- a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTests.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ - -package org.elasticsearch.index.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.vi.VietnameseAnalyzer; -import org.apache.lucene.analysis.vi.VietnameseTokenizer; -import org.elasticsearch.Version; -import org.elasticsearch.cluster.metadata.IndexMetaData; -import org.elasticsearch.common.inject.Injector; -import org.elasticsearch.common.inject.ModulesBuilder; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.settings.SettingsModule; -import org.elasticsearch.env.Environment; -import org.elasticsearch.env.EnvironmentModule; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexNameModule; -import org.elasticsearch.index.settings.IndexSettingsModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisService; -import org.elasticsearch.plugin.analysis.vi.AnalysisVietnamesePlugin; -import org.elasticsearch.test.ESTestCase; -import org.junit.Test; - -import java.io.IOException; -import java.io.StringReader; - -import static org.elasticsearch.common.settings.Settings.settingsBuilder; -import static org.hamcrest.Matchers.*; - -/** - * @author duydo - */ -public class VietnameseAnalysisTests extends ESTestCase { - @Test - public void testDefaultsVietnameseAnalysis() throws IOException { - AnalysisService analysisService = createAnalysisService(); - - NamedAnalyzer analyzer = analysisService.analyzer("vi_analyzer"); - assertThat(analyzer.analyzer(), instanceOf(VietnameseAnalyzer.class)); - - analyzer = analysisService.analyzer("my_analyzer"); - assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class)); - assertThat(analyzer.analyzer().tokenStream(null, ""), instanceOf(VietnameseTokenizer.class)); - - TokenizerFactory tokenizerFactory = analysisService.tokenizer("vi_tokenizer"); - assertThat(tokenizerFactory, 
instanceOf(VietnameseTokenizerFactory.class)); - - String source = "công nghệ thông tin Việt Nam"; - String[] exptected = new String[]{"công nghệ thông tin", "Việt Nam"}; - - Tokenizer tokenizer = tokenizerFactory.create(); - tokenizer.setReader(new StringReader(source)); - assertSimpleTokenStreamOutput(tokenizer, exptected); - } - - public AnalysisService createAnalysisService() { - Settings settings = settingsBuilder().loadFromSource("org/elasticsearch/index/analysis/vi_analysis.json") - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .build(); - Index index = new Index("test"); - Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), - new EnvironmentModule(new Environment(settings))) - .createInjector(); - AnalysisModule analysisModule = new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)); - new AnalysisVietnamesePlugin().onModule(analysisModule); - Injector injector = new ModulesBuilder().add( - new IndexSettingsModule(index, settings), - new IndexNameModule(index), - analysisModule) - .createChildInjector(parentInjector); - return injector.getInstance(AnalysisService.class); - } - - public static void assertSimpleTokenStreamOutput(TokenStream stream, - String[] expected) throws IOException { - stream.reset(); - CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); - assertThat(termAttr, notNullValue()); - int i = 0; - while (stream.incrementToken()) { - assertThat(expected.length, greaterThan(i)); - assertThat("expected different term at index " + i, expected[i++], equalTo(termAttr.toString())); - } - assertThat("not all tokens produced", i, equalTo(expected.length)); - stream.end(); - stream.close(); - } -} diff --git a/src/test/resources/log4j.xml b/src/test/resources/log4j.xml new file mode 100644 index 0000000..ff8d1f3 --- /dev/null +++ b/src/test/resources/log4j.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/src/test/resources/org/elasticsearch/index/analysis/vi_analysis.json b/src/test/resources/org/elasticsearch/index/analysis/vi_analysis.json index fd0eada..52cc784 100644 --- a/src/test/resources/org/elasticsearch/index/analysis/vi_analysis.json +++ b/src/test/resources/org/elasticsearch/index/analysis/vi_analysis.json @@ -1,12 +1,12 @@ { - "index": { - "analysis": { - "analyzer": { - "my_analyzer": { - "type": "custom", - "tokenizer": "vi_tokenizer" - } - } + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "vi_tokenizer" } + } } + } } \ No newline at end of file From 42b81f7b1900334a29044efefb9ab14e8e45571f Mon Sep 17 00:00:00 2001 From: Duy Do Date: Mon, 20 Feb 2017 15:52:22 +0700 Subject: [PATCH 2/2] add unit test --- README.md | 56 ++++++++++--- pom.xml | 2 +- .../analysis/vi/VietnameseTokenizer.java | 5 +- .../VietnameseAnalysisIntegrationTest.java | 80 +++++++++++++++++++ 4 files changed, 127 insertions(+), 16 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java diff --git a/README.md b/README.md index 5ecfeeb..325c70e 100644 --- a/README.md +++ b/README.md @@ -3,24 +3,63 @@ Vietnamese Analysis Plugin for Elasticsearch Vietnamese Analysis plugin integrates Vietnamese language analysis into Elasticsearch. +The plugin provides the `vi_analyzer` analyzer and `vi_tokenizer` tokenizer. The `vi_analyzer` is composed of the `vi_tokenizer` tokenizer, the `lowercase` and `stop` filter. 
+ + +## Installation on Elasticsearch 5.x + In order to install the plugin, choose a version in [releases](https://github.com/duydo/elasticsearch-analysis-vietnamese/releases) page then run: ```sh -bin/plugin install link/to/binary/version +bin/elasticsearch-plugin install link/to/binary/version ``` Or to build from source, you need to build it with Maven: ```bash mvn clean package -bin/plugin install file:target/releases/elasticsearch-analysis-vietnamese-2.4.1.zip +bin/elasticsearch-plugin install file:target/releases/elasticsearch-analysis-vietnamese-5.2.1.zip ``` -*Notes*: To build the plugin you need to clone and build the [vn-nlp-libararies](https://github.com/duydo/vn-nlp-libraries). The plugin uses [Lê Hồng Phương](http://mim.hus.vnu.edu.vn/phuonglh/) vnTokenizer library. Thanks thầy Lê Hồng Phương for great contribution. +*In order to build the plugin you need to build the [vn-nlp-libraries](https://github.com/duydo/vn-nlp-libraries) first. Thanks to thầy [Lê Hồng Phương](http://mim.hus.vnu.edu.vn/phuonglh/) for his VnTokenizer library.* + + + +## Example +```sh +curl "http://localhost:9200/_analyze?pretty" -d' +{ + "analyzer": "vi_analyzer", + "text": "Công nghệ thông tin Việt Nam" +}' +``` + +Result +```json +{ + "tokens" : [ + { + "token" : "công nghệ thông tin", + "start_offset" : 0, + "end_offset" : 19, + "type" : "word", + "position" : 0 + }, + { + "token" : "việt nam", + "start_offset" : 20, + "end_offset" : 28, + "type" : "name2", + "position" : 1 + } + ] +} +``` |Vietnamese Analysis Plugin|Elasticsearch| |---|---| -| master|2.4.1| +| master|5.2.1| +| 5.2.1|5.2.1| | 2.4.1|2.4.1| | 2.4.0|2.4.0| | 2.3.5|2.3.5| @@ -39,15 +78,6 @@ bin/plugin install file:target/releases/elasticsearch-analysis-vietnamese-2.4.1. | 0.1.1|1.4+| | 0.1|1.3| - -## User guide - -The plugin provides the `vi_analyzer` analyzer and `vi_tokenizer` tokenizer. - -The `vi_analyzer` is built using the `vi_tokenizer` tokenizer, the `lowercase` and `stop` filter.
- - The analyzer analyzes `"công nghệ thông tin Việt Nam"` into `"công nghệ thông tin"` and `"việt nam"` tokens. - License ------- diff --git a/pom.xml b/pom.xml index 010340f..c7fa25c 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-vietnamese - 5.0.0 + 5.2.1 jar elasticsearch-analysis-vietnamese https://github.com/duydo/elasticsearch-analysis-vietnamese/ diff --git a/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java b/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java index 4317191..8800683 100644 --- a/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java +++ b/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java @@ -74,10 +74,11 @@ public VietnameseTokenizer(boolean sentenceDetectorEnabled, boolean ambiguitiesR tokenizer = AccessController.doPrivileged(new PrivilegedAction() { @Override public vn.hus.nlp.tokenizer.Tokenizer run() { - return TokenizerProvider.getInstance().getTokenizer(); + vn.hus.nlp.tokenizer.Tokenizer vnTokenizer = TokenizerProvider.getInstance().getTokenizer(); + vnTokenizer.setAmbiguitiesResolved(ambiguitiesResolved); + return vnTokenizer; } }); - tokenizer.setAmbiguitiesResolved(ambiguitiesResolved); } private void tokenize(Reader input) throws IOException { diff --git a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java new file mode 100644 index 0000000..03af453 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java @@ -0,0 +1,80 @@ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; +import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; +import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; +import org.elasticsearch.action.search.SearchResponse; +import 
org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.plugin.analysis.vi.AnalysisVietnamesePlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.plugins.PluginInfo; +import org.elasticsearch.test.ESIntegTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.concurrent.ExecutionException; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.notNullValue; + +/** + * Created by duydo on 2/20/17. + */ +public class VietnameseAnalysisIntegrationTest extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + return Collections.singleton(AnalysisVietnamesePlugin.class); + } + + public void testPluginIsLoaded() throws Exception { + NodesInfoResponse response = client().admin().cluster().prepareNodesInfo().setPlugins(true).get(); + for (NodeInfo nodeInfo : response.getNodes()) { + boolean pluginFound = false; + for (PluginInfo pluginInfo : nodeInfo.getPlugins().getPluginInfos()) { + if (pluginInfo.getName().equals(AnalysisVietnamesePlugin.class.getName())) { + pluginFound = true; + break; + } + } + assertThat(pluginFound, is(true)); + } + } + + public void testVietnameseAnalyzer() throws ExecutionException, InterruptedException { + AnalyzeResponse response = client().admin().indices() + .prepareAnalyze("công nghệ thông tin Việt Nam").setAnalyzer("vi_analyzer") + .execute().get(); + String[] expected = {"công nghệ thông tin", "việt nam"}; + assertThat(response, notNullValue()); + assertThat(response.getTokens().size(), is(2)); + for (int i = 0; i < expected.length; i++) { + assertThat(response.getTokens().get(i).getTerm(), is(expected[i])); + } + } + + public void testVietnameseAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException { + 
createIndex("test"); + ensureGreen("test"); + final XContentBuilder mapping = jsonBuilder().startObject() + .startObject("type") + .startObject("properties") + .startObject("foo") + .field("type", "text") + .field("analyzer", "vi_analyzer") + .endObject() + .endObject() + .endObject() + .endObject(); + client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get(); + index("test", "type", "1", "foo", "công nghệ thông tin Việt Nam"); + refresh(); + SearchResponse response = client().prepareSearch("test").setQuery( + QueryBuilders.matchQuery("foo", "Việt Nam") + ).execute().actionGet(); + assertThat(response.getHits().getTotalHits(), is(1L)); + } +}