From a647b7ee27f9a93bbbd72124dea380d4434e6e3e Mon Sep 17 00:00:00 2001
From: "leanne.laceybyrne@eliatra.com"
Date: Fri, 11 Oct 2024 16:50:20 +0100
Subject: [PATCH 1/3] adding page letter tokenizer

Signed-off-by: leanne.laceybyrne@eliatra.com
---
 _analyzers/tokenizers/letter-tokenizers.md | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 _analyzers/tokenizers/letter-tokenizers.md

diff --git a/_analyzers/tokenizers/letter-tokenizers.md b/_analyzers/tokenizers/letter-tokenizers.md
new file mode 100644
index 0000000000..0f89860a3b
--- /dev/null
+++ b/_analyzers/tokenizers/letter-tokenizers.md
@@ -0,0 +1,36 @@
+---
+layout: default
+title: Letter Tokenizer
+parent: Tokenizers
+nav_order: 60
+---
+
+# Letter tokenizer
+
+The `letter` tokenizer splits text into words when it finds any character that isn't a letter. It works well for many European languages but struggles with some Asian languages where words aren’t separated by spaces.
+
+
+## Example
+
+Let's use the `letter` tokenizer to process text by breaking it into individual terms whenever it encounters non-letter characters.
+
+
+```json
+POST _analyze
+{
+  "tokenizer": "letter",
+  "text": "Cats 4EVER love chasing butterflies and rainbows!"
+}
+
+```
+{% include copy-curl.html %}
+
+Analyzing the text "Cats 4EVER love chasing butterflies and rainbows!" with the `letter` tokenizer produces the output:
+
+```
+"Cats", "EVER", "love", "chasing", "butterflies", "and", "rainbows"
+```
+
+## Configuration
+
+The letter tokenizer does not have any customizable settings.

From 1dc5390e63af6e843cc55834bfaad6b5043819a1 Mon Sep 17 00:00:00 2001
From: Fanit Kolchina
Date: Fri, 3 Jan 2025 13:00:33 -0500
Subject: [PATCH 2/3] Doc review

Signed-off-by: Fanit Kolchina
---
 _analyzers/tokenizers/index.md             |  2 +-
 _analyzers/tokenizers/letter-tokenizers.md | 36 --------
 _analyzers/tokenizers/letter.md            | 97 ++++++++++++++++++++++
 3 files changed, 98 insertions(+), 37 deletions(-)
 delete mode 100644 _analyzers/tokenizers/letter-tokenizers.md
 create mode 100644 _analyzers/tokenizers/letter.md

diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md
index e5ac796c12..1f9e49c855 100644
--- a/_analyzers/tokenizers/index.md
+++ b/_analyzers/tokenizers/index.md
@@ -2,7 +2,7 @@
 layout: default
 title: Tokenizers
 nav_order: 60
-has_children: false
+has_children: true
 has_toc: false
 redirect_from:
   - /analyzers/tokenizers/index/
diff --git a/_analyzers/tokenizers/letter-tokenizers.md b/_analyzers/tokenizers/letter-tokenizers.md
deleted file mode 100644
index 0f89860a3b..0000000000
--- a/_analyzers/tokenizers/letter-tokenizers.md
+++ /dev/null
@@ -1,36 +0,0 @@
----
-layout: default
-title: Letter Tokenizer
-parent: Tokenizers
-nav_order: 60
----
-
-# Letter tokenizer
-
-The `letter` tokenizer splits text into words when it finds any character that isn't a letter. It works well for many European languages but struggles with some Asian languages where words aren’t separated by spaces.
-
-
-## Example
-
-Let's use the `letter` tokenizer to process text by breaking it into individual terms whenever it encounters non-letter characters.
-
-
-```json
-POST _analyze
-{
-  "tokenizer": "letter",
-  "text": "Cats 4EVER love chasing butterflies and rainbows!"
-}
-
-```
-{% include copy-curl.html %}
-
-Analyzing the text "Cats 4EVER love chasing butterflies and rainbows!" with the `letter` tokenizer produces the output:
-
-```
-"Cats", "EVER", "love", "chasing", "butterflies", "and", "rainbows"
-```
-
-## Configuration
-
-The letter tokenizer does not have any customizable settings.
diff --git a/_analyzers/tokenizers/letter.md b/_analyzers/tokenizers/letter.md
new file mode 100644
index 0000000000..3b8d5babf9
--- /dev/null
+++ b/_analyzers/tokenizers/letter.md
@@ -0,0 +1,97 @@
+---
+layout: default
+title: Letter
+parent: Tokenizers
+nav_order: 60
+---
+
+# Letter tokenizer
+
+The `letter` tokenizer splits text into words on any non-letter characters. It works well for many European languages but struggles with some Asian languages where words aren't separated by spaces.
+
+## Example usage
+
+The following example request creates a new index named `my_index` and configures an analyzer with a `letter` tokenizer:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_letter_analyzer": {
+          "type": "custom",
+          "tokenizer": "letter"
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "my_letter_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST _analyze
+{
+  "tokenizer": "letter",
+  "text": "Cats 4EVER love chasing butterflies!"
+}
+
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "Cats",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "EVER",
+      "start_offset": 6,
+      "end_offset": 10,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "love",
+      "start_offset": 11,
+      "end_offset": 15,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "chasing",
+      "start_offset": 16,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "butterflies",
+      "start_offset": 24,
+      "end_offset": 35,
+      "type": "word",
+      "position": 4
+    }
+  ]
+}
+```

From f9860bc18b65500f6c15c44698ddb177451a54a9 Mon Sep 17 00:00:00 2001
From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
Date: Fri, 3 Jan 2025 13:33:43 -0500
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Nathan Bower
Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
---
 _analyzers/tokenizers/letter.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_analyzers/tokenizers/letter.md b/_analyzers/tokenizers/letter.md
index 3b8d5babf9..ba67a7841d 100644
--- a/_analyzers/tokenizers/letter.md
+++ b/_analyzers/tokenizers/letter.md
@@ -7,7 +7,7 @@ nav_order: 60
 
 # Letter tokenizer
 
-The `letter` tokenizer splits text into words on any non-letter characters. It works well for many European languages but struggles with some Asian languages where words aren't separated by spaces.
+The `letter` tokenizer splits text into words on any non-letter characters. It works well with many European languages but is ineffective with some Asian languages in which words aren't separated by spaces.
 
 ## Example usage
 
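
A quick end-to-end check of the behavior these patches document — a sketch only, not part of the PR, assuming the `my_index` example from PATCH 2/3 has been created. Because the custom analyzer uses the `letter` tokenizer with no token filters, `4EVER` is indexed as the case-sensitive token `EVER`, so a match query for `EVER` finds the document while a query for `4ever` would not:

```json
PUT /my_index/_doc/1?refresh=true
{
  "content": "Cats 4EVER love chasing butterflies!"
}
```

```json
POST /my_index/_search
{
  "query": {
    "match": {
      "content": "EVER"
    }
  }
}
```

The hit is returned because the query text is analyzed with the same `my_letter_analyzer`, producing the token `EVER`, which matches the token emitted at index time; the `standard` tokenizer would instead have kept `4ever` as a single lowercased token, and this query would return no hits.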