-
Notifications
You must be signed in to change notification settings - Fork 7
/
PrefixTokenizer.java
47 lines (40 loc) · 1.72 KB
/
PrefixTokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/*
* Made with all the love in the world
* by scireum in Remshalden, Germany
*
* Copyright by scireum GmbH
* http://www.scireum.de - info@scireum.de
*/
package sirius.biz.mongo;
import sirius.db.text.ChainableTokenProcessor;
import sirius.db.text.DeduplicateProcessor;
import sirius.db.text.PatternReplaceProcessor;
import sirius.db.text.PatternSplitProcessor;
import sirius.db.text.PipelineProcessor;
import sirius.db.text.ToLowercaseProcessor;
import sirius.db.text.TokenLimitProcessor;
import sirius.db.text.Tokenizer;
import java.util.regex.Pattern;
/**
 * Tokenizer which derives the search prefixes used by {@link PrefixSearchableEntity}.
 * <p>
 * The whole behavior is defined by the processor chain assembled in {@link #createProcessor()}.
 */
public class PrefixTokenizer extends Tokenizer {

    /**
     * Matches any single character which is neither a letter, a digit, an underscore, a hyphen nor a dot.
     * <p>
     * This is the pattern handed to the first splitting stage.
     */
    private static final Pattern DISALLOWED_CHARACTER = Pattern.compile("[^\\p{L}\\d_\\-.]");

    /**
     * Matches any single character which is not a letter.
     * <p>
     * This is the pattern handed to the second splitting stage. Note that it also matches the digits,
     * underscores, hyphens and dots which are not matched by {@link #DISALLOWED_CHARACTER}.
     */
    private static final Pattern NON_LETTER_CHARACTER = Pattern.compile("[^\\p{L}]");

    /**
     * Assembles the chain of processors applied by this tokenizer.
     * <p>
     * The stages are handed to the {@link PipelineProcessor} in this order:
     * <ol>
     * <li>the processor obtained from {@link PatternReplaceProcessor#createRemoveControlCharacters()}</li>
     * <li>a {@link PatternSplitProcessor} using {@link #DISALLOWED_CHARACTER}</li>
     * <li>a {@link PatternSplitProcessor} using {@link #NON_LETTER_CHARACTER}</li>
     * <li>a {@link TokenLimitProcessor} configured with 1 and 255</li>
     * <li>a {@link ToLowercaseProcessor}</li>
     * <li>a {@link DeduplicateProcessor}</li>
     * </ol>
     *
     * @return the pipeline which combines all stages listed above
     */
    @Override
    protected ChainableTokenProcessor createProcessor() {
        // NOTE(review): 1 and 255 are presumably the minimal and maximal token length - confirm against
        // TokenLimitProcessor. The meaning of the flag given to DeduplicateProcessor isn't visible here either.
        return new PipelineProcessor(PatternReplaceProcessor.createRemoveControlCharacters(),
                                     createSplitter(DISALLOWED_CHARACTER),
                                     createSplitter(NON_LETTER_CHARACTER),
                                     new TokenLimitProcessor(1, 255),
                                     new ToLowercaseProcessor(),
                                     new DeduplicateProcessor(true));
    }

    /**
     * Creates a splitting stage for the given pattern using the flags shared by both splitting levels.
     *
     * @param pattern the pattern which is passed to the split processor
     * @return a new split processor for the given pattern
     */
    private static PatternSplitProcessor createSplitter(Pattern pattern) {
        // NOTE(review): both flags are set to true for each level; what they control is defined by
        // PatternSplitProcessor and should be verified there.
        return new PatternSplitProcessor(pattern, true, true);
    }
}