Skip to content

Commit

Permalink
Merge pull request #655 from scireum/feature/sbi/SIRI-906
Browse files Browse the repository at this point in the history
Tokenization - Prevent detecting plus and ampersand as hard word boundary
  • Loading branch information
sabieber authored May 29, 2024
2 parents 9823a30 + d7c1809 commit 6a27652
Show file tree
Hide file tree
Showing 25 changed files with 355 additions and 316 deletions.
2 changes: 1 addition & 1 deletion src/main/java/sirius/db/text/BasicIndexTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class BasicIndexTokenizer extends Tokenizer {
@Override
protected ChainableTokenProcessor createProcessor() {
// ATTENTION: The pipeline used to contain a "new ReduceCharacterProcessor()" stage. This led to the search
// index consisting of normalised strings, unlike earlier verions of sirius-db. Consequently, without migration,
// index consisting of normalised strings, unlike earlier versions of sirius-db. Consequently, without migration,
// existing indices were incompatible. Furthermore, ElasticQueryCompiler did not follow along and continued to
// query for raw strings, hence not finding search terms containing special characters such as umlauts.
// Weighing the two possible options — keeping the normalised index and normalising the search term, or
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/sirius/db/text/BypassProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
* i.e. properly enforce word boundaries when working with processors which emit purges internally.
*/
public class BypassProcessor extends ChainableTokenProcessor {
private ChainableTokenProcessor tokenProcessor;
private AtomicBoolean innerPurge = new AtomicBoolean();
private AtomicBoolean permitPurge = new AtomicBoolean();

private final ChainableTokenProcessor tokenProcessor;
private final AtomicBoolean innerPurge = new AtomicBoolean();
private final AtomicBoolean permitPurge = new AtomicBoolean();

/**
* Creates a new instance for the given processor.
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/sirius/db/text/DeduplicateProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
*/
public class DeduplicateProcessor extends ChainableTokenProcessor {

private Set<String> tokens = new HashSet<>();
private boolean global;
private final Set<String> tokens = new HashSet<>();
private final boolean global;

/**
* Creates a new processor.
Expand Down
62 changes: 33 additions & 29 deletions src/main/java/sirius/db/text/PatternExtractProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,13 @@
public class PatternExtractProcessor extends ChainableTokenProcessor {

private static final Pattern EXTRACT_EMAILS = Pattern.compile("(\\p{Alnum}[^@]++)@(.+)$");
/**
* Matches numbered placeholders like {0}, {1}, {2} etc.
*/
private static final Pattern NUMBERED_PLACEHOLDER = Pattern.compile("\\{(\\d+)}");

/**
* Represents one step of a replacement: either a literal string or a reference to a capture group.
* <p>
* Exactly one of the two constructors is used per instance, which decides what
* {@link #execute(Matcher, StringBuilder)} appends.
*/
private static class ReplacementPattern {
/** Index of the capture group to append; remains -1 when the step was created from a literal string. */
int groupIndex = -1;
/** Literal text to append; remains null when the step was created from a group index. */
String staticString;

/** Creates a step which appends the given literal text. */
ReplacementPattern(String staticString) {
this.staticString = staticString;
}

/** Creates a step which appends the contents of the capture group with the given index. */
ReplacementPattern(int groupIndex) {
this.groupIndex = groupIndex;
}

/**
* Appends the output of this step to the given builder.
*
* @param matcher the matcher whose capture group is read if a group index (greater than -1) was given
* @param output the builder to append to
*/
void execute(Matcher matcher, StringBuilder output) {
if (groupIndex > -1) {
output.append(matcher.group(groupIndex));
} else {
output.append(staticString);
}
}
}

private Pattern pattern;
private List<List<ReplacementPattern>> replacements;
private final Pattern pattern;
private final List<List<ReplacementPattern>> replacements;

/**
* Creates a new processor.
Expand Down Expand Up @@ -82,14 +65,14 @@ public static PatternExtractProcessor createEmailExtractor() {

private List<ReplacementPattern> compileReplacementPattern(String input) {
List<ReplacementPattern> result = new ArrayList<>();
Matcher matcher = Pattern.compile("\\{(\\d+)}").matcher(input);
Matcher numberedPlaceholderMatcher = NUMBERED_PLACEHOLDER.matcher(input);
int start = 0;
while (matcher.find(start)) {
if (matcher.start() > start) {
result.add(new ReplacementPattern(input.substring(start, matcher.start())));
while (numberedPlaceholderMatcher.find(start)) {
if (numberedPlaceholderMatcher.start() > start) {
result.add(new ReplacementPattern(input.substring(start, numberedPlaceholderMatcher.start())));
}
result.add(new ReplacementPattern(Integer.parseInt(matcher.group(1))));
start = matcher.end();
result.add(new ReplacementPattern(Integer.parseInt(numberedPlaceholderMatcher.group(1))));
start = numberedPlaceholderMatcher.end();
}

if (start < input.length()) {
Expand Down Expand Up @@ -123,4 +106,25 @@ public void accept(String value) {
emit(value);
}
}

/**
 * Represents one step of a replacement: either a literal string or a reference to a capture group.
 * <p>
 * Instances are immutable. Which of the two constructors was used decides what
 * {@link #execute(Matcher, StringBuilder)} appends.
 */
private static class ReplacementPattern {

    /** Value of {@link #groupIndex} for steps which append {@link #staticString} instead of a group. */
    private static final int NO_GROUP = -1;

    /** Index of the capture group to append, or {@link #NO_GROUP} for a literal step. */
    final int groupIndex;

    /** Literal text to append; null when the step was created from a group index. */
    final String staticString;

    /**
     * Creates a step which appends the given literal text.
     *
     * @param staticString the text to append
     */
    ReplacementPattern(String staticString) {
        this.groupIndex = NO_GROUP;
        this.staticString = staticString;
    }

    /**
     * Creates a step which appends the contents of a capture group.
     *
     * @param groupIndex the index of the capture group to append
     */
    ReplacementPattern(int groupIndex) {
        this.groupIndex = groupIndex;
        this.staticString = null;
    }

    /**
     * Appends the output of this step to the given builder.
     *
     * @param matcher the matcher whose capture group is read when a group index was given
     * @param output  the builder to append to
     */
    void execute(Matcher matcher, StringBuilder output) {
        if (groupIndex > NO_GROUP) {
            output.append(matcher.group(groupIndex));
        } else {
            output.append(staticString);
        }
    }
}
}
4 changes: 2 additions & 2 deletions src/main/java/sirius/db/text/PatternReplaceProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ public class PatternReplaceProcessor extends ChainableTokenProcessor {

private static final Pattern CONTROL_CHARACTERS = Pattern.compile("\\p{Cntrl}");

private Pattern pattern;
private String replacement;
private final Pattern pattern;
private final String replacement;

/**
* Creates a new processor.
Expand Down
10 changes: 5 additions & 5 deletions src/main/java/sirius/db/text/PatternSplitProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
*/
public class PatternSplitProcessor extends ChainableTokenProcessor {

private static final Pattern HARD_BOUNDARY = Pattern.compile("[^\\p{L}\\d_\\-.,:/\\\\@ ]");
private static final Pattern HARD_BOUNDARY = Pattern.compile("[^\\p{L}\\d_\\-.,:/\\\\@+& ]");
private static final Pattern SOFT_BOUNDARY = Pattern.compile("[^\\p{L}\\d]");
private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}");

private Pattern pattern;
private boolean preserveOriginal;
private boolean purge;
private final Pattern pattern;
private final boolean preserveOriginal;
private final boolean purge;

/**
* Creates a new processor.
Expand All @@ -41,7 +41,7 @@ public PatternSplitProcessor(Pattern pattern, boolean preserveOriginal, boolean
/**
* Creates a processor which splits at hard "word" boundaries.
* <p>
* This are all punctuation symbols other than <tt>/ , . : \ _ -</tt>
* These are all punctuation symbols other than <tt>/ , . : \ _ - @ + &amp;</tt>
*
* @return a new processor which splits at hard token boundaries. This will {@link #purge()} after each sub token
* being emitted
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/sirius/db/text/PipelineProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
*/
public class PipelineProcessor extends ChainableTokenProcessor {

private List<ChainableTokenProcessor> processors;
private final List<ChainableTokenProcessor> processors;

/**
* Creates a new processor.
Expand All @@ -36,8 +36,8 @@ public PipelineProcessor(List<ChainableTokenProcessor> processors) {
*/
public PipelineProcessor(Stream<ChainableTokenProcessor> processorsStream) {
this.processors = processorsStream.filter(Objects::nonNull).toList();
for (int i = 0; i < processors.size() - 1; i++) {
processors.get(i).chain(processors.get(i + 1));
for (int processorIndex = 0; processorIndex < processors.size() - 1; processorIndex++) {
processors.get(processorIndex).chain(processors.get(processorIndex + 1));
}
}

Expand Down
39 changes: 0 additions & 39 deletions src/test/java/sirius/db/text/BasicIndexTokenizerTest.java

This file was deleted.

30 changes: 0 additions & 30 deletions src/test/java/sirius/db/text/PatternExtractProcessorTest.java

This file was deleted.

22 changes: 0 additions & 22 deletions src/test/java/sirius/db/text/PatternReplaceProcessorTest.java

This file was deleted.

32 changes: 0 additions & 32 deletions src/test/java/sirius/db/text/PatternSplitProcessorTest.java

This file was deleted.

38 changes: 0 additions & 38 deletions src/test/java/sirius/db/text/PipelineProcessorTest.java

This file was deleted.

24 changes: 0 additions & 24 deletions src/test/java/sirius/db/text/ReduceCharactersProcessorTest.java

This file was deleted.

23 changes: 0 additions & 23 deletions src/test/java/sirius/db/text/ToLowercaseProcessorTest.java

This file was deleted.

Loading

0 comments on commit 6a27652

Please sign in to comment.