Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenization - Prevent detecting plus and ampersand as hard word boundary #655

Merged
merged 4 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/main/java/sirius/db/text/BasicIndexTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class BasicIndexTokenizer extends Tokenizer {
@Override
protected ChainableTokenProcessor createProcessor() {
// ATTENTION: The pipeline used to contain a "new ReduceCharacterProcessor()" stage. This led to the search
// index consisting of normalised strings, unlike earlier verions of sirius-db. Consequently, without migration,
// index consisting of normalised strings, unlike earlier versions of sirius-db. Consequently, without migration,
// existing indices were incompatible. Furthermore, ElasticQueryCompiler did not follow along and continued to
// query for raw strings, hence not finding search terms containing special characters such as umlauts.
// Weighing the two possible options — keeping the normalised index and normalising the search term, or
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/sirius/db/text/BypassProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
* i.e. properly enforce word boundaries when working with processors which emit purges internally.
*/
public class BypassProcessor extends ChainableTokenProcessor {
private ChainableTokenProcessor tokenProcessor;
private AtomicBoolean innerPurge = new AtomicBoolean();
private AtomicBoolean permitPurge = new AtomicBoolean();

private final ChainableTokenProcessor tokenProcessor;
private final AtomicBoolean innerPurge = new AtomicBoolean();
private final AtomicBoolean permitPurge = new AtomicBoolean();

/**
* Creates a new instance for the given processor.
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/sirius/db/text/DeduplicateProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
*/
public class DeduplicateProcessor extends ChainableTokenProcessor {

private Set<String> tokens = new HashSet<>();
private boolean global;
private final Set<String> tokens = new HashSet<>();
private final boolean global;

/**
* Creates a new processor.
Expand Down
62 changes: 33 additions & 29 deletions src/main/java/sirius/db/text/PatternExtractProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,13 @@
public class PatternExtractProcessor extends ChainableTokenProcessor {

private static final Pattern EXTRACT_EMAILS = Pattern.compile("(\\p{Alnum}[^@]++)@(.+)$");
/**
* Matches numbered placeholders like {0}, {1}, {2} etc.
*/
private static final Pattern NUMBERED_PLACEHOLDER = Pattern.compile("\\{(\\d+)}");

private static class ReplacementPattern {
int groupIndex = -1;
String staticString;

ReplacementPattern(String staticString) {
this.staticString = staticString;
}

ReplacementPattern(int groupIndex) {
this.groupIndex = groupIndex;
}

void execute(Matcher matcher, StringBuilder output) {
if (groupIndex > -1) {
output.append(matcher.group(groupIndex));
} else {
output.append(staticString);
}
}
}

private Pattern pattern;
private List<List<ReplacementPattern>> replacements;
private final Pattern pattern;
private final List<List<ReplacementPattern>> replacements;

/**
* Creates a new processor.
Expand Down Expand Up @@ -82,14 +65,14 @@ public static PatternExtractProcessor createEmailExtractor() {

private List<ReplacementPattern> compileReplacementPattern(String input) {
List<ReplacementPattern> result = new ArrayList<>();
Matcher matcher = Pattern.compile("\\{(\\d+)}").matcher(input);
Matcher numberedPlaceholderMatcher = NUMBERED_PLACEHOLDER.matcher(input);
int start = 0;
while (matcher.find(start)) {
if (matcher.start() > start) {
result.add(new ReplacementPattern(input.substring(start, matcher.start())));
while (numberedPlaceholderMatcher.find(start)) {
if (numberedPlaceholderMatcher.start() > start) {
result.add(new ReplacementPattern(input.substring(start, numberedPlaceholderMatcher.start())));
}
result.add(new ReplacementPattern(Integer.parseInt(matcher.group(1))));
start = matcher.end();
result.add(new ReplacementPattern(Integer.parseInt(numberedPlaceholderMatcher.group(1))));
start = numberedPlaceholderMatcher.end();
}

if (start < input.length()) {
Expand Down Expand Up @@ -123,4 +106,25 @@ public void accept(String value) {
emit(value);
}
}

/**
 * Represents a single segment of a compiled replacement.
 * <p>
 * A segment either emits a fixed string or the contents of a capture group of the matcher passed to
 * {@link #execute(Matcher, StringBuilder)}. Instances are immutable once constructed.
 */
private static class ReplacementPattern {

    /**
     * Sentinel stored as group index by segments which emit their static string.
     */
    private static final int NO_GROUP = -1;

    private final int groupIndex;
    private final String staticString;

    /**
     * Creates a segment which always emits the given string.
     *
     * @param staticString the text to append when this segment is executed
     */
    ReplacementPattern(String staticString) {
        this.groupIndex = NO_GROUP;
        this.staticString = staticString;
    }

    /**
     * Creates a segment which emits the capture group with the given index.
     *
     * @param groupIndex the index of the capture group to append when this segment is executed
     */
    ReplacementPattern(int groupIndex) {
        this.groupIndex = groupIndex;
        this.staticString = null;
    }

    /**
     * Appends the output of this segment to the given buffer.
     *
     * @param matcher the matcher to read the capture group from (only used by group segments)
     * @param output  the buffer to append to
     */
    void execute(Matcher matcher, StringBuilder output) {
        // Any index above the sentinel (including 0, the whole match) selects a capture group.
        if (groupIndex > NO_GROUP) {
            output.append(matcher.group(groupIndex));
        } else {
            output.append(staticString);
        }
    }
}
}
4 changes: 2 additions & 2 deletions src/main/java/sirius/db/text/PatternReplaceProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ public class PatternReplaceProcessor extends ChainableTokenProcessor {

private static final Pattern CONTROL_CHARACTERS = Pattern.compile("\\p{Cntrl}");

private Pattern pattern;
private String replacement;
private final Pattern pattern;
private final String replacement;

/**
* Creates a new processor.
Expand Down
10 changes: 5 additions & 5 deletions src/main/java/sirius/db/text/PatternSplitProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
*/
public class PatternSplitProcessor extends ChainableTokenProcessor {

private static final Pattern HARD_BOUNDARY = Pattern.compile("[^\\p{L}\\d_\\-.,:/\\\\@ ]");
private static final Pattern HARD_BOUNDARY = Pattern.compile("[^\\p{L}\\d_\\-.,:/\\\\@+& ]");
private static final Pattern SOFT_BOUNDARY = Pattern.compile("[^\\p{L}\\d]");
private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}");

private Pattern pattern;
private boolean preserveOriginal;
private boolean purge;
private final Pattern pattern;
private final boolean preserveOriginal;
private final boolean purge;

/**
* Creates a new processor.
Expand All @@ -41,7 +41,7 @@ public PatternSplitProcessor(Pattern pattern, boolean preserveOriginal, boolean
/**
* Creates a processor which splits at hard "word" boundaries.
* <p>
* This are all punctuation symbols other than <tt>/ , . : \ _ -</tt>
* These are all punctuation symbols other than <tt>/ , . : \ _ - @ + &amp;</tt>
*
* @return a new processor which splits at hard token boundaries. This will {@link #purge()} after each sub token
* being emitted
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/sirius/db/text/PipelineProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
*/
public class PipelineProcessor extends ChainableTokenProcessor {

private List<ChainableTokenProcessor> processors;
private final List<ChainableTokenProcessor> processors;

/**
* Creates a new processor.
Expand All @@ -36,8 +36,8 @@ public PipelineProcessor(List<ChainableTokenProcessor> processors) {
*/
public PipelineProcessor(Stream<ChainableTokenProcessor> processorsStream) {
this.processors = processorsStream.filter(Objects::nonNull).toList();
for (int i = 0; i < processors.size() - 1; i++) {
processors.get(i).chain(processors.get(i + 1));
for (int processorIndex = 0; processorIndex < processors.size() - 1; processorIndex++) {
processors.get(processorIndex).chain(processors.get(processorIndex + 1));
}
}

Expand Down
39 changes: 0 additions & 39 deletions src/test/java/sirius/db/text/BasicIndexTokenizerTest.java

This file was deleted.

30 changes: 0 additions & 30 deletions src/test/java/sirius/db/text/PatternExtractProcessorTest.java

This file was deleted.

22 changes: 0 additions & 22 deletions src/test/java/sirius/db/text/PatternReplaceProcessorTest.java

This file was deleted.

32 changes: 0 additions & 32 deletions src/test/java/sirius/db/text/PatternSplitProcessorTest.java

This file was deleted.

38 changes: 0 additions & 38 deletions src/test/java/sirius/db/text/PipelineProcessorTest.java

This file was deleted.

24 changes: 0 additions & 24 deletions src/test/java/sirius/db/text/ReduceCharactersProcessorTest.java

This file was deleted.

23 changes: 0 additions & 23 deletions src/test/java/sirius/db/text/ToLowercaseProcessorTest.java

This file was deleted.

Loading