Skip to content

Commit

Permalink
skip build fuzzy dfa if fuzzy text search disabled (#32818)
Browse files Browse the repository at this point in the history
This skips expensive `build_fuzzy_dfa` call when we check for readset overlaps if fuzzy text search is disabled. Looking up the token in the trie is sufficient for exact matches, and prefix tokens are already constructed in `DocumentTokens`.

GitOrigin-RevId: f8ec235035395e468428a294d1cefc583a64096e
  • Loading branch information
emmaling27 authored and Convex, Inc. committed Jan 6, 2025
1 parent 2cd2e71 commit 3d932ed
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_local_backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,4 @@ jobs:
run: cargo nextest run --no-run --profile ci

- name: Run Rust tests
run: cargo nextest run --profile ci
run: DISABLE_FUZZY_TEXT_SEARCH=true cargo nextest run --profile ci
72 changes: 44 additions & 28 deletions crates/database/src/reads.rs
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,7 @@ mod tests {
PackedDocument,
ResolvedDocument,
},
knobs::DISABLE_FUZZY_TEXT_SEARCH,
query::search_value_to_bytes,
testing::TestIdGenerator,
types::{
Expand Down Expand Up @@ -637,13 +638,15 @@ mod tests {
let read_set = reads.into_read_set();
let id = id_generator.user_generate(&table_name);

assert!(read_set_overlaps(
id,
&read_set,
field_path,
// If "word" is a token, it overlaps.
"Text containing word and other stuff."
)?);
if !*DISABLE_FUZZY_TEXT_SEARCH {
assert!(read_set_overlaps(
id,
&read_set,
field_path,
// If "word" is a token, it overlaps.
"Text containing word and other stuff."
)?);
}

assert!(!read_set_overlaps(
id,
Expand Down Expand Up @@ -690,26 +693,28 @@ mod tests {
field_path,
"Text containing word and other stuff."
)?);
assert!(read_set_overlaps(
id,
&read_set,
field_path,
"Text containing shword and other stuff."
)?);

// This would match if prefix is true.
assert!(!read_set_overlaps(
id,
&read_set,
field_path,
"Text containing wordddd and other stuff."
)?);
assert!(!read_set_overlaps(
id,
&read_set,
field_path,
"This text doesn't have the keyword."
)?);
if !*DISABLE_FUZZY_TEXT_SEARCH {
assert!(read_set_overlaps(
id,
&read_set,
field_path,
"Text containing shword and other stuff."
)?);

// This would match if prefix is true.
assert!(!read_set_overlaps(
id,
&read_set,
field_path,
"Text containing wordddd and other stuff."
)?);
}

Ok(())
}
Expand Down Expand Up @@ -789,13 +794,24 @@ mod tests {
let read_set = reads.into_read_set();
let id = id_generator.user_generate(&table_name);

assert!(read_set_overlaps(
id,
&read_set,
field_path,
// If "word.*" is a token, it overlaps.
"Text containing wordsythings and other stuff."
)?);
if *DISABLE_FUZZY_TEXT_SEARCH {
// Add a test case here
assert!(read_set_overlaps(
id,
&read_set,
field_path,
// If "wrd.*" is a token, it overlaps.
"Text containing wrdsythings and other stuff."
)?);
} else {
assert!(read_set_overlaps(
id,
&read_set,
field_path,
// If "word.*" is a token, it overlaps.
"Text containing wordsythings and other stuff."
)?);
}

assert!(!read_set_overlaps(
id,
Expand Down
71 changes: 39 additions & 32 deletions crates/database/src/tests/randomized_search_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use common::{
IndexMetadata,
},
floating_point::assert_approx_equal,
knobs::DISABLE_FUZZY_TEXT_SEARCH,
pause::PauseController,
persistence::Persistence,
query::{
Expand Down Expand Up @@ -861,10 +862,12 @@ async fn test_fuzzy_mem(rt: TestRuntime) -> anyhow::Result<()> {
let mut scenario = Scenario::new(rt).await?;

scenario._patch("a", "the quick brow fox", "test").await?;
let results = scenario
._query_with_scores("brown", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
if !*DISABLE_FUZZY_TEXT_SEARCH {
let results = scenario
._query_with_scores("brown", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
}

// Exact match w/o prefix will fail
let results = scenario
Expand All @@ -888,26 +891,28 @@ async fn test_fuzzy_mem(rt: TestRuntime) -> anyhow::Result<()> {
assert_eq!(results.len(), 1);

// Prefix + fuzzy
let results = scenario
._query_with_scores("ahhhhh", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);

// Edit distance 2
scenario
._patch("c", "my name is bartholomew", "test")
.await?;
let results = scenario
._query_with_scores("batholmew runs fast", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
if !*DISABLE_FUZZY_TEXT_SEARCH {
let results = scenario
._query_with_scores("ahhhhh", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
//
// Edit distance 2
scenario
._patch("c", "my name is bartholomew", "test")
.await?;
let results = scenario
._query_with_scores("batholmew runs fast", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
}

Ok(())
}

#[convex_macro::test_runtime]
async fn test_fuzzy_disk(rt: TestRuntime) -> anyhow::Result<()> {
{
if !*DISABLE_FUZZY_TEXT_SEARCH {
let mut scenario = Scenario::new(rt).await?;
scenario._patch("key1", "rakeeb wuz here", "test").await?;
scenario.backfill().await?;
Expand Down Expand Up @@ -947,22 +952,24 @@ async fn test_fuzzy_disk_snapshot_shortlist_ids_valid_with_empty_memory_index(
// See https://github.com/get-convex/convex/pull/20649
#[convex_macro::test_runtime]
async fn unrelated_sentences_are_queryable_after_flush(rt: TestRuntime) -> anyhow::Result<()> {
let mut scenario = Scenario::new(rt).await?;
scenario._patch("key1", "rakeeb wuz here", "test").await?;
scenario
._patch("key2", "some other sentence", "test")
.await?;
scenario.backfill().await?;
if !*DISABLE_FUZZY_TEXT_SEARCH {
let mut scenario = Scenario::new(rt).await?;
scenario._patch("key1", "rakeeb wuz here", "test").await?;
scenario
._patch("key2", "some other sentence", "test")
.await?;
scenario.backfill().await?;

let results = scenario
._query_with_scores("rakeem", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
let results = scenario
._query_with_scores("rakeem", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);

let results = scenario
._query_with_scores("senence", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
let results = scenario
._query_with_scores("senence", None, None, SearchVersion::V2)
.await?;
assert_eq!(results.len(), 1);
}

Ok(())
}
Expand Down
13 changes: 10 additions & 3 deletions crates/search/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use common::{
PackedDocument,
},
index::IndexKeyBytes,
knobs::DISABLE_FUZZY_TEXT_SEARCH,
query::search_value_to_bytes,
types::{
SubscriberId,
Expand Down Expand Up @@ -825,9 +826,15 @@ impl<T: Clone + Ord> SearchTermTries<T> {
// notes there), so we can get away with a symmetric search where the dfa's
// prefix is always set to false.
tokens.for_each_token(path, *prefix, |token| {
let dfa = build_fuzzy_dfa(token, *max_distance, false);
for (values, ..) in trie.intersect(dfa, None) {
result.extend(values.keys().cloned());
if *DISABLE_FUZZY_TEXT_SEARCH {
if let Some(value) = trie.get(token) {
result.extend(value.keys().cloned());
}
} else {
let dfa = build_fuzzy_dfa(token, *max_distance, false);
for (values, ..) in trie.intersect(dfa, None) {
result.extend(values.keys().cloned());
}
}
});
}
Expand Down

0 comments on commit 3d932ed

Please sign in to comment.