From 3d932edc394f6ea995947d1c5588e09f3f3b09fe Mon Sep 17 00:00:00 2001 From: Emma Forman Ling Date: Mon, 6 Jan 2025 15:45:48 -0800 Subject: [PATCH] skip build fuzzy dfa if fuzzy text search disabled (#32818) This skips expensive `build_fuzzy_dfa` call when we check for readset overlaps if fuzzy text search is disabled. Looking up the token in the trie is sufficient for exact matches, and prefix tokens are already constructed in `DocumentTokens`. GitOrigin-RevId: f8ec235035395e468428a294d1cefc583a64096e --- .github/workflows/build_local_backend.yml | 2 +- crates/database/src/reads.rs | 72 +++++++++++-------- .../src/tests/randomized_search_tests.rs | 71 +++++++++--------- crates/search/src/query.rs | 13 +++- 4 files changed, 94 insertions(+), 64 deletions(-) diff --git a/.github/workflows/build_local_backend.yml b/.github/workflows/build_local_backend.yml index edfa319e..be799035 100644 --- a/.github/workflows/build_local_backend.yml +++ b/.github/workflows/build_local_backend.yml @@ -64,4 +64,4 @@ jobs: run: cargo nextest run --no-run --profile ci - name: Run Rust tests - run: cargo nextest run --profile ci + run: DISABLE_FUZZY_TEXT_SEARCH=true cargo nextest run --profile ci diff --git a/crates/database/src/reads.rs b/crates/database/src/reads.rs index ef295d2b..868db18c 100644 --- a/crates/database/src/reads.rs +++ b/crates/database/src/reads.rs @@ -506,6 +506,7 @@ mod tests { PackedDocument, ResolvedDocument, }, + knobs::DISABLE_FUZZY_TEXT_SEARCH, query::search_value_to_bytes, testing::TestIdGenerator, types::{ @@ -637,13 +638,15 @@ mod tests { let read_set = reads.into_read_set(); let id = id_generator.user_generate(&table_name); - assert!(read_set_overlaps( - id, - &read_set, - field_path, - // If "word" is a token, it overlaps. - "Text containing word and other stuff." - )?); + if !*DISABLE_FUZZY_TEXT_SEARCH { + assert!(read_set_overlaps( + id, + &read_set, + field_path, + // If "word" is a token, it overlaps. + "Text containing word and other stuff." + )?); + } assert!(!read_set_overlaps( id, @@ -690,26 +693,28 @@ mod tests { field_path, "Text containing word and other stuff." )?); - assert!(read_set_overlaps( - id, - &read_set, - field_path, - "Text containing shword and other stuff." - )?); - - // This would match if prefix is true. - assert!(!read_set_overlaps( - id, - &read_set, - field_path, - "Text containing wordddd and other stuff." - )?); assert!(!read_set_overlaps( id, &read_set, field_path, "This text doesn't have the keyword." )?); + if !*DISABLE_FUZZY_TEXT_SEARCH { + assert!(read_set_overlaps( + id, + &read_set, + field_path, + "Text containing shword and other stuff." + )?); + + // This would match if prefix is true. + assert!(!read_set_overlaps( + id, + &read_set, + field_path, + "Text containing wordddd and other stuff." + )?); + } Ok(()) } @@ -789,13 +794,24 @@ mod tests { let read_set = reads.into_read_set(); let id = id_generator.user_generate(&table_name); - assert!(read_set_overlaps( - id, - &read_set, - field_path, - // If "word.*" is a token, it overlaps. - "Text containing wordsythings and other stuff." - )?); + if *DISABLE_FUZZY_TEXT_SEARCH { + // Add a test case here + assert!(read_set_overlaps( + id, + &read_set, + field_path, + // If "wrd.*" is a token, it overlaps. + "Text containing wrdsythings and other stuff." + )?); + } else { + assert!(read_set_overlaps( + id, + &read_set, + field_path, + // If "word.*" is a token, it overlaps. + "Text containing wordsythings and other stuff." + )?); + } assert!(!read_set_overlaps( id, diff --git a/crates/database/src/tests/randomized_search_tests.rs b/crates/database/src/tests/randomized_search_tests.rs index 4ef7cb5f..c0e999ee 100644 --- a/crates/database/src/tests/randomized_search_tests.rs +++ b/crates/database/src/tests/randomized_search_tests.rs @@ -21,6 +21,7 @@ use common::{ IndexMetadata, }, floating_point::assert_approx_equal, + knobs::DISABLE_FUZZY_TEXT_SEARCH, pause::PauseController, persistence::Persistence, query::{ @@ -861,10 +862,12 @@ async fn test_fuzzy_mem(rt: TestRuntime) -> anyhow::Result<()> { let mut scenario = Scenario::new(rt).await?; scenario._patch("a", "the quick brow fox", "test").await?; - let results = scenario - ._query_with_scores("brown", None, None, SearchVersion::V2) - .await?; - assert_eq!(results.len(), 1); + if !*DISABLE_FUZZY_TEXT_SEARCH { + let results = scenario + ._query_with_scores("brown", None, None, SearchVersion::V2) + .await?; + assert_eq!(results.len(), 1); + } // Exact match w/o prefix will fail let results = scenario @@ -888,26 +891,28 @@ async fn test_fuzzy_mem(rt: TestRuntime) -> anyhow::Result<()> { assert_eq!(results.len(), 1); // Prefix + fuzzy - let results = scenario - ._query_with_scores("ahhhhh", None, None, SearchVersion::V2) - .await?; - assert_eq!(results.len(), 1); - - // Edit distance 2 - scenario - ._patch("c", "my name is bartholomew", "test") - .await?; - let results = scenario - ._query_with_scores("batholmew runs fast", None, None, SearchVersion::V2) - .await?; - assert_eq!(results.len(), 1); + if !*DISABLE_FUZZY_TEXT_SEARCH { + let results = scenario + ._query_with_scores("ahhhhh", None, None, SearchVersion::V2) + .await?; + assert_eq!(results.len(), 1); + // + // Edit distance 2 + scenario + ._patch("c", "my name is bartholomew", "test") + .await?; + let results = scenario + ._query_with_scores("batholmew runs fast", None, None, SearchVersion::V2) + .await?; + assert_eq!(results.len(), 1); + } Ok(()) } #[convex_macro::test_runtime] async fn test_fuzzy_disk(rt: TestRuntime) -> anyhow::Result<()> { - { + if !*DISABLE_FUZZY_TEXT_SEARCH { let mut scenario = Scenario::new(rt).await?; scenario._patch("key1", "rakeeb wuz here", "test").await?; scenario.backfill().await?; @@ -947,22 +952,24 @@ async fn test_fuzzy_disk_snapshot_shortlist_ids_valid_with_empty_memory_index( // See https://github.com/get-convex/convex/pull/20649 #[convex_macro::test_runtime] async fn unrelated_sentences_are_queryable_after_flush(rt: TestRuntime) -> anyhow::Result<()> { - let mut scenario = Scenario::new(rt).await?; - scenario._patch("key1", "rakeeb wuz here", "test").await?; - scenario - ._patch("key2", "some other sentence", "test") - .await?; - scenario.backfill().await?; + if !*DISABLE_FUZZY_TEXT_SEARCH { + let mut scenario = Scenario::new(rt).await?; + scenario._patch("key1", "rakeeb wuz here", "test").await?; + scenario + ._patch("key2", "some other sentence", "test") + .await?; + scenario.backfill().await?; - let results = scenario - ._query_with_scores("rakeem", None, None, SearchVersion::V2) - .await?; - assert_eq!(results.len(), 1); + let results = scenario + ._query_with_scores("rakeem", None, None, SearchVersion::V2) + .await?; + assert_eq!(results.len(), 1); - let results = scenario - ._query_with_scores("senence", None, None, SearchVersion::V2) - .await?; - assert_eq!(results.len(), 1); + let results = scenario + ._query_with_scores("senence", None, None, SearchVersion::V2) + .await?; + assert_eq!(results.len(), 1); + } Ok(()) } diff --git a/crates/search/src/query.rs b/crates/search/src/query.rs index 538be94c..e2a66259 100644 --- a/crates/search/src/query.rs +++ b/crates/search/src/query.rs @@ -15,6 +15,7 @@ use common::{ PackedDocument, }, index::IndexKeyBytes, + knobs::DISABLE_FUZZY_TEXT_SEARCH, query::search_value_to_bytes, types::{ SubscriberId, @@ -825,9 +826,15 @@ impl SearchTermTries { // notes there), so we can get away with a symmetric search where the dfa's // prefix is always set to false. tokens.for_each_token(path, *prefix, |token| { - let dfa = build_fuzzy_dfa(token, *max_distance, false); - for (values, ..) in trie.intersect(dfa, None) { - result.extend(values.keys().cloned()); + if *DISABLE_FUZZY_TEXT_SEARCH { + if let Some(value) = trie.get(token) { + result.extend(value.keys().cloned()); + } + } else { + let dfa = build_fuzzy_dfa(token, *max_distance, false); + for (values, ..) in trie.intersect(dfa, None) { + result.extend(values.keys().cloned()); + } } }); }