fix(agents-api): init nlp pipeline text-search #1045

Open: wants to merge 31 commits into base: dev

Changes shown from 16 of 31 commits

Commits
e466dfe
fix(agetns-api): init nlp pipeline text-search
Vedantsahai18 Jan 12, 2025
ba39b54
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 12, 2025
8c3d6be
chore: misc update
Vedantsahai18 Jan 13, 2025
1d677a2
feat(test): add new embeddings + FTS tests
Vedantsahai18 Jan 13, 2025
67fc92d
fix: Remove unused function as the conversion is done by postgres query
whiterabbit1983 Jan 13, 2025
d21f980
refactor: Lint agents-api (CI)
whiterabbit1983 Jan 13, 2025
8b19c96
fix: Remove unused function as the conversion is done by postgres query
whiterabbit1983 Jan 13, 2025
70a78b3
fix(agents-api): fixed nlp pipeline for FTS
Vedantsahai18 Jan 13, 2025
ab8e3b7
chore(tests): added test for the nlp utility + FTS search
Vedantsahai18 Jan 13, 2025
41ae093
chore: misc code refactor
Vedantsahai18 Jan 13, 2025
1b02a79
Merge branch 'dev' into x/rag-search
Vedantsahai18 Jan 13, 2025
25a2e65
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 13, 2025
fd2481e
chore: misc fix
Vedantsahai18 Jan 13, 2025
1a7eca2
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 13, 2025
7c99628
chore: test fix + add embedding vector generation based on the confid…
Vedantsahai18 Jan 13, 2025
cb86135
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 13, 2025
61d32bd
fix(agents-api): Configure spacy for postgresql
Ahmad-mtos Jan 14, 2025
890880b
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 14, 2025
27ed1f4
chore: misc refactor
Vedantsahai18 Jan 14, 2025
2f161f7
Merge branch 'dev' into x/rag-search
Vedantsahai18 Jan 14, 2025
6a07a54
Update agents-api/agents_api/common/nlp.py
Vedantsahai18 Jan 14, 2025
68a7a05
fix(agents-api): add split chunks option + nlp tests
Ahmad-mtos Jan 15, 2025
3fa200c
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 15, 2025
2c25490
chore(agents-api): utilize ``text_to_tsvector_query`` in search queries
Ahmad-mtos Jan 15, 2025
363c7c6
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 15, 2025
e73d786
Merge branch 'x/rag-search' into x/rag-search-nlp
Ahmad-mtos Jan 15, 2025
9eb018f
chore(agents-api): remove clean parameter from ``extract_keywords``
Ahmad-mtos Jan 15, 2025
9df8de4
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 15, 2025
8fe87cb
fix(agents-api): increase test coverage + set ``split_cuncks=Ture`` a…
Ahmad-mtos Jan 15, 2025
fcd2ad3
tests hotfix
Ahmad-mtos Jan 15, 2025
e5e24aa
Merge pull request #1055 from julep-ai/x/rag-search-nlp
Ahmad-mtos Jan 15, 2025
125 changes: 61 additions & 64 deletions agents-api/agents_api/common/nlp.py
@@ -94,14 +94,23 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str]

# Extract and filter spans in a single pass
ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels]
chunk_spans = [chunk for chunk in doc.noun_chunks if not chunk.root.is_stop]
# Add more comprehensive stopword filtering for noun chunks
chunk_spans = [
chunk
for chunk in doc.noun_chunks
if not chunk.root.is_stop and not all(token.is_stop for token in chunk)
]
all_spans = filter_spans(ent_spans + chunk_spans)

# Process spans efficiently
# Process spans efficiently and filter out spans that are entirely stopwords
keywords = []
seen_texts = set()

for span in all_spans:
# Skip if all tokens in span are stopwords
if all(token.is_stop for token in span):
continue

text = span.text.strip()
lower_text = text.lower()

@@ -180,105 +189,84 @@ def union(u: str, v: str) -> None:
return list(groups.values())


def build_query_pattern(group_size: int, n: int) -> str:
"""Cache query patterns for common group sizes."""
if group_size == 1:
return '"{}"'
return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")"


def build_query(groups: list[set[str]], n: int = 10) -> str:
"""Build query with cached patterns."""
clauses = []

for group in groups:
if len(group) == 1:
clauses.append(f'"{next(iter(group))}"')
else:
# Sort by length descending to prioritize longer phrases
sorted_group = sorted(group, key=len, reverse=True)
# Get cached pattern and format with keywords
pattern = build_query_pattern(len(group), n)
clause = pattern.format(*sorted_group)
clauses.append(clause)

return " OR ".join(clauses)


@lru_cache(maxsize=100)
def paragraph_to_custom_queries(
@lru_cache(maxsize=1000)
def text_to_tsvector_query(
paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1
) -> list[str]:
) -> str:
"""
Optimized paragraph processing with minimal behavior changes.
Added min_keywords parameter to filter out low-value queries.
Extracts meaningful keywords/phrases from text and joins them with OR.

Example:
Input: "I like basketball especially Michael Jordan"
Output: "basketball OR Michael Jordan"

Args:
paragraph (str): The input paragraph to convert.
top_n (int): Number of top keywords to extract per sentence.
proximity_n (int): The proximity window for NEAR/n.
min_keywords (int): Minimum number of keywords required to form a query.
paragraph (str): The input text to process
top_n (int): Number of top keywords to extract per sentence
proximity_n (int): The proximity window for grouping related keywords
min_keywords (int): Minimum number of keywords required

Returns:
list[str]: The list of custom query strings.
str: Keywords/phrases joined by OR
"""
if not paragraph or not paragraph.strip():
return []
return ""

# Process entire paragraph once
doc = nlp(paragraph)
queries = []
queries = set() # Use set to avoid duplicates

# Process sentences
for sent in doc.sents:
# Convert to doc for consistent API
sent_doc = sent.as_doc()

# Extract and clean keywords
# Extract keywords
keywords = extract_keywords(sent_doc, top_n)
if len(keywords) < min_keywords:
continue

# Find keyword positions using matcher
# Find keyword positions
keyword_positions = keyword_matcher.find_matches(sent_doc, keywords)

# Skip if no keywords found in positions
if not keyword_positions:
continue

# Find proximity groups and build query
# Group related keywords by proximity
groups = find_proximity_groups(keywords, keyword_positions, proximity_n)
query = build_query(groups, proximity_n)

if query:
queries.append(query)
# Add each group as a single term to our set
for group in groups:
if len(group) > 1:
# Sort by length descending to prioritize longer phrases
sorted_group = sorted(group, key=len, reverse=True)
# For truly proximate multi-word groups, group words
queries.add(" OR ".join(sorted_group))
else:
# For non-proximate words or single words, add them separately
queries.update(group)

return queries
# Join all terms with " OR "
return " OR ".join(queries) if queries else ""


def batch_paragraphs_to_custom_queries(
def batch_text_to_tsvector_queries(
paragraphs: list[str],
top_n: int = 10,
proximity_n: int = 10,
min_keywords: int = 1,
n_process: int = 1,
) -> list[list[str]]:
) -> list[str]:
"""
Processes multiple paragraphs using nlp.pipe for better performance.

Args:
paragraphs (list[str]): list of paragraphs to process.
top_n (int): Number of top keywords to extract per sentence.
proximity_n (int): The proximity window for NEAR/n.
min_keywords (int): Minimum number of keywords required to form a query.
n_process (int): Number of processes to use for multiprocessing.
paragraphs (list[str]): List of paragraphs to process
top_n (int): Number of top keywords to include per paragraph

Returns:
list[list[str]]: A list where each element is a list of queries for a paragraph.
list[str]: List of tsquery strings
"""
results = []

for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process):
queries = []
queries = set() # Use set to avoid duplicates
for sent in doc.sents:
sent_doc = sent.as_doc()
keywords = extract_keywords(sent_doc, top_n)
@@ -288,9 +276,18 @@ def batch_paragraphs_to_custom_queries(
if not keyword_positions:
continue
groups = find_proximity_groups(keywords, keyword_positions, proximity_n)
query = build_query(groups, proximity_n)
if query:
queries.append(query)
results.append(queries)
# Add each group as a single term to our set
for group in groups:
if len(group) > 1:
# Sort by length descending to prioritize longer phrases
sorted_group = sorted(group, key=len, reverse=True)
# For truly proximate multi-word groups, group words
queries.add(" OR ".join(sorted_group))
else:
# For non-proximate words or single words, add them separately
queries.update(group)

# Join all terms with " OR "
results.append(" OR ".join(queries) if queries else "")

return results
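And a sketch of the batch variant, which pushes all paragraphs through `nlp.pipe` in a single pass; `n_process` can be raised to spread the work across processes (outputs are illustrative):

```python
from agents_api.common.nlp import batch_text_to_tsvector_queries

paragraphs = [
    "I like basketball especially Michael Jordan",
    "PostgreSQL full-text search works on tsvector columns",
]
# Returns one OR-joined keyword string per input paragraph, in order.
queries = batch_text_to_tsvector_queries(paragraphs, top_n=10, n_process=1)
for q in queries:
    print(q)
```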
2 changes: 2 additions & 0 deletions agents-api/agents_api/queries/docs/search_docs_by_text.py
@@ -60,6 +60,8 @@ async def search_docs_by_text(
# Extract owner types and IDs
owner_types: list[str] = [owner[0] for owner in owners]
owner_ids: list[str] = [str(owner[1]) for owner in owners]
# Pre-process rawtext query
# query = text_to_tsvector_query(query)

return (
search_docs_text_query,
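If the commented-out pre-processing step above were enabled, the raw query text would be rewritten into the OR-joined keyword form before being handed to the SQL text-search query. A sketch of that wiring (the helper name is hypothetical, and the fallback to the raw text is an assumption, not part of this PR):

```python
from agents_api.common.nlp import text_to_tsvector_query

def preprocess_text_query(raw_query: str) -> str:
    # Rewrite raw user text into OR-joined keywords for Postgres text search;
    # fall back to the raw text if the NLP pipeline extracts nothing.
    processed = text_to_tsvector_query(raw_query)
    return processed or raw_query
```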
53 changes: 53 additions & 0 deletions agents-api/tests/fixtures.py
@@ -42,6 +42,7 @@
from .utils import (
get_localstack,
get_pg_dsn,
make_vector_with_similarity,
)
from .utils import (
patch_embed_acompletion as patch_embed_acompletion_ctx,
@@ -164,6 +165,10 @@ async def test_doc(dsn=pg_dsn, developer=test_developer, agent=test_agent):
@fixture(scope="test")
async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test_doc):
pool = await create_db_pool(dsn=dsn)
embedding_with_confidence_0 = make_vector_with_similarity(d=0.0)
embedding_with_confidence_05 = make_vector_with_similarity(d=0.5)
embedding_with_confidence_05_neg = make_vector_with_similarity(d=-0.5)
embedding_with_confidence_1_neg = make_vector_with_similarity(d=-1.0)
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
Expand All @@ -175,6 +180,54 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test
f"[{', '.join([str(x) for x in [1.0] * 1024])}]",
)

# Insert embedding with confidence 0 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 1, $3, $4)
""",
developer.id,
doc.id,
"Test content 1",
f"[{', '.join([str(x) for x in embedding_with_confidence_0])}]",
)

# Insert embedding with confidence 0.5 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 2, $3, $4)
""",
developer.id,
doc.id,
"Test content 2",
f"[{', '.join([str(x) for x in embedding_with_confidence_05])}]",
)

# Insert embedding with confidence -0.5 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 3, $3, $4)
""",
developer.id,
doc.id,
"Test content 3",
f"[{', '.join([str(x) for x in embedding_with_confidence_05_neg])}]",
)

# Insert embedding with confidence -1 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 4, $3, $4)
""",
developer.id,
doc.id,
"Test content 4",
f"[{', '.join([str(x) for x in embedding_with_confidence_1_neg])}]",
)

yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool)

