fix(agents-api): init nlp pipeline text-search #1045

Open: wants to merge 31 commits into base: dev

Changes shown from 16 of 31 commits

Commits
e466dfe
fix(agetns-api): init nlp pipeline text-search
Vedantsahai18 Jan 12, 2025
ba39b54
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 12, 2025
8c3d6be
chore: misc update
Vedantsahai18 Jan 13, 2025
1d677a2
feat(test): add new embeddings + FTS tests
Vedantsahai18 Jan 13, 2025
67fc92d
fix: Remove unused function as the conversion is done by postgres query
whiterabbit1983 Jan 13, 2025
d21f980
refactor: Lint agents-api (CI)
whiterabbit1983 Jan 13, 2025
8b19c96
fix: Remove unused function as the conversion is done by postgres query
whiterabbit1983 Jan 13, 2025
70a78b3
fix(agents-api): fixed nlp pipeline for FTS
Vedantsahai18 Jan 13, 2025
ab8e3b7
chore(tests): added test for the nlp utility + FTS search
Vedantsahai18 Jan 13, 2025
41ae093
chore: misc code refactor
Vedantsahai18 Jan 13, 2025
1b02a79
Merge branch 'dev' into x/rag-search
Vedantsahai18 Jan 13, 2025
25a2e65
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 13, 2025
fd2481e
chore: misc fix
Vedantsahai18 Jan 13, 2025
1a7eca2
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 13, 2025
7c99628
chore: test fix + add embedding vector generation based on the confid…
Vedantsahai18 Jan 13, 2025
cb86135
refactor: Lint agents-api (CI)
Vedantsahai18 Jan 13, 2025
61d32bd
fix(agents-api): Configure spacy for postgresql
Ahmad-mtos Jan 14, 2025
890880b
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 14, 2025
27ed1f4
chore: misc refactor
Vedantsahai18 Jan 14, 2025
2f161f7
Merge branch 'dev' into x/rag-search
Vedantsahai18 Jan 14, 2025
6a07a54
Update agents-api/agents_api/common/nlp.py
Vedantsahai18 Jan 14, 2025
68a7a05
fix(agents-api): add split chunks option + nlp tests
Ahmad-mtos Jan 15, 2025
3fa200c
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 15, 2025
2c25490
chore(agents-api): utilize ``text_to_tsvector_query`` in search queries
Ahmad-mtos Jan 15, 2025
363c7c6
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 15, 2025
e73d786
Merge branch 'x/rag-search' into x/rag-search-nlp
Ahmad-mtos Jan 15, 2025
9eb018f
chore(agents-api): remove clean parameter from ``extract_keywords``
Ahmad-mtos Jan 15, 2025
9df8de4
refactor: Lint agents-api (CI)
Ahmad-mtos Jan 15, 2025
8fe87cb
fix(agents-api): increase test coverage + set ``split_cuncks=Ture`` a…
Ahmad-mtos Jan 15, 2025
fcd2ad3
tests hotfix
Ahmad-mtos Jan 15, 2025
e5e24aa
Merge pull request #1055 from julep-ai/x/rag-search-nlp
Ahmad-mtos Jan 15, 2025
125 changes: 61 additions & 64 deletions agents-api/agents_api/common/nlp.py
@@ -94,14 +94,23 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str]

# Extract and filter spans in a single pass
ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels]
chunk_spans = [chunk for chunk in doc.noun_chunks if not chunk.root.is_stop]
# Add more comprehensive stopword filtering for noun chunks
chunk_spans = [
chunk
for chunk in doc.noun_chunks
if not chunk.root.is_stop and not all(token.is_stop for token in chunk)
]
all_spans = filter_spans(ent_spans + chunk_spans)

# Process spans efficiently
# Process spans efficiently and filter out spans that are entirely stopwords
keywords = []
seen_texts = set()

for span in all_spans:
# Skip if all tokens in span are stopwords
if all(token.is_stop for token in span):
continue

text = span.text.strip()
lower_text = text.lower()

@@ -180,105 +189,84 @@ def union(u: str, v: str) -> None:
return list(groups.values())


def build_query_pattern(group_size: int, n: int) -> str:
"""Cache query patterns for common group sizes."""
if group_size == 1:
return '"{}"'
return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")"


def build_query(groups: list[set[str]], n: int = 10) -> str:
"""Build query with cached patterns."""
clauses = []

for group in groups:
if len(group) == 1:
clauses.append(f'"{next(iter(group))}"')
else:
# Sort by length descending to prioritize longer phrases
sorted_group = sorted(group, key=len, reverse=True)
# Get cached pattern and format with keywords
pattern = build_query_pattern(len(group), n)
clause = pattern.format(*sorted_group)
clauses.append(clause)

return " OR ".join(clauses)


@lru_cache(maxsize=100)
def paragraph_to_custom_queries(
@lru_cache(maxsize=1000)
def text_to_tsvector_query(
paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1
) -> list[str]:
) -> str:
"""
Optimized paragraph processing with minimal behavior changes.
Added min_keywords parameter to filter out low-value queries.
Extracts meaningful keywords/phrases from text and joins them with OR.

Example:
Input: "I like basketball especially Michael Jordan"
Output: "basketball OR Michael Jordan"

Args:
paragraph (str): The input paragraph to convert.
top_n (int): Number of top keywords to extract per sentence.
proximity_n (int): The proximity window for NEAR/n.
min_keywords (int): Minimum number of keywords required to form a query.
paragraph (str): The input text to process
top_n (int): Number of top keywords to extract per sentence
proximity_n (int): The proximity window for grouping related keywords
min_keywords (int): Minimum number of keywords required

Returns:
list[str]: The list of custom query strings.
str: Keywords/phrases joined by OR
"""
if not paragraph or not paragraph.strip():
return []
return ""

# Process entire paragraph once
doc = nlp(paragraph)
queries = []
queries = set() # Use set to avoid duplicates

# Process sentences
for sent in doc.sents:
# Convert to doc for consistent API
sent_doc = sent.as_doc()

# Extract and clean keywords
# Extract keywords
keywords = extract_keywords(sent_doc, top_n)
if len(keywords) < min_keywords:
continue

# Find keyword positions using matcher
# Find keyword positions
keyword_positions = keyword_matcher.find_matches(sent_doc, keywords)

# Skip if no keywords found in positions
if not keyword_positions:
continue

# Find proximity groups and build query
# Group related keywords by proximity
groups = find_proximity_groups(keywords, keyword_positions, proximity_n)
query = build_query(groups, proximity_n)

if query:
queries.append(query)
# Add each group as a single term to our set
for group in groups:
if len(group) > 1:
# Sort by length descending to prioritize longer phrases
sorted_group = sorted(group, key=len, reverse=True)
# For truly proximate multi-word groups, group words
queries.add(" OR ".join(sorted_group))
else:
# For non-proximate words or single words, add them separately
queries.update(group)

return queries
# Join all terms with " OR "
return " OR ".join(queries) if queries else ""


def batch_paragraphs_to_custom_queries(
def batch_text_to_tsvector_queries(
paragraphs: list[str],
top_n: int = 10,
proximity_n: int = 10,
min_keywords: int = 1,
n_process: int = 1,
) -> list[list[str]]:
) -> list[str]:
"""
Processes multiple paragraphs using nlp.pipe for better performance.

Args:
paragraphs (list[str]): list of paragraphs to process.
top_n (int): Number of top keywords to extract per sentence.
proximity_n (int): The proximity window for NEAR/n.
min_keywords (int): Minimum number of keywords required to form a query.
n_process (int): Number of processes to use for multiprocessing.
paragraphs (list[str]): List of paragraphs to process
top_n (int): Number of top keywords to include per paragraph

Returns:
list[list[str]]: A list where each element is a list of queries for a paragraph.
list[str]: List of tsquery strings
"""
results = []

for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process):
queries = []
queries = set() # Use set to avoid duplicates
for sent in doc.sents:
sent_doc = sent.as_doc()
keywords = extract_keywords(sent_doc, top_n)
@@ -288,9 +276,18 @@ def batch_paragraphs_to_custom_queries(
if not keyword_positions:
continue
groups = find_proximity_groups(keywords, keyword_positions, proximity_n)
query = build_query(groups, proximity_n)
if query:
queries.append(query)
results.append(queries)
# Add each group as a single term to our set
for group in groups:
if len(group) > 1:
# Sort by length descending to prioritize longer phrases
sorted_group = sorted(group, key=len, reverse=True)
# For truly proximate multi-word groups, group words
queries.add(" OR ".join(sorted_group))
else:
# For non-proximate words or single words, add them separately
queries.update(group)

# Join all terms with " OR "
results.append(" OR ".join(queries) if queries else "")

return results
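And a sketch of the batch variant, which pushes all paragraphs through `nlp.pipe` in a single pass; `n_process` can be raised to spread the work across processes (outputs are illustrative):

```python
from agents_api.common.nlp import batch_text_to_tsvector_queries

paragraphs = [
    "I like basketball especially Michael Jordan",
    "PostgreSQL full-text search works on tsvector columns",
]
# Returns one OR-joined keyword string per input paragraph, in order.
queries = batch_text_to_tsvector_queries(paragraphs, top_n=10, n_process=1)
for q in queries:
    print(q)
```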
2 changes: 2 additions & 0 deletions agents-api/agents_api/queries/docs/search_docs_by_text.py
@@ -60,6 +60,8 @@ async def search_docs_by_text(
# Extract owner types and IDs
owner_types: list[str] = [owner[0] for owner in owners]
owner_ids: list[str] = [str(owner[1]) for owner in owners]
# Pre-process rawtext query
# query = text_to_tsvector_query(query)

return (
search_docs_text_query,
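If the commented-out pre-processing step above were enabled, the raw query text would be rewritten into the OR-joined keyword form before being handed to the SQL text-search query. A sketch of that wiring (the helper name is hypothetical, and the fallback to the raw text is an assumption, not part of this PR):

```python
from agents_api.common.nlp import text_to_tsvector_query

def preprocess_text_query(raw_query: str) -> str:
    # Rewrite raw user text into OR-joined keywords for Postgres text search;
    # fall back to the raw text if the NLP pipeline extracts nothing.
    processed = text_to_tsvector_query(raw_query)
    return processed or raw_query
```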
53 changes: 53 additions & 0 deletions agents-api/tests/fixtures.py
@@ -42,6 +42,7 @@
from .utils import (
get_localstack,
get_pg_dsn,
make_vector_with_similarity,
)
from .utils import (
patch_embed_acompletion as patch_embed_acompletion_ctx,
@@ -164,6 +165,10 @@ async def test_doc(dsn=pg_dsn, developer=test_developer, agent=test_agent):
@fixture(scope="test")
async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test_doc):
pool = await create_db_pool(dsn=dsn)
embedding_with_confidence_0 = make_vector_with_similarity(d=0.0)
embedding_with_confidence_05 = make_vector_with_similarity(d=0.5)
embedding_with_confidence_05_neg = make_vector_with_similarity(d=-0.5)
embedding_with_confidence_1_neg = make_vector_with_similarity(d=-1.0)
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
Expand All @@ -175,6 +180,54 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test
f"[{', '.join([str(x) for x in [1.0] * 1024])}]",
)

# Insert embedding with confidence 0 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 1, $3, $4)
""",
developer.id,
doc.id,
"Test content 1",
f"[{', '.join([str(x) for x in embedding_with_confidence_0])}]",
)

# Insert embedding with confidence 0.5 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 2, $3, $4)
""",
developer.id,
doc.id,
"Test content 2",
f"[{', '.join([str(x) for x in embedding_with_confidence_05])}]",
)

# Insert embedding with confidence -0.5 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 3, $3, $4)
""",
developer.id,
doc.id,
"Test content 3",
f"[{', '.join([str(x) for x in embedding_with_confidence_05_neg])}]",
)

# Insert embedding with confidence -1 with respect to unit vector
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 4, $3, $4)
""",
developer.id,
doc.id,
"Test content 4",
f"[{', '.join([str(x) for x in embedding_with_confidence_1_neg])}]",
)

yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool)

