Skip to content

Commit

Permalink
chore: test fix + add embedding vector generation based on the confid…
Browse files Browse the repository at this point in the history
…ence
  • Loading branch information
Vedantsahai18 committed Jan 13, 2025
1 parent 1a7eca2 commit 7c99628
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 54 deletions.
29 changes: 23 additions & 6 deletions agents-api/tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
)
from .utils import (
patch_embed_acompletion as patch_embed_acompletion_ctx,
make_vector_with_similarity,
)


Expand Down Expand Up @@ -164,6 +165,10 @@ async def test_doc(dsn=pg_dsn, developer=test_developer, agent=test_agent):
@fixture(scope="test")
async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test_doc):
pool = await create_db_pool(dsn=dsn)
embedding_with_confidence_0 = make_vector_with_similarity(d=0.0)
embedding_with_confidence_05 = make_vector_with_similarity(d=0.5)
embedding_with_confidence_05_neg = make_vector_with_similarity(d=-0.5)
embedding_with_confidence_1_neg = make_vector_with_similarity(d=-1.0)
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
Expand All @@ -175,7 +180,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test
f"[{', '.join([str(x) for x in [1.0] * 1024])}]",
)

# Insert embedding with random values between 0.3 and 0.7
# Insert embedding whose cosine similarity (confidence) to the all-ones vector is 0
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
Expand All @@ -184,10 +189,10 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test
developer.id,
doc.id,
"Test content 1",
f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]",
f"[{', '.join([str(x) for x in embedding_with_confidence_0])}]",
)

# Insert embedding with random values between -0.8 and 0.8
# Insert embedding whose cosine similarity (confidence) to the all-ones vector is 0.5
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
Expand All @@ -196,10 +201,10 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test
developer.id,
doc.id,
"Test content 2",
f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]",
f"[{', '.join([str(x) for x in embedding_with_confidence_05])}]",
)

# Insert embedding with alternating -1 and 1
# Insert embedding whose cosine similarity (confidence) to the all-ones vector is -0.5
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
Expand All @@ -208,7 +213,19 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test
developer.id,
doc.id,
"Test content 3",
f"[{', '.join([str(-1 if i % 2 else 1) for i in range(1024)])}]",
f"[{', '.join([str(x) for x in embedding_with_confidence_05_neg])}]",
)

# Insert embedding whose cosine similarity (confidence) to the all-ones vector is -1
await pool.execute(
"""
INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding)
VALUES ($1, $2, 0, 4, $3, $4)
""",
developer.id,
doc.id,
"Test content 4",
f"[{', '.join([str(x) for x in embedding_with_confidence_1_neg])}]",
)

yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool)
Expand Down
51 changes: 3 additions & 48 deletions agents-api/tests/test_docs_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,55 +19,9 @@
test_user,
)

EMBEDDING_SIZE: int = 1024

import math


def make_vector_with_similarity(n: int, d: float):
"""
Returns a list `v` of length `n` such that the cosine similarity
between `v` and the all-ones vector of length `n` is approximately d.

Raises ValueError when `d` lies outside [-1, 1].
"""
if not -1.0 <= d <= 1.0:
msg = "d must lie in [-1, 1]."
raise ValueError(msg)

# Handle special cases exactly:
if abs(d - 1.0) < 1e-12: # d ~ +1
return [1.0] * n
if abs(d + 1.0) < 1e-12: # d ~ -1
return [-1.0] * n
if abs(d) < 1e-12: # d ~ 0
# [1, -1, 0, ..., 0] sums to zero, so its dot product with all-ones is 0.
v = [0.0] * n
if n >= 2:
v[0] = 1.0
v[1] = -1.0
return v

sign_d = 1.0 if d >= 0 else -1.0
# NOTE(review): the import below sits in the middle of this function in the
# rendered diff; presumably it is the added line that replaces this local
# definition -- confirm against the actual file.
from .utils import make_vector_with_similarity

# Base part: sign(d)*[1,1,...,1]
base = [sign_d] * n

# Orthogonal unit vector u with sum(u)=0; for simplicity:
# u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0]
u = [0.0] * n
if n >= 2:
u[0] = 1.0 / math.sqrt(2)
u[1] = -1.0 / math.sqrt(2)
# (if n=1, there's no truly orthogonal vector to [1], so skip)
# In that case u stays all-zero and the result below is just `base`.

# Solve for alpha:
# alpha^2 = n*(1 - d^2)/d^2
# Derivation: v = base + alpha*u has dot(v, ones) = sign(d)*n and
# |v|^2 = n + alpha^2, so cos = sign(d)*sqrt(n / (n + alpha^2)); set it to d.
alpha = math.sqrt(n * (1 - d * d)) / abs(d)

# Construct v
v = [0.0] * n
for i in range(n):
v[i] = base[i] + alpha * u[i]

return v
EMBEDDING_SIZE: int = 1024


@test("query: create user doc")
Expand Down Expand Up @@ -438,6 +392,7 @@ async def _():
[
"basketball OR lebron james OR michael jordan",
"LeBron James OR Michael Jordan OR basketball",
"Michael Jordan OR basketball OR LeBron James"
],
),
# Quoted phrases
Expand Down
45 changes: 45 additions & 0 deletions agents-api/tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import asyncio
import logging
import os
import math
import subprocess
from contextlib import asynccontextmanager, contextmanager
from unittest.mock import patch
Expand All @@ -18,6 +19,50 @@
# Replicated here to prevent circular import
EMBEDDING_SIZE: int = 1024


def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5) -> list[float]:
    """Build a vector whose cosine similarity to the all-ones vector is about ``d``.

    The result is ``sign(d) * [1, ..., 1] + alpha * u`` where
    ``u = [1/sqrt(2), -1/sqrt(2), 0, ..., 0]`` is a unit vector orthogonal to
    the all-ones vector, and ``alpha`` is chosen so that the cosine similarity
    equals ``d``.

    Args:
        n: Length of the returned vector.
        d: Target cosine similarity; must lie in ``[-1, 1]``.

    Returns:
        A list of ``n`` floats. For ``n < 2`` there is no non-zero vector
        orthogonal to all-ones, so only ``d = +/-1`` can actually be realised;
        other targets fall back to ``sign(d) * [1] * n`` (or all zeros when
        ``d`` is ~0).

    Raises:
        ValueError: If ``d`` is outside ``[-1, 1]``.
    """
    if not -1.0 <= d <= 1.0:
        msg = "d must lie in [-1, 1]."
        raise ValueError(msg)

    # Handle the exact special cases first; the general formula divides by |d|.
    if abs(d - 1.0) < 1e-12:  # d ~ +1
        return [1.0] * n
    if abs(d + 1.0) < 1e-12:  # d ~ -1
        return [-1.0] * n
    if abs(d) < 1e-12:  # d ~ 0
        # [1, -1, 0, ..., 0] sums to zero, i.e. it is orthogonal to all-ones.
        v = [0.0] * n
        if n >= 2:
            v[0], v[1] = 1.0, -1.0
        return v

    sign_d = 1.0 if d >= 0 else -1.0

    # With v = sign(d)*ones + alpha*u:  dot(v, ones) = sign(d)*n and
    # |v|^2 = n + alpha^2, so cos = sign(d)*sqrt(n / (n + alpha^2)).
    # Setting cos = d gives alpha^2 = n*(1 - d^2)/d^2.
    alpha = math.sqrt(n * (1 - d * d)) / abs(d)

    # Start from the base vector and perturb only the two coordinates where u
    # is non-zero; every other coordinate of alpha*u is exactly 0.
    v = [sign_d] * n
    if n >= 2:
        shift = alpha * (1.0 / math.sqrt(2))
        v[0] += shift
        v[1] -= shift
    return v

@asynccontextmanager
async def patch_testing_temporal():
Expand Down

0 comments on commit 7c99628

Please sign in to comment.