-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy patharxiv-hybrid-query.py
74 lines (51 loc) · 2.94 KB
/
arxiv-hybrid-query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from collections import Counter
import psycopg
from pgvector.psycopg import register_vector
from InstructorEmbedding import INSTRUCTOR
### The purpose of this code is to demonstrate using PostgreSQL full-text search alongside vector search (sparse vs.
### dense vectors). This file assumes you have already imported the arXiv data and built the HNSW index on the
### embedding column using cosine distance.
### https://www.postgresql.org/docs/current/textsearch-tables.html#TEXTSEARCH-TABLES-SEARCH
### https://www.postgresql.org/docs/current/textsearch-controls.html
### https://www.postgresql.org/docs/current/textsearch-features.html#TEXTSEARCH-MANIPULATE-TSQUERY
# Connection settings for the local demo database (credentials are hard-coded for the demo only).
DB_NAME = "arxiv_abstracts"
connect_string = f"host=localhost user=postgres password='letmein' dbname='{DB_NAME}'"

# Terms that must ALL appear in a matching abstract; they are AND-ed together in the tsquery.
# NOTE: the terms are interpolated straight into the SQL text, so they must be trusted constants.
sentence_list = ["simulated", "annealing"]
tsquery_text = f"to_tsquery('english','{' & '.join(sentence_list)}')"

# Full-text query: rank matching abstracts by cover density and keep the top 40.
# The two-argument to_tsvector('english', abstract) form is used on purpose: it is the exact
# expression the GIN index is built on, so the planner can use that index, and the result no
# longer depends on the server's default_text_search_config.
sql = (
    f"select id, ts_rank_cd(to_tsvector('english', abstract), {tsquery_text}) as rank, abstract "
    f"from documents where to_tsvector('english', abstract) @@ {tsquery_text} "
    "order by rank DESC limit 40"
)
# Open a single autocommit connection (reused later for the vector query) and register
# pgvector's type adapters on it.
conn = psycopg.connect(connect_string, autocommit=True)
register_vector(conn)

# Create the full-text index (a GIN index over the English tsvector of each abstract) if missing.
conn.execute(
    "CREATE INDEX IF NOT EXISTS arxiv_abstract_fulltext_idx "
    "ON documents USING GIN (to_tsvector('english', abstract))"
)

# Now do a full-text search
# https://www.postgresql.org/docs/current/functions-textsearch.html
results = conn.execute(sql).fetchall()

# Report the hits: id, rank, and the first 50 characters of each abstract.
divider = "-----------------------------------------------------------------\n"
print(divider)
print(f"Similar search for {sentence_list}\n")
print(divider)
for doc_id, rank, abstract in results:
    print(f"id: {doc_id} || rank: {rank} || abstract: {abstract[:50]}")
############## Now a hybrid search ###############
### Reciprocal Rank Fusion (RRF) for combining full-text and vectors
### https://safjan.com/implementing-rank-fusion-in-python/

# Embed the query text with the INSTRUCTOR model; the instruction string is paired with the
# sentence and the single resulting embedding is taken from the batch.
sentence = "simulated annealing"
instruction = "Represent the Research Paper abstract for retrieval; Input:"
model = INSTRUCTOR('hkunlp/instructor-xl')
embeddings = model.encode([[instruction, sentence]])
query_embedding = embeddings[0]

# Gather the 40 nearest documents by the `<=>` distance between stored and query embeddings.
# The embedding is bound twice: once for the selected distance, once for the ORDER BY.
dense_query = 'SELECT id, (embedding <=> %s) as distance, abstract FROM documents ORDER BY embedding <=> %s LIMIT 40'
dense_params = (query_embedding, query_embedding)
results_dense = conn.execute(dense_query, dense_params).fetchall()
# Reciprocal Rank Fusion: every result list gives each of its documents a score of
# 1 / (RRF_K + rank), where rank is the 0-based position in that list; a document found by
# both searches receives the sum of its two scores.
RRF_K = 30


def _rrf_scores(rows, k=RRF_K):
    """Return {id: reciprocal-rank score} for rows ordered best-first (id is column 0)."""
    return {row[0]: 1 / (k + rank) for rank, row in enumerate(rows)}


# Score each list on its own so that lists of different lengths are handled correctly:
# the full-text search may return fewer than 40 rows (or none), and that must neither
# truncate the dense results nor index past the end of either list.
sparse_scores = _rrf_scores(results)
dense_scores = _rrf_scores(results_dense)

# Counter addition sums the scores of ids present in both dictionaries.
mixed_scores = dict(Counter(sparse_scores) + Counter(dense_scores))
mixed_scores_sorted = dict(sorted(mixed_scores.items(), key=lambda key_val: key_val[1], reverse=True))

# Overlap = ids returned by both searches (computed from the data, not from an assumed 40 + 40).
overlap_count = len(sparse_scores.keys() & dense_scores.keys())
print("We had " + str(overlap_count) + " results overlap")
for doc_id, fused_score in mixed_scores_sorted.items():
    print("ID: " + str(doc_id) + " score: " + str(fused_score))
print("finished")