#
# This script builds a vector database from .txt documents,
# backed by an Azure Cosmos DB for MongoDB instance within a sharded cluster.
# Authors: Binhuan Sun ([email protected]), Pashkova Liubov ([email protected])
#
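# Usage (inferred from the argparse arguments below):
#   python make_vectordb.py <plain_text_dir> <collection>
#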
import argparse
import os
import time

import dotenv
import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.azure_cosmos_db import AzureCosmosDBVectorSearch
from langchain_voyageai import VoyageAIEmbeddings
from pymongo import MongoClient

# Load environment variables (e.g. MONGODB_CONN_STRING) from a .env file
dotenv.load_dotenv()


def process_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    # Extract the DOI/source URL and the title
    doi = lines[0].strip()
    title = lines[1].replace('title:', '').replace('Title:', '').strip()
    # The page content is everything after the first two lines
    page_content = ''.join(lines[2:])
    return Document(page_content=page_content, metadata={'source': doi, 'title': title})

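# Expected .txt file layout (inferred from process_txt_file above): line 1 holds
# the DOI/source URL, line 2 the title (optionally prefixed with "title:" or
# "Title:"), and everything from line 3 onwards is the page content, e.g.:
#   https://doi.org/10.xxxx/xxxxx
#   Title: Some document title
#   First paragraph of the document text...
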

def create_documents_from_directory(directory_path):
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            doc = process_txt_file(os.path.join(directory_path, filename))
            documents.append(doc)
    return documents


def split_list(input_list, chunk_size):
    """Yield successive chunk_size-sized chunks from input_list."""
    for i in range(0, len(input_list), chunk_size):
        yield input_list[i:i + chunk_size]

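# Example (illustrative): list(split_list([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]
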

if __name__ == "__main__":
    script_start_time = time.time()

    # Get the plain text directory and the collection name from the command line
    parser = argparse.ArgumentParser(description='Get the plain text directory and the vector database collection name')
    parser.add_argument('plain_text_dir', type=str, help='The path to your plain text files')
    parser.add_argument('collection', type=str, help='The name of the collection of your vector database')
    args = parser.parse_args()
    plain_text_dir = str(args.plain_text_dir)
    collection_name = str(args.collection)
    directory_path = plain_text_dir

    # Create documents from the directory and split them into overlapping chunks
    docs = create_documents_from_directory(directory_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=100, add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)
    print([directory_path, len(docs), len(all_splits)])

    # QC: remove duplicate chunks (keep the first occurrence)
    df_qc = pd.DataFrame({'all_splits': all_splits, 'content': [doc.page_content for doc in all_splits]})
    df_qc_no_dup = df_qc.drop_duplicates(subset='content', keep='first')
    all_splits_no_dup = df_qc_no_dup['all_splits'].tolist()
    print('The final length of all_splits: ' + str(len(all_splits_no_dup)))

    # Start building the vector DB...
    # Important: only embedding models with dimensionality up to 2000 can be used here,
    # because Azure Cosmos DB for MongoDB supports at most 2000 dimensions
    # (voyage-large-2-instruct produces 1024-dimensional vectors, matching the
    # "dimensions" setting of the HNSW index created below): ----
    embeddings = VoyageAIEmbeddings(model="voyage-large-2-instruct", show_progress_bar=True)
    split_docs_chunked = split_list(all_splits_no_dup, 41000)

    # Obtain the MongoDB connection string: ----
    connection_string = os.getenv("MONGODB_CONN_STRING")
    # The MongoDB database instance name: ----
    db_name = "pankb_llm"
    # Set the name of the db index to be created: ----
    index_name = "pankb_vector_store_hnsw_index"
    # Connect to the MongoDB instance: ----
    client = MongoClient(connection_string)
    # Obtain the db collection object: ----
    collection = client[db_name][collection_name]
    # Drop the MongoDB collection if it exists: ----
    collection.drop()

print("Creating the vector index...")
# Note: The index is created before we insert the data
# (despite it increases the total data insertion time, as the index has to be updated after each insertion).
# We create it beforehand to avoid timeout errors,
# which we get if we try to create the index after populating the Vector DB and which
# we can not get rid of (presumably due to the cluster hardware configuration can not be changed).
#
# Example of the same problem enciuntered while creating the index on the M40 tier is described here:
# https://stochasticcoder.com/2024/03/08/azure-cosmos-db-for-mongodb-hnsw-vector-search/
#
    client[db_name].command({
        "createIndexes": collection_name,
        "indexes": [
            {
                "name": collection_name + "_hnsw_index",
                "key": {
                    "vectorContent": "cosmosSearch"
                },
                "cosmosSearchOptions": {
                    "kind": "vector-hnsw",
                    "m": 16,
                    "efConstruction": 100,
                    "similarity": "L2",
                    "dimensions": 1024
                }
            }
        ]
    })
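    # Optional sanity check (a sketch): pymongo's Collection.index_information()
    # lists a collection's indexes, so the newly created HNSW index could be
    # verified with, e.g.:
    #   print(collection.index_information())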
print("The vector index has been successfully created.")
# Now populate the database updating the index in the meantime: ----
print("Populating the Vector DB (can take a while)...")
for split_docs_chunk in split_docs_chunked:
vectordb = AzureCosmosDBVectorSearch.from_documents(
split_docs_chunk,
embeddings,
collection=collection,
index_name=index_name)
print("Total execution time: %.2f minutes" % ((time.time() - script_start_time) / 60))