More instructions and clean up

thesteve0 · Dec 12, 2023 · b521be3 · b521be3
1 parent 37cad92
commit b521be3
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 20 deletions.
diff --git a/.gitignore b/.gitignore
@@ -325,3 +325,4 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
+/arxiv_abstracts.zip
diff --git a/README.md b/README.md
@@ -1,9 +1,19 @@
 # arXiv Query
 
-Got the data set from here:
+The original data (not in Qdrant format) is available here:
+https://alex.macrocosm.so/download
+
+Got the Qdrant database snapshot from here:
 
 https://deploy-preview-199--condescending-goldwasser-91acf0.netlify.app/documentation/datasets/
 
+To upload the snapshot to a Qdrant server, run the following (replace `COLLECTION_NAME` with the name of your collection):
+```
+curl -X POST 'http://127.0.0.1:6333/collections/COLLECTION_NAME/snapshots/upload' \
+-H 'Content-Type:multipart/form-data' \
+-F 'snapshot=@./arxiv_abstracts-3083016565637815127-2023-06-02-07-26-29.snapshot'
+```
+
 Data was originally encoded using this model
 https://huggingface.co/hkunlp/instructor-xl
 
@@ -14,12 +24,9 @@ sentence = "habitat corridors"
 instruction  = "Represent the Research Paper abstract for retrieval; Input:"
 ```
 
-
-
-Original Data not in qdrant format is here:
-https://alex.macrocosm.so/download
-
+Work in progress: trying to get Postgres running in a container
 
 `$ docker run --name some-postgres -e POSTGRES_PASSWORD=mysecretpassword -d postgres`
 
-`podman run -d -p 5432:5432 -e POSTGRES_PASSWORD=test --name maybe ghcr.io/thesteve0/pg16-full` 
+`podman run -d -p 5432:5432 -e POSTGRES_PASSWORD=test --name maybe ghcr.io/thesteve0/pg16-full` 
+
diff --git a/play.py b/play.py
@@ -1,5 +1,4 @@
 from InstructorEmbedding import INSTRUCTOR
-import numpy as np
 from qdrant_client import QdrantClient
 import time
 
@@ -26,41 +25,45 @@
 print(f" CPU time: {t2[1] - t1[1]:.2f} seconds")
 
 
-sentence = "ocean red tide"
+sentence = "The increase in red tide dinoflagellates off the coast of Florida"
 instruction  = "Represent the Research Paper abstract for retrieval; Input:"
 
 t1 = time.perf_counter(), time.process_time()
+
 embeddings = model.encode([[instruction,sentence]])
 
 t2 = time.perf_counter(), time.process_time()
 print("\nCalculating embedding time")
 print(f" Real time: {t2[0] - t1[0]:.2f} seconds")
 print(f" CPU time: {t2[1] - t1[1]:.2f} seconds")
 
-
-#print("embeddings: " + str(embeddings[0][:5]))
-
-print("Similar search for " + "'sentence'")
+print("-----------------------------------------------------------------\n")
+print("Similar search for " + sentence + "\n")
+print("-----------------------------------------------------------------\n")
 
 client = QdrantClient("localhost", port=6333)
 search_result = client.search(
     collection_name="arvix_abs",
     query_vector=embeddings[0],
-    limit=10
+    limit=3
 )
-# print(search_result)
+
+# Now just display the results
 for scored_result in search_result :
-    print("Abstract: " + scored_result.payload["abstract"][:200] +"\n")
+    print("Abstract: " + scored_result.payload["abstract"][:400] +"\n")
+
+
+print("-----------------------------------------------------------------\n")
+print("Dissimilar search for " + sentence + "\n")
+print("-----------------------------------------------------------------\n")
 
-print("-----------------------------------------------------------------")
-print("Dissimilar search for " + "'sentence'")
 dissimilar_search_result = client.search(
     collection_name="arvix_abs",
     query_vector=-1*embeddings[0],
-    limit=10
+    limit=3
 )
 
 for scored_result in dissimilar_search_result :
-    print("Abstract: " + scored_result.payload["abstract"][:200] +"\n")
+    print("Abstract: " + scored_result.payload["abstract"][:400] +"\n")
 
 print("finished")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -325,3 +325,4 @@ cython_debug/
		# option (not recommended) you can uncomment the following to ignore the entire idea folder.
		#.idea/

		/arxiv_abstracts.zip