Changed search backend to pinecone
craigloewen-msft committed Jan 5, 2024
1 parent c5f885f commit f72fa59
Showing 5 changed files with 232 additions and 715 deletions.
3 changes: 1 addition & 2 deletions app.js
@@ -26,8 +26,7 @@ if (process.env.NODE_ENV == 'production') {
config.secret = process.env.secret;
config.sessionSecret = process.env.sessionSecret;
config.ghToken = process.env.ghToken;
-    config.azureSearchAPIKey = process.env.azureSearchAPIKey;
-    config.azureSearchURL = process.env.azureSearchURL;
+    config.pineconeAPIKey = process.env.pineconeAPIKey;
hostPort = process.env.PORT ? process.env.PORT : 8080;
} else {
mongooseConnectionString = config.devMongoDBConnectionString;
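In production the handler now needs a single secret instead of an endpoint/key pair. A minimal sketch of how this env-driven config reaches the Pinecone client (the client construction mirrors the embeddingsHandler diff below; the surrounding variable names here are illustrative only):

const { Pinecone } = require("@pinecone-database/pinecone");

// app.js copies the secret out of the environment in production.
const config = { pineconeAPIKey: process.env.pineconeAPIKey };

// embeddingsHandler pins the environment to "gcp-starter", so the API key
// is the only per-deployment Pinecone setting.
const pinecone = new Pinecone({
    environment: "gcp-starter",
    apiKey: config.pineconeAPIKey,
});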
181 changes: 47 additions & 134 deletions backendsrc/embeddingsHandler.js
@@ -1,11 +1,6 @@
const pythonWorkerHandler = require('./pythonWorkerHandler');
const zmq = require('zeromq');
-const {
-    SearchClient,
-    SearchIndexClient,
-    SearchIndexerClient,
-    AzureKeyCredential,
-} = require("@azure/search-documents");
+const { Pinecone } = require("@pinecone-database/pinecone");

class embeddingsHandler {

@@ -17,77 +12,11 @@ class embeddingsHandler {
// Set up Python Worker
this.sock = new zmq.Request;
this.pythonWorker = new pythonWorkerHandler(this.sock);

-        this.azureSearchURL = inConfigObject.azureSearchURL;
-        this.azureSearchAPIKey = inConfigObject.azureSearchAPIKey;
-
-        this.azureSearchIndexClient = new SearchIndexClient(inConfigObject.azureSearchURL, new AzureKeyCredential(inConfigObject.azureSearchAPIKey));
-    }
-
-    async createIndexIfNeeded() {
-        // TODO Need some kind of error check to not do this every time....
-        // Create index
-        const result = await this.azureSearchIndexClient.createOrUpdateIndex({
-            name: embeddingsHandler.indexName,
-            fields: [
-                {
-                    type: "Edm.String",
-                    name: "issue_id",
-                    key: true,
-                    filterable: true,
-                    sortable: true,
-                },
-                {
-                    type: "Edm.String",
-                    name: "repo_id",
-                    filterable: true,
-                    sortable: true,
-                },
-                {
-                    type: "Collection(Edm.Single)",
-                    name: "title_vector",
-                    searchable: true,
-                    vectorSearchDimensions: embeddingsHandler.embeddingDimensions,
-                    vectorSearchProfile: "vector-search-profile",
-                },
-            ],
-            vectorSearch: {
-                algorithms: [{ name: "vector-search-algorithm", kind: "hnsw" }],
-                profiles: [
-                    {
-                        name: "vector-search-profile",
-                        algorithm: "vector-search-algorithm",
-                    },
-                ],
-            },
-        });
-
-        return true;
-    }
-
-    async addEmbedding(inputIssue) {
-        // Get embedding from Python Worker
-        const embedding = await this.pythonWorker.getEmbedding(inputIssue.title);
-
-        const collectionName = embeddingsHandler.indexName;
-
-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-        // Set up index
-        await this.createIndexIfNeeded(inputIssue.repoRef);
-
-        // Add to Azure Search
-        let uploadResult = await searchClient.uploadDocuments([
-            {
-                issue_id: inputIssue._id.toString(),
-                repo_id: inputIssue.repoRef.toString(),
-                title_vector: embedding,
-            },
-        ]);
-
-        const uploadsSucceeded = uploadResult.results.every((result) => result.succeeded);
-
-        return true;
-    }
+        this.pinecone = new Pinecone({
+            environment: "gcp-starter",
+            apiKey: inConfigObject.pineconeAPIKey,
+        });
+
+        this.index = this.pinecone.Index(embeddingsHandler.indexName);
+    }

async addMultipleEmbeddings(inputIssues) {
@@ -96,94 +25,78 @@ class embeddingsHandler {
if (inputIssues.length != 0) {
const titles = inputIssues.map(issue => issue.title);
const embeddings = await this.pythonWorker.getMultipleEmbeddings(titles);
-            const collectionName = embeddingsHandler.indexName;
-
-            const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-            // Set up index
-            await this.createIndexIfNeeded(inputIssues[0].repoRef);
-
-            // Prepare documents for upload
-            const documents = inputIssues.map((issue, index) => ({
-                issue_id: issue._id.toString(),
-                repo_id: issue.repoRef.toString(),
-                title_vector: embeddings[index],
-            }));
-            // Add to Azure Search
-            let uploadResult = await searchClient.uploadDocuments(documents);
-
-            const uploadsSucceeded = uploadResult.results.every((result) => result.succeeded);
-            return uploadsSucceeded;
+
+            // Get list of issues grouped by repoRef with embeddings added
+            let issuesByRepo = {};
+            for (let i = 0; i < inputIssues.length; i++) {
+                let issue = inputIssues[i];
+                let embedding = embeddings[i];
+                if (!issuesByRepo[issue.repoRef.toString()]) {
+                    issuesByRepo[issue.repoRef.toString()] = [];
+                }
+                issuesByRepo[issue.repoRef.toString()].push({
+                    id: issue._id.toString(),
+                    values: embedding,
+                });
+            }
+
+            // Upsert embeddings into Pinecone
+            for (const [repoRef, issues] of Object.entries(issuesByRepo)) {
+                await this.index.namespace(repoRef).upsert(issues);
+            }
+
+            return true;
}
else {
return true;
}

}

async removeEmbedding(inputIssue) {
-        const collectionName = this.getCollectionName(inputIssue.repoRef);
-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-        let deleteResult = await searchClient.deleteDocuments([
-            {
-                issue_id: inputIssue._id.toString(),
-            },
-        ]);
-
-        // Check if index is empty, if yes delete it
-        const searchResults = await searchClient.count("*");
-        if (searchResults.count == 0) {
-            await this.azureSearchIndexClient.deleteIndex(collectionName);
-        }
+        await this.index.namespace(inputIssue.repoRef.toString()).deleteOne(inputIssue._id.toString());

return true;
}

async removeRepo(inputRepoRef) {
-        const collectionName = this.getCollectionName(inputRepoRef);
-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-        let deleteResult = await searchClient.deleteIndex(collectionName);
+        await this.index.namespace(inputRepoRef.toString()).deleteAll();

return true;
}

async getSimilarIssueIDs(repo, issueTitle, issue) {
-        const collectionName = embeddingsHandler.indexName;
-
        const inputVector = await this.pythonWorker.getEmbedding(issueTitle);

-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-        let searchFilter = `repo_id eq '${repo._id.toString()}'`;

+        let searchFilter = `repo_id eq '${repo._id.toString()}'`;
+        let numberOfReturnedIssues = 5;

if (issue) {
searchFilter += ` and issue_id ne '${issue._id.toString()}'`;
}

-        const searchResults = await searchClient.search("*", {
-            // Filter to not include input issue
-            filter: searchFilter,
-            vectorQueries: [
-                {
-                    kind: "vector",
-                    fields: ["title_vector"],
-                    kNearestNeighborsCount: 5,
-                    // An embedding of the query "What are the most luxurious hotels?"
-                    vector: inputVector,
-                },
-            ],
-        });
+        let searchResults = await this.index.namespace(repo._id.toString()).query({
+            topK: numberOfReturnedIssues + 1,
+            vector: inputVector,
+            includeValues: false
+        });
+
+        // If top result is the same issue, remove it
+        if (issue && searchResults && searchResults.matches[0] && searchResults.matches[0].id == issue._id.toString()) {
+            searchResults.matches.shift();
+        } else {
+            // If the searchResults are longer than 5, get rid of the last one
+            if (searchResults && searchResults.matches.length > numberOfReturnedIssues) {
+                searchResults.matches.pop();
+            }
+        }

let formattedResults = [];
-        for await (const result of searchResults.results) {
+        for await (const result of searchResults.matches) {
formattedResults.push({
score: result.score,
-                id: result.document.issue_id
+                id: result.id
});
}

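The rewritten handler keeps one shared index and isolates each repo in its own Pinecone namespace, so per-repo queries and deletes never touch other repos' vectors. A standalone sketch of that lifecycle under the same v1 client calls used above (the index name, ids, and 3-dimensional vectors are illustrative stand-ins):

const { Pinecone } = require("@pinecone-database/pinecone");

async function demoNamespaceLifecycle() {
    const pinecone = new Pinecone({
        environment: "gcp-starter",
        apiKey: process.env.pineconeAPIKey,
    });
    // One index for all repos; embeddingsHandler.indexName in the real code.
    const index = pinecone.Index("demo-issue-index");
    const repoNs = index.namespace("demo-repo-id"); // one namespace per repoRef

    // addMultipleEmbeddings: ids are Mongo issue ids, values are title embeddings.
    await repoNs.upsert([
        { id: "issue-1", values: [0.1, 0.2, 0.3] },
        { id: "issue-2", values: [0.1, 0.2, 0.4] },
    ]);

    // getSimilarIssueIDs: ask for topK + 1 because the query issue itself
    // may come back as the top match and then gets shifted off the results.
    const results = await repoNs.query({
        topK: 6,
        vector: [0.1, 0.2, 0.3],
        includeValues: false,
    });
    console.log(results.matches.map(m => ({ id: m.id, score: m.score })));

    // removeEmbedding / removeRepo equivalents.
    await repoNs.deleteOne("issue-1");
    await repoNs.deleteAll();
}

demoNamespaceLifecycle().catch(console.error);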
10 changes: 5 additions & 5 deletions backendsrc/oneOffScriptHelpers.js
@@ -2,20 +2,20 @@
module.exports = {
async AddEmbeddingsToIssuesInRepo(inIssueDetails, inEmbeddingsHandler, inRepo) {

-        if (inRepo.shortURL == "microsoft/wsl") {
+        if (inRepo.shortURL == "microsoft/terminal") {
try {
-                let fourWeeksAgo = new Date((new Date().getTime() - (4 * 7 * 24 * 60 * 60 * 1000)));
+                let startPeriod = new Date((new Date().getTime() - (20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000))); // 20 years ago
let totalIssues = await inIssueDetails.countDocuments({
repoRef: inRepo._id,
-                    created_at: { $gte: fourWeeksAgo }
+                    created_at: { $gte: startPeriod }
});
-                let pageSize = 200;
+                let pageSize = 100;
let pages = Math.ceil(totalIssues / pageSize);

for (let i = 0; i < pages; i++) {
let issueList = await inIssueDetails.find({
repoRef: inRepo._id,
-                    created_at: { $gte: fourWeeksAgo }
+                    created_at: { $gte: startPeriod }
}).sort({ number: 1 }).skip(i * pageSize).limit(pageSize);
await inEmbeddingsHandler.addMultipleEmbeddings(issueList);
let percentComplete = ((i + 1) / pages) * 100;
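For the backfill, this helper pages through Mongo with skip/limit so each batch of titles is embedded and upserted together. A compact sketch of that paging pattern (the model and handler are stand-ins shaped like the ones in this diff; the 100-issue page size matches the new code):

// Assumes a Mongoose model for issues and an embeddings handler like the one above.
async function backfillEmbeddings(inIssueDetails, inEmbeddingsHandler, inRepo) {
    const startPeriod = new Date(Date.now() - 20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000); // ~20 years
    const query = { repoRef: inRepo._id, created_at: { $gte: startPeriod } };

    const totalIssues = await inIssueDetails.countDocuments(query);
    const pageSize = 100;
    const pages = Math.ceil(totalIssues / pageSize);

    for (let i = 0; i < pages; i++) {
        // A stable sort keeps the skip/limit pages disjoint between queries.
        const issueList = await inIssueDetails.find(query)
            .sort({ number: 1 }).skip(i * pageSize).limit(pageSize);
        await inEmbeddingsHandler.addMultipleEmbeddings(issueList);
        console.log(`Embedding backfill ${(((i + 1) / pages) * 100).toFixed(0)}% complete`);
    }
}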
[Diffs for the remaining 2 changed files are not shown.]
