Changed search backend to pinecone
craigloewen-msft committed Jan 5, 2024
1 parent c5f885f commit f72fa59
Showing 5 changed files with 232 additions and 715 deletions.
3 changes: 1 addition & 2 deletions app.js
@@ -26,8 +26,7 @@ if (process.env.NODE_ENV == 'production') {
config.secret = process.env.secret;
config.sessionSecret = process.env.sessionSecret;
config.ghToken = process.env.ghToken;
-    config.azureSearchAPIKey = process.env.azureSearchAPIKey;
-    config.azureSearchURL = process.env.azureSearchURL;
+    config.pineconeAPIKey = process.env.pineconeAPIKey;
hostPort = process.env.PORT ? process.env.PORT : 8080;
} else {
mongooseConnectionString = config.devMongoDBConnectionString;
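In production the handler now needs a single secret instead of an endpoint/key pair. A minimal sketch of how this env-driven config reaches the Pinecone client (the client construction mirrors the embeddingsHandler diff below; the surrounding variable names here are illustrative only):

const { Pinecone } = require("@pinecone-database/pinecone");

// app.js copies the secret out of the environment in production.
const config = { pineconeAPIKey: process.env.pineconeAPIKey };

// embeddingsHandler pins the environment to "gcp-starter", so the API key
// is the only per-deployment Pinecone setting.
const pinecone = new Pinecone({
    environment: "gcp-starter",
    apiKey: config.pineconeAPIKey,
});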
181 changes: 47 additions & 134 deletions backendsrc/embeddingsHandler.js
@@ -1,11 +1,6 @@
const pythonWorkerHandler = require('./pythonWorkerHandler');
const zmq = require('zeromq');
-const {
-    SearchClient,
-    SearchIndexClient,
-    SearchIndexerClient,
-    AzureKeyCredential,
-} = require("@azure/search-documents");
+const { Pinecone } = require("@pinecone-database/pinecone");

class embeddingsHandler {

@@ -17,77 +12,11 @@ class embeddingsHandler {
// Set up Python Worker
this.sock = new zmq.Request;
this.pythonWorker = new pythonWorkerHandler(this.sock);

-        this.azureSearchURL = inConfigObject.azureSearchURL;
-        this.azureSearchAPIKey = inConfigObject.azureSearchAPIKey;
-
-        this.azureSearchIndexClient = new SearchIndexClient(inConfigObject.azureSearchURL, new AzureKeyCredential(inConfigObject.azureSearchAPIKey));
-    }
-
-    async createIndexIfNeeded() {
-        // TODO Need some kind of error check to not do this every time....
-        // Create index
-        const result = await this.azureSearchIndexClient.createOrUpdateIndex({
-            name: embeddingsHandler.indexName,
-            fields: [
-                {
-                    type: "Edm.String",
-                    name: "issue_id",
-                    key: true,
-                    filterable: true,
-                    sortable: true,
-                },
-                {
-                    type: "Edm.String",
-                    name: "repo_id",
-                    filterable: true,
-                    sortable: true,
-                },
-                {
-                    type: "Collection(Edm.Single)",
-                    name: "title_vector",
-                    searchable: true,
-                    vectorSearchDimensions: embeddingsHandler.embeddingDimensions,
-                    vectorSearchProfile: "vector-search-profile",
-                },
-            ],
-            vectorSearch: {
-                algorithms: [{ name: "vector-search-algorithm", kind: "hnsw" }],
-                profiles: [
-                    {
-                        name: "vector-search-profile",
-                        algorithm: "vector-search-algorithm",
-                    },
-                ],
-            },
-        });
-
-        return true;
-    }
-
-    async addEmbedding(inputIssue) {
-        // Get embedding from Python Worker
-        const embedding = await this.pythonWorker.getEmbedding(inputIssue.title);
-
-        const collectionName = embeddingsHandler.indexName;
-
-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-        // Set up index
-        await this.createIndexIfNeeded(inputIssue.repoRef);
-
-        // Add to Azure Search
-        let uploadResult = await searchClient.uploadDocuments([
-            {
-                issue_id: inputIssue._id.toString(),
-                repo_id: inputIssue.repoRef.toString(),
-                title_vector: embedding,
-            },
-        ]);
-
-        const uploadsSucceeded = uploadResult.results.every((result) => result.succeeded);
-
-        return true;
-    }
+        this.pinecone = new Pinecone({
+            environment: "gcp-starter",
+            apiKey: inConfigObject.pineconeAPIKey,
+        });
+
+        this.index = this.pinecone.Index(embeddingsHandler.indexName);
+    }

async addMultipleEmbeddings(inputIssues) {
@@ -96,94 +25,78 @@ class embeddingsHandler {
if (inputIssues.length != 0) {
const titles = inputIssues.map(issue => issue.title);
const embeddings = await this.pythonWorker.getMultipleEmbeddings(titles);
-            const collectionName = embeddingsHandler.indexName;
-
-            const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-            // Set up index
-            await this.createIndexIfNeeded(inputIssues[0].repoRef);
-
-            // Prepare documents for upload
-            const documents = inputIssues.map((issue, index) => ({
-                issue_id: issue._id.toString(),
-                repo_id: issue.repoRef.toString(),
-                title_vector: embeddings[index],
-            }));
-            // Add to Azure Search
-            let uploadResult = await searchClient.uploadDocuments(documents);
-
-            const uploadsSucceeded = uploadResult.results.every((result) => result.succeeded);
-            return uploadsSucceeded;
+
+            // Get list of issues grouped by repoRef with embeddings added
+            let issuesByRepo = {};
+            for (let i = 0; i < inputIssues.length; i++) {
+                let issue = inputIssues[i];
+                let embedding = embeddings[i];
+                if (!issuesByRepo[issue.repoRef.toString()]) {
+                    issuesByRepo[issue.repoRef.toString()] = [];
+                }
+                issuesByRepo[issue.repoRef.toString()].push({
+                    id: issue._id.toString(),
+                    values: embedding,
+                });
+            }
+
+            // Upsert embeddings into Pinecone
+            for (const [repoRef, issues] of Object.entries(issuesByRepo)) {
+                await this.index.namespace(repoRef).upsert(issues);
+            }
+
+            return true;
}
else {
return true;
}

}

async removeEmbedding(inputIssue) {
-        const collectionName = this.getCollectionName(inputIssue.repoRef);
-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-        let deleteResult = await searchClient.deleteDocuments([
-            {
-                issue_id: inputIssue._id.toString(),
-            },
-        ]);
-
-        // Check if index is empty, if yes delete it
-        const searchResults = await searchClient.count("*");
-        if (searchResults.count == 0) {
-            await this.azureSearchIndexClient.deleteIndex(collectionName);
-        }
+        await this.index.namespace(inputIssue.repoRef.toString()).deleteOne(inputIssue._id.toString());

return true;
}

async removeRepo(inputRepoRef) {
-        const collectionName = this.getCollectionName(inputRepoRef);
-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-
-        let deleteResult = await searchClient.deleteIndex(collectionName);
+        await this.index.namespace(inputRepoRef.toString()).deleteAll();

return true;
}

async getSimilarIssueIDs(repo, issueTitle, issue) {
-        const collectionName = embeddingsHandler.indexName;
-
        const inputVector = await this.pythonWorker.getEmbedding(issueTitle);

-        const searchClient = new SearchClient(this.azureSearchURL, collectionName, new AzureKeyCredential(this.azureSearchAPIKey));
-        let searchFilter = `repo_id eq '${repo._id.toString()}'`;

+        let searchFilter = `repo_id eq '${repo._id.toString()}'`;
+        let numberOfReturnedIssues = 5;

if (issue) {
searchFilter += ` and issue_id ne '${issue._id.toString()}'`;
}

-        const searchResults = await searchClient.search("*", {
-            // Filter to not include input issue
-            filter: searchFilter,
-            vectorQueries: [
-                {
-                    kind: "vector",
-                    fields: ["title_vector"],
-                    kNearestNeighborsCount: 5,
-                    // An embedding of the query "What are the most luxurious hotels?"
-                    vector: inputVector,
-                },
-            ],
-        });
+        let searchResults = await this.index.namespace(repo._id.toString()).query({
+            topK: numberOfReturnedIssues + 1,
+            vector: inputVector,
+            includeValues: false
+        });
+
+        // If top result is the same issue, remove it
+        if (issue && searchResults && searchResults.matches[0] && searchResults.matches[0].id == issue._id.toString()) {
+            searchResults.matches.shift();
+        } else {
+            // If the searchResults are longer than 5, get rid of the last one
+            if (searchResults && searchResults.matches.length > numberOfReturnedIssues) {
+                searchResults.matches.pop();
+            }
+        }

let formattedResults = [];
-        for await (const result of searchResults.results) {
+        for await (const result of searchResults.matches) {
formattedResults.push({
score: result.score,
-                id: result.document.issue_id
+                id: result.id
});
}

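The rewritten handler keeps one shared index and isolates each repo in its own Pinecone namespace, so per-repo queries and deletes never touch other repos' vectors. A standalone sketch of that lifecycle under the same v1 client calls used above (the index name, ids, and 3-dimensional vectors are illustrative stand-ins):

const { Pinecone } = require("@pinecone-database/pinecone");

async function demoNamespaceLifecycle() {
    const pinecone = new Pinecone({
        environment: "gcp-starter",
        apiKey: process.env.pineconeAPIKey,
    });
    // One index for all repos; embeddingsHandler.indexName in the real code.
    const index = pinecone.Index("demo-issue-index");
    const repoNs = index.namespace("demo-repo-id"); // one namespace per repoRef

    // addMultipleEmbeddings: ids are Mongo issue ids, values are title embeddings.
    await repoNs.upsert([
        { id: "issue-1", values: [0.1, 0.2, 0.3] },
        { id: "issue-2", values: [0.1, 0.2, 0.4] },
    ]);

    // getSimilarIssueIDs: ask for topK + 1 because the query issue itself
    // may come back as the top match and then gets shifted off the results.
    const results = await repoNs.query({
        topK: 6,
        vector: [0.1, 0.2, 0.3],
        includeValues: false,
    });
    console.log(results.matches.map(m => ({ id: m.id, score: m.score })));

    // removeEmbedding / removeRepo equivalents.
    await repoNs.deleteOne("issue-1");
    await repoNs.deleteAll();
}

demoNamespaceLifecycle().catch(console.error);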
10 changes: 5 additions & 5 deletions backendsrc/oneOffScriptHelpers.js
@@ -2,20 +2,20 @@
module.exports = {
async AddEmbeddingsToIssuesInRepo(inIssueDetails, inEmbeddingsHandler, inRepo) {

-        if (inRepo.shortURL == "microsoft/wsl") {
+        if (inRepo.shortURL == "microsoft/terminal") {
try {
-                let fourWeeksAgo = new Date((new Date().getTime() - (4 * 7 * 24 * 60 * 60 * 1000)));
+                let startPeriod = new Date((new Date().getTime() - (20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000))); // 20 years ago
let totalIssues = await inIssueDetails.countDocuments({
repoRef: inRepo._id,
-                    created_at: { $gte: fourWeeksAgo }
+                    created_at: { $gte: startPeriod }
});
-                let pageSize = 200;
+                let pageSize = 100;
let pages = Math.ceil(totalIssues / pageSize);

for (let i = 0; i < pages; i++) {
let issueList = await inIssueDetails.find({
repoRef: inRepo._id,
-                    created_at: { $gte: fourWeeksAgo }
+                    created_at: { $gte: startPeriod }
}).sort({ number: 1 }).skip(i * pageSize).limit(pageSize);
await inEmbeddingsHandler.addMultipleEmbeddings(issueList);
let percentComplete = ((i + 1) / pages) * 100;
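For the backfill, this helper pages through Mongo with skip/limit so each batch of titles is embedded and upserted together. A compact sketch of that paging pattern (the model and handler are stand-ins shaped like the ones in this diff; the 100-issue page size matches the new code):

// Assumes a Mongoose model for issues and an embeddings handler like the one above.
async function backfillEmbeddings(inIssueDetails, inEmbeddingsHandler, inRepo) {
    const startPeriod = new Date(Date.now() - 20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000); // ~20 years
    const query = { repoRef: inRepo._id, created_at: { $gte: startPeriod } };

    const totalIssues = await inIssueDetails.countDocuments(query);
    const pageSize = 100;
    const pages = Math.ceil(totalIssues / pageSize);

    for (let i = 0; i < pages; i++) {
        // A stable sort keeps the skip/limit pages disjoint between queries.
        const issueList = await inIssueDetails.find(query)
            .sort({ number: 1 }).skip(i * pageSize).limit(pageSize);
        await inEmbeddingsHandler.addMultipleEmbeddings(issueList);
        console.log(`Embedding backfill ${(((i + 1) / pages) * 100).toFixed(0)}% complete`);
    }
}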
[Diffs for the remaining 2 changed files are not shown.]
