Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(pinecone): pinecone batch upsert #927

Merged
merged 11 commits into from
Dec 11, 2024
6 changes: 6 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ require (
gorm.io/plugin/dbresolver v1.5.1
)

require (
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/pinecone-io/go-pinecone v1.1.1
)

require (
github.com/machinebox/graphql v0.2.2
github.com/matryer/is v1.4.1 // indirect
Expand Down
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbt
github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1 h1:d0Ct1dZwgwMO0Llf81Eu+Lyj6kwqXdqHP/WsSkEria0=
github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1/go.mod h1:f3HCSN1fBWjcpGtXyM119MJgeQl838v6so/PQOqvE1w=
Expand Down Expand Up @@ -516,6 +517,8 @@ github.com/apache/arrow/go/arrow v0.0.0-20210818145353-234c94e4ce64/go.mod h1:2q
github.com/apache/arrow/go/arrow v0.0.0-20211013220434-5962184e7a30/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
github.com/apache/arrow/go/v15 v15.0.2 h1:60IliRbiyTWCWjERBCkO1W4Qun9svcYoZrSLcyOsMLE=
github.com/apache/arrow/go/v15 v15.0.2/go.mod h1:DGXsR3ajT524njufqf95822i+KTh+yea1jass9YXgjA=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/araddon/dateparse v0.0.0-20180729174819-cfd92a431d0e/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI=
github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1 h1:TEBmxO80TM04L8IuMWk77SGL1HomBmKTdzdJLLWznxI=
github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI=
Expand Down Expand Up @@ -579,6 +582,7 @@ github.com/bkaradzic/go-lz4 v1.0.0/go.mod h1:0YdlkowM3VswSROI7qDxhRvJ3sLhlFrRRwj
github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84=
github.com/blang/semver v3.1.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/bshuster-repo/logrus-logstash-hook v0.4.1/go.mod h1:zsTqEiSzDgAa/8GZR7E1qaXrhYNDKBYy5/dWPTIflbk=
Expand Down Expand Up @@ -1391,6 +1395,7 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM=
github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
Expand Down Expand Up @@ -1604,6 +1609,8 @@ github.com/npillmayer/nestext v0.1.3/go.mod h1:h2lrijH8jpicr25dFY+oAJLyzlya6jhnu
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA=
github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
Expand Down Expand Up @@ -1692,6 +1699,8 @@ github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi
github.com/pierrec/lz4/v4 v4.1.8/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ=
github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pinecone-io/go-pinecone v1.1.1 h1:pKoIiYcBIbrR7gaq0JXPiVnNEtevFYeq/AYL7T0NbbE=
github.com/pinecone-io/go-pinecone v1.1.1/go.mod h1:KfJhn4yThX293+fbtrZLnxe2PJYo8557Py062W4FYKk=
github.com/pkg/browser v0.0.0-20210706143420-7d21f8c997e2/go.mod h1:HKlIX3XHQyzLZPlr7++PzdhaXEj94dEiJgZDTsxEqUI=
github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8/go.mod h1:HKlIX3XHQyzLZPlr7++PzdhaXEj94dEiJgZDTsxEqUI=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
Expand Down Expand Up @@ -1843,6 +1852,7 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE=
github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
github.com/stefanberger/go-pkcs11uri v0.0.0-20201008174630-78d3cae3a980/go.mod h1:AO3tvPzVZ/ayst6UlUKUv6rcPQInYe3IknH3jYhAKu8=
Expand Down
44 changes: 43 additions & 1 deletion pkg/component/data/pinecone/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ The Pinecone component is a data component that allows users to build and search
It can carry out the following tasks:
- [Query](#query)
- [Upsert](#upsert)
- [Batch Upsert](#batch-upsert)
- [Rerank](#rerank)


Expand Down Expand Up @@ -102,7 +103,7 @@ Retrieve the ids of the most similar items in a namespace, along with their simi

### Upsert

Writes vectors into a namespace. If a new value is upserted for an existing vector id, it will overwrite the previous value.
Writes vectors into a namespace. If a new value is upserted for an existing vector id, it will overwrite the previous value. This task will soon be replaced by `TASK_BATCH_UPSERT`, which extends its functionality.

<div class="markdown-col-no-wrap" data-col-1 data-col-2>

Expand All @@ -120,6 +121,47 @@ Writes vectors into a namespace. If a new value is upserted for an existing vect



<div class="markdown-col-no-wrap" data-col-1 data-col-2>

| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Upserted Count | `upserted-count` | integer | Number of records modified or added. |
</div>


### Batch Upsert

Writes vectors into a namespace. If a new value is upserted for an existing vector ID, it will overwrite the previous value.

<div class="markdown-col-no-wrap" data-col-1 data-col-2>

| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_BATCH_UPSERT` |
| [Vectors](#batch-upsert-vectors) (required) | `vectors` | array[object] | Array of vectors to upsert. |
| Namespace | `namespace` | string | The namespace to upsert the vectors into. |
</div>


<details>
<summary> Input Objects in Batch Upsert</summary>

<h4 id="batch-upsert-vectors">Vectors</h4>

Array of vectors to upsert.

<div class="markdown-col-no-wrap" data-col-1 data-col-2>

| Field | Field ID | Type | Note |
| :--- | :--- | :--- | :--- |
| ID | `id` | string | The unique ID of the vector. |
| Metadata | `metadata` | object | The vector metadata. This is a set of key-value pairs that can be used to store additional information about the vector. The values can have the following types: string, number, boolean, or array of strings. |
| Values | `values` | array | An array of dimensions for the vector to be saved. |
</div>
</details>



<div class="markdown-col-no-wrap" data-col-1 data-col-2>

| Output | ID | Type | Description |
Expand Down
95 changes: 63 additions & 32 deletions pkg/component/data/pinecone/v0/component_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@ import (
"net/http/httptest"
"testing"

"github.com/pinecone-io/go-pinecone/pinecone"
"google.golang.org/protobuf/types/known/structpb"

qt "github.com/frankban/quicktest"

"github.com/instill-ai/pipeline-backend/pkg/component/base"
"github.com/instill-ai/pipeline-backend/pkg/component/internal/mock"
"github.com/instill-ai/pipeline-backend/pkg/component/internal/util/httpclient"
"github.com/instill-ai/pipeline-backend/pkg/data"
"github.com/instill-ai/pipeline-backend/pkg/data/format"
"github.com/instill-ai/x/errmsg"
)

Expand All @@ -24,7 +27,7 @@ const (
namespace = "pantone"
threshold = 0.9

upsertOK = `{"upsertedCount": 1}`
upsertOK = `{"upsertedCount": 2}`

queryOK = `
{
Expand All @@ -46,23 +49,29 @@ const (
}`

errResp = `
{
"code": 3,
"message": "Cannot provide both ID and vector at the same time.",
"details": []
}`

{
"code": 3,
"message": "Cannot provide both ID and vector at the same time.",
"details": []
}`
)

// newValue is a test helper that wraps in as a format.Value through
// data.NewValue. The second return value of data.NewValue is deliberately
// discarded so the helper can be used inline in fixture literals.
func newValue(in any) format.Value {
	wrapped, _ := data.NewValue(in)
	return wrapped
}

var (
vectorA = vector{
ID: "A",
Values: []float64{2.23},
Metadata: map[string]any{"color": "pumpkin"},
Values: []float32{2.23},
Metadata: map[string]format.Value{"color": newValue("pumpkin")},
}
vectorB = vector{
ID: "B",
Values: []float64{3.32},
Metadata: map[string]any{"color": "cerulean"},
Values: []float32{3.32},
Metadata: map[string]format.Value{"color": newValue("cerulean")},
}
queryByVector = queryInput{
Namespace: "color-schemes",
Expand Down Expand Up @@ -94,6 +103,11 @@ func TestComponent_Execute(t *testing.T) {
c := qt.New(t)
ctx := context.Background()

pvA, err := vectorA.toPinecone()
c.Assert(err, qt.IsNil)
pvB, err := vectorB.toPinecone()
c.Assert(err, qt.IsNil)

testcases := []struct {
name string

Expand All @@ -108,15 +122,15 @@ func TestComponent_Execute(t *testing.T) {
{
name: "ok - upsert",

task: taskUpsert,
execIn: upsertInput{
vector: vectorA,
task: taskBatchUpsert,
execIn: taskBatchUpsertInput{
Vectors: []vector{vectorA, vectorB},
Namespace: namespace,
},
wantExec: upsertOutput{RecordsUpserted: 1},
wantExec: taskUpsertOutput{UpsertedCount: 2},

wantClientPath: upsertPath,
wantClientReq: upsertReq{Vectors: []vector{vectorA}, Namespace: namespace},
wantClientReq: upsertReq{Vectors: []*pinecone.Vector{pvA, pvB}, Namespace: namespace},
clientResp: upsertOK,
},
{
Expand All @@ -128,11 +142,11 @@ func TestComponent_Execute(t *testing.T) {
Namespace: "color-schemes",
Matches: []match{
{
vector: vectorA,
Vector: pvA,
Score: 0.99,
},
{
vector: vectorB,
Vector: pvB,
Score: 0.87,
},
},
Expand All @@ -151,7 +165,7 @@ func TestComponent_Execute(t *testing.T) {
Namespace: "color-schemes",
Matches: []match{
{
vector: vectorA,
Vector: pvA,
Score: 0.99,
},
},
Expand All @@ -170,11 +184,11 @@ func TestComponent_Execute(t *testing.T) {
Namespace: "color-schemes",
Matches: []match{
{
vector: vectorA,
Vector: pvA,
Score: 0.99,
},
{
vector: vectorB,
Vector: pvB,
Score: 0.87,
},
},
Expand Down Expand Up @@ -234,22 +248,39 @@ func TestComponent_Execute(t *testing.T) {
})
c.Assert(err, qt.IsNil)

pbIn, err := base.ConvertToStructpb(tc.execIn)
c.Assert(err, qt.IsNil)
wantJSON, err := json.Marshal(tc.wantExec)

ir, ow, eh, job := mock.GenerateMockJob(c)
ir.ReadMock.Return(pbIn, nil)
ow.WriteMock.Optional().Set(func(ctx context.Context, output *structpb.Struct) (err error) {
wantJSON, err := json.Marshal(tc.wantExec)
c.Assert(err, qt.IsNil)

switch tc.task {
case taskBatchUpsert:
ir.ReadDataMock.Set(func(ctx context.Context, in any) error {
switch in := in.(type) {
case *taskBatchUpsertInput:
*in = tc.execIn.(taskBatchUpsertInput)
}
return nil
})

ow.WriteDataMock.Optional().Set(func(ctx context.Context, output any) error {
c.Check(wantJSON, qt.JSONEquals, output)
return nil
})
default:
pbIn, err := base.ConvertToStructpb(tc.execIn)
c.Assert(err, qt.IsNil)
c.Check(wantJSON, qt.JSONEquals, output.AsMap())
return nil
})
eh.ErrorMock.Optional()

ir.ReadMock.Return(pbIn, nil)
ow.WriteMock.Optional().Set(func(ctx context.Context, output *structpb.Struct) (err error) {
c.Check(wantJSON, qt.JSONEquals, output.AsMap())
return nil
})
}

eh.ErrorMock.Optional()
err = exec.Execute(ctx, []*base.Job{job})
c.Assert(err, qt.IsNil)

})
}

Expand All @@ -270,7 +301,7 @@ func TestComponent_Execute(t *testing.T) {
exec, err := cmp.CreateExecution(base.ComponentExecution{
Component: cmp,
Setup: setup,
Task: taskUpsert,
Task: taskQuery,
})
c.Assert(err, qt.IsNil)

Expand All @@ -296,7 +327,7 @@ func TestComponent_Execute(t *testing.T) {
exec, err := cmp.CreateExecution(base.ComponentExecution{
Component: cmp,
Setup: setup,
Task: taskUpsert,
Task: taskQuery,
})
c.Assert(err, qt.IsNil)

Expand Down
1 change: 1 addition & 0 deletions pkg/component/data/pinecone/v0/config/definition.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"availableTasks": [
"TASK_QUERY",
"TASK_UPSERT",
"TASK_BATCH_UPSERT",
"TASK_RERANK"
],
"custom": false,
Expand Down
Loading
Loading