-
Notifications
You must be signed in to change notification settings - Fork 123
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Milvus-doc-bot
authored and
Milvus-doc-bot
committed
Oct 28, 2024
1 parent
075fca4
commit aae217b
Showing
4 changed files
with
85 additions
and
63 deletions.
There are no files selected for viewing
2 changes: 1 addition & 1 deletion
2
localization/v2.4.x/site/en/integrations/integrate_with_sentencetransformers.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
{"codeList":["pip install pymilvus sentence-transformers datasets tqdm\n","from datasets import load_dataset\nfrom pymilvus import MilvusClient, connections\nfrom pymilvus import FieldSchema, CollectionSchema, DataType, Collection\nfrom sentence_transformers import SentenceTransformer\nfrom tqdm import tqdm\n","embedding_dim = 384\ncollection_name = \"movie_embeddings\"\n","ds = load_dataset(\"vishnupriyavr/wiki-movie-plots-with-summaries\", split=\"train\")\nprint(ds)\n","connections.connect(uri=\"./sentence_transformers_example.db\")\n","fields = [\n FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True),\n FieldSchema(name='title', dtype=DataType.VARCHAR, max_length=256),\n FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)\n]\n\nschema = CollectionSchema(fields=fields, enable_dynamic_field=False)\ncollection = Collection(name=collection_name, schema=schema)\n","params = {\n 'index_type':\"FLAT\",\n 'metric_type': \"IP\"\n }\n\ncollection.create_index(\n 'embedding',\n params\n)\n","model = SentenceTransformer(\"all-MiniLM-L12-v2\")\n","for batch in tqdm(ds.batch(batch_size=512)):\n embeddings = model.encode(batch['PlotSummary'])\n data = [{\"title\": title, \"embedding\": embedding} for title, embedding in zip(batch['Title'], embeddings)]\n res = collection.insert(data=data)\n","collection.flush()\nprint(collection.num_entities)\n","queries = [\n 'A shark terrorizes an LA beach.',\n 'An archaeologist searches for ancient artifacts while fighting Nazis.',\n 'Teenagers in detention learn about themselves.',\n 'A teenager fakes illness to get off school and have adventures with two friends.',\n 'A young couple with a kid look after a hotel during winter and the husband goes insane.',\n 'Four turtles fight bad guys.'\n ]\n\n# Search the database based on input text\ndef embed_search(data):\n embeds = model.encode(data) \n return [x for x in embeds]\n\nsearch_data = embed_search(queries)\n\nres = collection.search(\n data=search_data,\n anns_field=\"embedding\",\n param={},\n limit=3,\n output_fields=['title']\n)\n\nfor idx, hits in enumerate(res):\n print('Title:', queries[idx])\n # print('Search Time:', end-start)\n print('Results:')\n for hit in hits:\n print( hit.entity.get('title'), '(', round(hit.distance, 2), ')')\n print()\n","Title: An archaeologist searches for ancient artifacts while fighting Nazis.\nResults:\n\"Pimpernel\" Smith ( 0.48 )\nPhantom of Chinatown ( 0.42 )\nCounterblast ( 0.41 )\n\nTitle: Teenagers in detention learn about themselves.\nResults:\nThe Breakfast Club ( 0.54 )\nUp the Academy ( 0.46 )\nFame ( 0.43 )\n\nTitle: A teenager fakes illness to get off school and have adventures with two friends.\nResults:\nFerris Bueller's Day Off ( 0.48 )\nFever Lake ( 0.47 )\nA Walk to Remember ( 0.45 )\n\nTitle: A young couple with a kid look after a hotel during winter and the husband goes insane.\nResults:\nAlways a Bride ( 0.54 )\nFast and Loose ( 0.49 )\nThe Shining ( 0.48 )\n\nTitle: Four turtles fight bad guys.\nResults:\nTMNT 2: Out of the Shadows ( 0.49 )\nTeenage Mutant Ninja Turtles II: The Secret of the Ooze ( 0.47 )\nGamera: Super Monster ( 0.43 )\n"],"headingContent":"Movie Search Using Milvus and SentenceTransformers","anchorList":[{"label":"Movie Search Using Milvus and SentenceTransformers","href":"Movie-Search-Using-Milvus-and-SentenceTransformers","type":1,"isActive":false},{"label":"Required Libraries","href":"Required-Libraries","type":2,"isActive":false},{"label":"Downloading and Opening the Dataset","href":"Downloading-and-Opening-the-Dataset","type":2,"isActive":false},{"label":"Connecting to the Database","href":"Connecting-to-the-Database","type":2,"isActive":false},{"label":"Inserting the Data","href":"Inserting-the-Data","type":2,"isActive":false},{"label":"Performing the Search","href":"Performing-the-Search","type":2,"isActive":false}]} | ||
{"codeList":["pip install pymilvus sentence-transformers datasets tqdm\n","from datasets import load_dataset\nfrom pymilvus import MilvusClient\nfrom pymilvus import FieldSchema, CollectionSchema, DataType\nfrom sentence_transformers import SentenceTransformer\nfrom tqdm import tqdm\n","embedding_dim = 384\ncollection_name = \"movie_embeddings\"\n","ds = load_dataset(\"vishnupriyavr/wiki-movie-plots-with-summaries\", split=\"train\")\nprint(ds)\n","client = MilvusClient(uri=\"./sentence_transformers_example.db\")\n","fields = [\n FieldSchema(name=\"id\", dtype=DataType.INT64, is_primary=True, auto_id=True),\n FieldSchema(name=\"title\", dtype=DataType.VARCHAR, max_length=256),\n FieldSchema(name=\"embedding\", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim),\n FieldSchema(name=\"year\", dtype=DataType.INT64),\n FieldSchema(name=\"origin\", dtype=DataType.VARCHAR, max_length=64),\n]\n\nschema = CollectionSchema(fields=fields, enable_dynamic_field=False)\nclient.create_collection(collection_name=collection_name, schema=schema)\n","index_params = client.prepare_index_params()\nindex_params.add_index(field_name=\"embedding\", index_type=\"FLAT\", metric_type=\"IP\")\nclient.create_index(collection_name, index_params)\n","model = SentenceTransformer(\"all-MiniLM-L12-v2\")\n","for batch in tqdm(ds.batch(batch_size=512)):\n embeddings = model.encode(batch[\"PlotSummary\"])\n data = [\n {\"title\": title, \"embedding\": embedding, \"year\": year, \"origin\": origin}\n for title, embedding, year, origin in zip(\n batch[\"Title\"], embeddings, batch[\"Release Year\"], batch[\"Origin/Ethnicity\"]\n )\n ]\n res = client.insert(collection_name=collection_name, data=data)\n","queries = [\n 'A shark terrorizes an LA beach.',\n 'An archaeologist searches for ancient artifacts while fighting Nazis.',\n 'Teenagers in detention learn about themselves.',\n 'A teenager fakes illness to get off school and have adventures with two friends.',\n 'A young couple with a kid look after a hotel during winter and the husband goes insane.',\n 'Four turtles fight bad guys.'\n ]\n\n# Search the database based on input text\ndef embed_query(data):\n vectors = model.encode(data)\n return [x for x in vectors]\n\n\nquery_vectors = embed_query(queries)\n\nres = client.search(\n collection_name=collection_name,\n data=query_vectors,\n filter='origin == \"American\" and year > 1945 and year < 2000',\n anns_field=\"embedding\",\n limit=3,\n output_fields=[\"title\"],\n)\n\nfor idx, hits in enumerate(res):\n print(\"Query:\", queries[idx])\n print(\"Results:\")\n for hit in hits:\n print(hit[\"entity\"].get(\"title\"), \"(\", round(hit[\"distance\"], 2), \")\")\n print()\n","Query: An archaeologist searches for ancient artifacts while fighting Nazis.\nResults:\nLove Slaves of the Amazons ( 0.4 )\nA Time to Love and a Time to Die ( 0.39 )\nThe Fifth Element ( 0.39 )\n\nQuery: Teenagers in detention learn about themselves.\nResults:\nThe Breakfast Club ( 0.54 )\nUp the Academy ( 0.46 )\nFame ( 0.43 )\n\nQuery: A teenager fakes illness to get off school and have adventures with two friends.\nResults:\nFerris Bueller's Day Off ( 0.48 )\nFever Lake ( 0.47 )\nLosin' It ( 0.39 )\n\nQuery: A young couple with a kid look after a hotel during winter and the husband goes insane.\nResults:\nThe Shining ( 0.48 )\nThe Four Seasons ( 0.42 )\nHighball ( 0.41 )\n\nQuery: Four turtles fight bad guys.\nResults:\nTeenage Mutant Ninja Turtles II: The Secret of the Ooze ( 0.47 )\nDevil May Hare ( 0.43 )\nAttack of the Giant Leeches ( 0.42 )\n"],"headingContent":"Movie Search Using Milvus and SentenceTransformers","anchorList":[{"label":"Movie Search Using Milvus and SentenceTransformers","href":"Movie-Search-Using-Milvus-and-SentenceTransformers","type":1,"isActive":false},{"label":"Required Libraries","href":"Required-Libraries","type":2,"isActive":false},{"label":"Downloading and Opening the Dataset","href":"Downloading-and-Opening-the-Dataset","type":2,"isActive":false},{"label":"Connecting to the Database","href":"Connecting-to-the-Database","type":2,"isActive":false},{"label":"Inserting the Data","href":"Inserting-the-Data","type":2,"isActive":false},{"label":"Performing the Search","href":"Performing-the-Search","type":2,"isActive":false}]} |
Oops, something went wrong.