From b9788dc2f167a51f8cf8a7644c2c85cef7d8a1da Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Fri, 8 Nov 2024 14:08:41 +0800 Subject: [PATCH] test: add rbac backup restore check in ci Signed-off-by: zhuwenxing --- example/prepare_data.py | 210 ++++++++++++++++++++-------------------- example/verify_data.py | 208 ++++++++++++++++++++------------------- 2 files changed, 213 insertions(+), 205 deletions(-) diff --git a/example/prepare_data.py b/example/prepare_data.py index 531e0700..067e0303 100644 --- a/example/prepare_data.py +++ b/example/prepare_data.py @@ -15,108 +15,108 @@ FieldSchema, CollectionSchema, DataType, Collection, ) - -fmt = "\n=== {:30} ===\n" -search_latency_fmt = "search latency = {:.4f}s" -num_entities, dim = 3000, 8 - -################################################################################# -# 1. connect to Milvus -# Add a new connection alias `default` for Milvus server in `localhost:19530` -# Actually the "default" alias is a buildin in PyMilvus. -# If the address of Milvus is the same as `localhost:19530`, you can omit all -# parameters and call the method as: `connections.connect()`. -# -# Note: the `using` parameter of the following methods is default to "default". -print(fmt.format("start connecting to Milvus")) - -host = os.environ.get('MILVUS_HOST') -if host == None: - host = "localhost" -print(fmt.format(f"Milvus host: {host}")) -connections.connect("default", host=host, port="19530") - -has = utility.has_collection("hello_milvus") -print(f"Does collection hello_milvus exist in Milvus: {has}") - -################################################################################# -# 2. create collection -# We're going to create a collection with 3 fields. -# +-+------------+------------+------------------+------------------------------+ -# | | field name | field type | other attributes | field description | -# +-+------------+------------+------------------+------------------------------+ -# |1| "pk" | Int64 | is_primary=True | "primary field" | -# | | | | auto_id=False | | -# +-+------------+------------+------------------+------------------------------+ -# |2| "random" | Double | | "a double field" | -# +-+------------+------------+------------------+------------------------------+ -# |3|"embeddings"| FloatVector| dim=8 | "float vector with dim 8" | -# +-+------------+------------+------------------+------------------------------+ -fields = [ - FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), - FieldSchema(name="random", dtype=DataType.DOUBLE), - FieldSchema(name="var", dtype=DataType.VARCHAR, max_length=65535), - FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim) -] - -schema = CollectionSchema(fields, "hello_milvus") - -print(fmt.format("Create collection `hello_milvus`")) -hello_milvus = Collection("hello_milvus", schema, consistency_level="Strong") - -################################################################################ -# 3. insert data -# We are going to insert 3000 rows of data into `hello_milvus` -# Data to be inserted must be organized in fields. -# -# The insert() method returns: -# - either automatically generated primary keys by Milvus if auto_id=True in the schema; -# - or the existing primary key field from the entities if auto_id=False in the schema. - -print(fmt.format("Start inserting entities")) -rng = np.random.default_rng(seed=19530) -entities = [ - # provide the pk field because `auto_id` is set to False - [i for i in range(num_entities)], - rng.random(num_entities).tolist(), # field random, only supports list - [str(i) for i in range(num_entities)], - rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list -] - -insert_result = hello_milvus.insert(entities) -hello_milvus.flush() -print(f"Number of entities in hello_milvus: {hello_milvus.num_entities}") # check the num_entites - -# create another collection -fields2 = [ - FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True), - FieldSchema(name="random", dtype=DataType.DOUBLE), - FieldSchema(name="var", dtype=DataType.VARCHAR, max_length=65535), - FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim) -] - -schema2 = CollectionSchema(fields2, "hello_milvus2") - -print(fmt.format("Create collection `hello_milvus2`")) -hello_milvus2 = Collection("hello_milvus2", schema2, consistency_level="Strong") - -entities2 = [ - rng.random(num_entities).tolist(), # field random, only supports list - [str(i) for i in range(num_entities)], - rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list -] - -insert_result2 = hello_milvus2.insert(entities2) -hello_milvus2.flush() -insert_result2 = hello_milvus2.insert(entities2) -hello_milvus2.flush() - -# index_params = {"index_type": "IVF_FLAT", "params": {"nlist": 128}, "metric_type": "L2"} -# hello_milvus.create_index("embeddings", index_params) -# hello_milvus2.create_index(field_name="var",index_name="scalar_index") - -# index_params2 = {"index_type": "Trie"} -# hello_milvus2.create_index("var", index_params2) - -print(f"Number of entities in hello_milvus2: {hello_milvus2.num_entities}") # check the num_entites - +import argparse + + + +def main(uri="http://127.0.0.1:19530", token="root:Milvus"): + fmt = "\n=== {:30} ===\n" + num_entities, dim = 3000, 8 + + ################################################################################# + # 1. connect to Milvus + # Add a new connection alias `default` for Milvus server in `localhost:19530` + # Actually the "default" alias is a buildin in PyMilvus. + # If the address of Milvus is the same as `localhost:19530`, you can omit all + # parameters and call the method as: `connections.connect()`. + # + # Note: the `using` parameter of the following methods is default to "default". + print(fmt.format("start connecting to Milvus")) + + print(fmt.format(f"Milvus uri: {uri}")) + connections.connect("default", uri=uri, token=token) + + has = utility.has_collection("hello_milvus") + print(f"Does collection hello_milvus exist in Milvus: {has}") + + ################################################################################# + # 2. create collection + # We're going to create a collection with 3 fields. + # +-+------------+------------+------------------+------------------------------+ + # | | field name | field type | other attributes | field description | + # +-+------------+------------+------------------+------------------------------+ + # |1| "pk" | Int64 | is_primary=True | "primary field" | + # | | | | auto_id=False | | + # +-+------------+------------+------------------+------------------------------+ + # |2| "random" | Double | | "a double field" | + # +-+------------+------------+------------------+------------------------------+ + # |3|"embeddings"| FloatVector| dim=8 | "float vector with dim 8" | + # +-+------------+------------+------------------+------------------------------+ + fields = [ + FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), + FieldSchema(name="random", dtype=DataType.DOUBLE), + FieldSchema(name="var", dtype=DataType.VARCHAR, max_length=65535), + FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim) + ] + + schema = CollectionSchema(fields, "hello_milvus") + + print(fmt.format("Create collection `hello_milvus`")) + hello_milvus = Collection("hello_milvus", schema, consistency_level="Strong") + + ################################################################################ + # 3. insert data + # We are going to insert 3000 rows of data into `hello_milvus` + # Data to be inserted must be organized in fields. + # + # The insert() method returns: + # - either automatically generated primary keys by Milvus if auto_id=True in the schema; + # - or the existing primary key field from the entities if auto_id=False in the schema. + + print(fmt.format("Start inserting entities")) + rng = np.random.default_rng(seed=19530) + entities = [ + # provide the pk field because `auto_id` is set to False + [i for i in range(num_entities)], + rng.random(num_entities).tolist(), # field random, only supports list + [str(i) for i in range(num_entities)], + rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list + ] + + insert_result = hello_milvus.insert(entities) + hello_milvus.flush() + print(f"Number of entities in hello_milvus: {hello_milvus.num_entities}") # check the num_entites + + # create another collection + fields2 = [ + FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True), + FieldSchema(name="random", dtype=DataType.DOUBLE), + FieldSchema(name="var", dtype=DataType.VARCHAR, max_length=65535), + FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim) + ] + + schema2 = CollectionSchema(fields2, "hello_milvus2") + + print(fmt.format("Create collection `hello_milvus2`")) + hello_milvus2 = Collection("hello_milvus2", schema2, consistency_level="Strong") + + entities2 = [ + rng.random(num_entities).tolist(), # field random, only supports list + [str(i) for i in range(num_entities)], + rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list + ] + + insert_result2 = hello_milvus2.insert(entities2) + hello_milvus2.flush() + insert_result2 = hello_milvus2.insert(entities2) + hello_milvus2.flush() + + print(f"Number of entities in hello_milvus2: {hello_milvus2.num_entities}") # check the num_entities + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="prepare data") + args.add_argument("--uri", type=str, default="http://127.0.0.1:19530", help="Milvus server uri") + args.add_argument("--token", type=str, default="root:Milvus", help="Milvus server token") + args = args.parse_args() + main(args.uri, args.token) diff --git a/example/verify_data.py b/example/verify_data.py index df2058b0..01db7218 100644 --- a/example/verify_data.py +++ b/example/verify_data.py @@ -7,107 +7,115 @@ FieldSchema, CollectionSchema, DataType, Collection, ) +import argparse -fmt = "\n=== {:30} ===\n" -search_latency_fmt = "search latency = {:.4f}s" -num_entities, dim = 3000, 8 -rng = np.random.default_rng(seed=19530) -entities = [ - # provide the pk field because `auto_id` is set to False - [i for i in range(num_entities)], - rng.random(num_entities).tolist(), # field random, only supports list - rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list -] - -################################################################################ -# 1. get recovered collection hello_milvus_recover -print(fmt.format("start connecting to Milvus")) -host = os.environ.get('MILVUS_HOST') -if host == None: - host = "localhost" -print(fmt.format(f"Milvus host: {host}")) -connections.connect("default", host=host, port="19530") - -recover_collections = ["hello_milvus_recover", "hello_milvus2_recover"] - -for recover_collection_name in recover_collections: - has = utility.has_collection(recover_collection_name) - print(f"Does collection {recover_collection_name} exist in Milvus: {has}") - recover_collection = Collection(recover_collection_name) - print(recover_collection.schema) - recover_collection.flush() - - print(f"Number of entities in Milvus: {recover_collection_name} : {recover_collection.num_entities}") # check the num_entites - ################################################################################ - # 4. create index - # We are going to create an IVF_FLAT index for hello_milvus_recover collection. - # create_index() can only be applied to `FloatVector` and `BinaryVector` fields. - print(fmt.format("Start Creating index IVF_FLAT")) - index = { - "index_type": "IVF_FLAT", - "metric_type": "L2", - "params": {"nlist": 128}, - } - - recover_collection.create_index("embeddings", index) +def main(uri, token): + fmt = "\n=== {:30} ===\n" + search_latency_fmt = "search latency = {:.4f}s" + num_entities, dim = 3000, 8 + rng = np.random.default_rng(seed=19530) + entities = [ + # provide the pk field because `auto_id` is set to False + [i for i in range(num_entities)], + rng.random(num_entities).tolist(), # field random, only supports list + rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list + ] ################################################################################ - # 5. search, query, and hybrid search - # After data were inserted into Milvus and indexed, you can perform: - # - search based on vector similarity - # - query based on scalar filtering(boolean, int, etc.) - # - hybrid search based on vector similarity and scalar filtering. - # - - # Before conducting a search or a query, you need to load the data in `hello_milvus` into memory. - print(fmt.format("Start loading")) - recover_collection.load() - - # ----------------------------------------------------------------------------- - # search based on vector similarity - print(fmt.format("Start searching based on vector similarity")) - vectors_to_search = entities[-1][-2:] - search_params = { - "metric_type": "L2", - "params": {"nprobe": 10}, - } - - start_time = time.time() - result = recover_collection.search(vectors_to_search, "embeddings", search_params, limit=3, output_fields=["random"]) - end_time = time.time() - - for hits in result: - for hit in hits: - print(f"hit: {hit}, random field: {hit.entity.get('random')}") - print(search_latency_fmt.format(end_time - start_time)) - - # ----------------------------------------------------------------------------- - # query based on scalar filtering(boolean, int, etc.) - print(fmt.format("Start querying with `random > 0.5`")) - - start_time = time.time() - result = recover_collection.query(expr="random > 0.5", output_fields=["random", "embeddings"]) - end_time = time.time() - - print(f"query result:\n-{result[0]}") - print(search_latency_fmt.format(end_time - start_time)) - - # ----------------------------------------------------------------------------- - # hybrid search - print(fmt.format("Start hybrid searching with `random > 0.5`")) - - start_time = time.time() - result = recover_collection.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > 0.5", output_fields=["random"]) - end_time = time.time() - - for hits in result: - for hit in hits: - print(f"hit: {hit}, random field: {hit.entity.get('random')}") - print(search_latency_fmt.format(end_time - start_time)) - - ############################################################################### - # 7. drop collection - # Finally, drop the hello_milvus, hello_milvus_recover collection - print(fmt.format(f"Drop collection {recover_collection_name}")) - utility.drop_collection(recover_collection_name) \ No newline at end of file + # 1. get recovered collection hello_milvus_recover + print(fmt.format("start connecting to Milvus")) + print(fmt.format(f"Milvus uri: {uri}")) + connections.connect("default", uri=uri, token=token) + + recover_collections = ["hello_milvus_recover", "hello_milvus2_recover"] + + for recover_collection_name in recover_collections: + has = utility.has_collection(recover_collection_name) + print(f"Does collection {recover_collection_name} exist in Milvus: {has}") + recover_collection = Collection(recover_collection_name) + print(recover_collection.schema) + recover_collection.flush() + + print(f"Number of entities in Milvus: {recover_collection_name} : {recover_collection.num_entities}") # check the num_entites + + ################################################################################ + # 4. create index + # We are going to create an IVF_FLAT index for hello_milvus_recover collection. + # create_index() can only be applied to `FloatVector` and `BinaryVector` fields. + print(fmt.format("Start Creating index IVF_FLAT")) + index = { + "index_type": "IVF_FLAT", + "metric_type": "L2", + "params": {"nlist": 128}, + } + + recover_collection.create_index("embeddings", index) + + ################################################################################ + # 5. search, query, and hybrid search + # After data were inserted into Milvus and indexed, you can perform: + # - search based on vector similarity + # - query based on scalar filtering(boolean, int, etc.) + # - hybrid search based on vector similarity and scalar filtering. + # + + # Before conducting a search or a query, you need to load the data in `hello_milvus` into memory. + print(fmt.format("Start loading")) + recover_collection.load() + + # ----------------------------------------------------------------------------- + # search based on vector similarity + print(fmt.format("Start searching based on vector similarity")) + vectors_to_search = entities[-1][-2:] + search_params = { + "metric_type": "L2", + "params": {"nprobe": 10}, + } + + start_time = time.time() + result = recover_collection.search(vectors_to_search, "embeddings", search_params, limit=3, output_fields=["random"]) + end_time = time.time() + + for hits in result: + for hit in hits: + print(f"hit: {hit}, random field: {hit.entity.get('random')}") + print(search_latency_fmt.format(end_time - start_time)) + + # ----------------------------------------------------------------------------- + # query based on scalar filtering(boolean, int, etc.) + print(fmt.format("Start querying with `random > 0.5`")) + + start_time = time.time() + result = recover_collection.query(expr="random > 0.5", output_fields=["random", "embeddings"]) + end_time = time.time() + + print(f"query result:\n-{result[0]}") + print(search_latency_fmt.format(end_time - start_time)) + + # ----------------------------------------------------------------------------- + # hybrid search + print(fmt.format("Start hybrid searching with `random > 0.5`")) + + start_time = time.time() + result = recover_collection.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > 0.5", output_fields=["random"]) + end_time = time.time() + + for hits in result: + for hit in hits: + print(f"hit: {hit}, random field: {hit.entity.get('random')}") + print(search_latency_fmt.format(end_time - start_time)) + + ############################################################################### + # 7. drop collection + # Finally, drop the hello_milvus, hello_milvus_recover collection + print(fmt.format(f"Drop collection {recover_collection_name}")) + utility.drop_collection(recover_collection_name) + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="verify data") + args.add_argument("--uri", type=str, default="http://127.0.0.1:19530", help="Milvus server uri") + args.add_argument("--token",type=str, default="root:Milvus", help="Milvus server token") + args = args.parse_args() + main(args.uri, args.token) \ No newline at end of file