diff --git a/Dockerfile.opendistroforelasticsearch b/Dockerfile.opendistroforelasticsearch new file mode 100644 index 0000000..878a9e4 --- /dev/null +++ b/Dockerfile.opendistroforelasticsearch @@ -0,0 +1,10 @@ +FROM amazon/opendistro-for-elasticsearch:1.6.0 + +WORKDIR /usr/share/elasticsearch/ + +COPY config/opendistroforelasticsearch /usr/share/elasticsearch/config +COPY bin/opendistroforelasticsearch/create-index.sh /usr/share/elasticsearch/create-index.sh + +ENV http.port=19200 +ENV opendistro_security.disabled=true +ENV knn.algo_param.index_thread_qty=1 diff --git a/bin/build.sh b/bin/build.sh index 5e57b52..ccd38e1 100755 --- a/bin/build.sh +++ b/bin/build.sh @@ -1,3 +1,4 @@ #!/bin/sh docker build . -f Dockerfile.elastic --tag es_benchmark:1.1 --rm +docker build . -f Dockerfile.opendistroforelasticsearch --tag oes_benchmark:1.1 --rm docker build . -f Dockerfile.vespa --tag vespa_benchmark:1.1 --rm diff --git a/bin/check-recall.py b/bin/check-recall.py index 6f38fe2..1982b91 100644 --- a/bin/check-recall.py +++ b/bin/check-recall.py @@ -50,6 +50,29 @@ def get_elastic_result(query_vector): hits.append(int(id)) return hits +def get_opendistroforelastic_result(query_vector): + oes_script_query = { + 'knn': { + 'vector': { + 'vector': query_vector, + 'k': '10' + } + } + } + oes_body={ + 'size': 10, + 'timeout': '15s', + 'query': oes_script_query + } + response = requests.post('http://localhost:19200/doc/_search', json=oes_body) + response.raise_for_status() + hits=[] + for h in response.json()['hits']['hits']: + id = h['_id'] + score = h['_score'] + hits.append(int(id)) + return hits + def compute_recall(real_neighbors,computed_neighbors, n=10): real_neighbors = real_neighbors[0:n] recalled = 0 @@ -60,18 +83,23 @@ def compute_recall(real_neighbors,computed_neighbors, n=10): average_recall_elastic = [] average_recall_vespa = [] +average_recall_opendistroforelastic = [] for i,vector in enumerate(data['test'][0:1000]): real_neighbors = data['neighbors'][i] distances = data['distances'][i] vector=vector.tolist() + computed_neighbors_oes = get_opendistroforelastic_result(vector) computed_neighbors_es = get_elastic_result(vector) computed_neighbors_vespa = get_vespa_result(vector) + recall_opendistroforelastic = compute_recall(real_neighbors, computed_neighbors_oes) recall_vespa = compute_recall(real_neighbors, computed_neighbors_vespa) recall_elastic = compute_recall(real_neighbors, computed_neighbors_es) + average_recall_opendistroforelastic.append(recall_opendistroforelastic) average_recall_elastic.append(recall_elastic) average_recall_vespa.append(recall_vespa) print('Average recall Vespa = %f' % np.average(average_recall_vespa)) print('Average recall Elastic = %f' % np.average(average_recall_elastic)) +print('Average recall Opendistro for Elastic = %f' % np.average(average_recall_opendistroforelastic)) diff --git a/bin/do-benchmark.sh b/bin/do-benchmark.sh index 68bb5e8..1c3f0d3 100755 --- a/bin/do-benchmark.sh +++ b/bin/do-benchmark.sh @@ -3,6 +3,10 @@ echo "Elastic NNS" docker run -v $(pwd)/data/:/tmp/queries --net=host --rm --entrypoint /opt/vespa/bin/vespa-fbench docker.io/vespaengine/vespa \ -P -H "Content-Type:application/json" -q /tmp/queries/elastic/queries.txt -s 180 -n 1 -c 0 -i 20 -o /tmp/queries/result.es.txt localhost 9200 +echo " Open Distribution for Elastic NSS" +docker run -v $(pwd)/data/:/tmp/queries --net=host --rm --entrypoint /opt/vespa/bin/vespa-fbench docker.io/vespaengine/vespa \ +-P -H "Content-Type:application/json" -q /tmp/queries/opendistroforelasticsearch/queries.txt -s 18 -n 1 -c 0 -i 20 -o /tmp/queries/result.oes.txt localhost 19200 + echo "Vespa NNS" docker run -v $(pwd)/data/:/tmp/queries --net=host --rm --entrypoint /opt/vespa/bin/vespa-fbench docker.io/vespaengine/vespa \ -P -H "Content-Type:application/json" -q /tmp/queries/vespa/queries_ann.txt -s 180 -n 1 -c 0 -i 20 -o /tmp/queries/result_ann.vespa.txt localhost 8080 diff --git a/bin/make-feed.py b/bin/make-feed.py index d725294..bd3a11f 100644 --- a/bin/make-feed.py +++ b/bin/make-feed.py @@ -25,6 +25,8 @@ def feed_to_es_and_vespa(data): response.raise_for_status() response = requests.post('http://localhost:9200/doc/_doc/%i' %docid, json=es_body) response.raise_for_status() + response = requests.post('http://localhost:19200/doc/_doc/%i' %docid, json=es_body) + response.raise_for_status() nthreads=18 with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as executor: diff --git a/bin/make-queries.py b/bin/make-queries.py index 8c5b541..8c99b9b 100644 --- a/bin/make-queries.py +++ b/bin/make-queries.py @@ -6,6 +6,7 @@ file= sys.argv[1] test= h5py.File(file, 'r')['test'] +oes_queries = open('data/opendistroforelasticsearch/queries.txt', 'w') es_queries = open('data/elastic/queries.txt', 'w') vespa_queries_ann = open('data/vespa/queries_ann.txt', 'w') @@ -20,6 +21,8 @@ 'timeout': '15s', 'ranking.softtimeout.enable': 'false' } + vespa_queries_ann.write('/search/\n') + vespa_queries_ann.write(json.dumps(vespa_body_ann) + '\n') es_script_query = { 'script_score': { @@ -38,5 +41,18 @@ es_queries.write('/doc/_search\n') es_queries.write(json.dumps(es_body) + '\n') - vespa_queries_ann.write('/search/\n') - vespa_queries_ann.write(json.dumps(vespa_body_ann) + '\n') + oes_script_query = { + 'knn': { + 'vector': { + 'vector': query_vector, + 'k': '10' + } + } + } + oes_body={ + 'size': 10, + 'timeout': '15s', + 'query': oes_script_query + } + oes_queries.write('/doc/_search\n') + oes_queries.write(json.dumps(oes_body) + '\n') diff --git a/bin/opendistroforelasticsearch/create-index.sh b/bin/opendistroforelasticsearch/create-index.sh new file mode 100755 index 0000000..d99fa76 --- /dev/null +++ b/bin/opendistroforelasticsearch/create-index.sh @@ -0,0 +1,2 @@ +#!/bin/sh +curl -s -X PUT "localhost:19200/doc?pretty" -H "Content-Type:application/json" -d @config/index.json diff --git a/bin/opendistroforelasticsearch/feed.py b/bin/opendistroforelasticsearch/feed.py new file mode 100644 index 0000000..c3d31ff --- /dev/null +++ b/bin/opendistroforelasticsearch/feed.py @@ -0,0 +1,30 @@ +import requests +import gzip +import os +import concurrent.futures + +def postES(data): + docid,line = data + line = line.strip() + response = requests.post('http://localhost:19200/doc/_doc/%i' %docid, data=line,headers={"Content-Type":"application/json"}) + return response.status_code + +feed_files = [f for f in os.listdir(".") if f.endswith("json.gz")] +feed_files.sort() +docid=0 +nthreads=8 + +ok=0 +notok=0 +with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as executor: + for file in feed_files: + with gzip.open(file, 'rb') as f: + futures = [executor.submit(postES,data) for data in enumerate(f)] + for result in concurrent.futures.as_completed(futures): + if result.result() == 200: + ok+=1 + else: + notok+=1 + +print("Feed documents %i - ok %i - not ok %i" %(ok+notok,ok,notok)) + diff --git a/bin/run.sh b/bin/run.sh index 48602e4..7cecea2 100755 --- a/bin/run.sh +++ b/bin/run.sh @@ -1,3 +1,4 @@ #!/bin/sh docker run --privileged --name es -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e ES_JAVA_OPTS="-Xms8g -Xmx8g" -d es_benchmark:1.1 +docker run --privileged --name oes -p 19200:19200 -p 19300:19300 -e "discovery.type=single-node" -e ES_JAVA_OPTS="-Xms16g -Xmx16g" -d oes_benchmark:1.1 docker run --privileged --name vespa -p 8080:8080 -d vespa_benchmark:1.1 diff --git a/config/opendistroforelasticsearch/index.json b/config/opendistroforelasticsearch/index.json new file mode 100644 index 0000000..96f8533 --- /dev/null +++ b/config/opendistroforelasticsearch/index.json @@ -0,0 +1,22 @@ +{ + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "index.knn": true + }, + "mappings": { + "dynamic": "false", + "_source": { + "enabled": "false" + }, + "properties": { + "id": { + "type": "integer" + }, + "vector": { + "type": "knn_vector", + "dimension":960 + } + } + } +} diff --git a/data/opendistroforelasticsearch/queries.txt b/data/opendistroforelasticsearch/queries.txt new file mode 100644 index 0000000..e69de29