Skip to content

Commit

Permalink
add performance test of WAND stopword tuning
Browse files Browse the repository at this point in the history
  • Loading branch information
arnej27959 committed Oct 30, 2024
1 parent 0650d6c commit fd3ca75
Show file tree
Hide file tree
Showing 3 changed files with 338 additions and 0 deletions.
133 changes: 133 additions & 0 deletions tests/performance/wand_stopwords/MicroBmSearcher.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright Vespa.ai. All rights reserved.
package com.yahoo.test;

import com.yahoo.prelude.query.AndItem;
import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.OrItem;
import com.yahoo.prelude.query.WeakAndItem;
import com.yahoo.search.*;
import com.yahoo.search.result.*;
import com.yahoo.search.searchchain.*;
import com.yahoo.yolean.chain.After;
import com.yahoo.yolean.chain.Before;
import com.yahoo.data.access.*;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Micro-benchmark searcher comparing the incoming query against variants of itself.
 *
 * For every incoming query this searcher runs the query as-is (reported under the
 * "weakAnd" labels), and also with the root replaced by an OR, an AND, and two
 * WeakAnd roots that set the {@code stop_word_limit} rank property to 0.20 and 0.05.
 * It returns a result containing a single meta hit with, for each variant, the total
 * hit count, the latency in milliseconds, and the fraction of the OR hits that the
 * variant also returned.
 *
 * NOTE(review): the "weakAnd" labels assume the incoming query root is a WeakAnd;
 * nothing in this class enforces that - confirm against the query profile/test setup.
 */
@After(PhaseNames.TRANSFORMED_QUERY)
@Before(PhaseNames.BLENDED_RESULT)
public class MicroBmSearcher extends Searcher {

    /** Query property carrying the WeakAnd stop word limit as a rank property. */
    private static final String STOP_WORD_LIMIT_PROPERTY =
            "rankproperty.vespa.matching.weakand.stop_word_limit";

    /** Number of timed repetitions used when the caller does not specify one. */
    private static final int DEFAULT_TIMING_RUNS = 5;

    /**
     * Returns a clone of the query where the root is replaced by {@code newRoot}.
     *
     * The children of a composite old root are moved into {@code newRoot}. An old
     * root that is a single non-composite item (e.g. a one-term query) is added as
     * the only child, so the term is not lost. An empty query tree yields an empty
     * {@code newRoot}.
     *
     * @param query   the query to copy; it is cloned before any change is made
     * @param newRoot the composite that becomes the root of the cloned query
     * @return the cloned query with the new root
     */
    Query changeRoot(Query query, CompositeItem newRoot) {
        Query newQuery = query.clone();
        var tree = newQuery.getModel().getQueryTree();
        Item oldRoot = tree.getRoot();
        if (oldRoot instanceof CompositeItem old) {
            for (Item child : old.items()) {
                newRoot.addItem(child);
            }
        } else if (oldRoot != null && !tree.isEmpty()) {
            // Single-term query: keep the term instead of searching with an empty composite.
            newRoot.addItem(oldRoot);
        }
        tree.setRoot(newRoot);
        return newQuery;
    }

    /**
     * Collects the display ids of all non-meta hits in the result.
     *
     * @param result the result to read hits from
     * @return the set of display ids; duplicates collapse into one entry
     */
    Set<String> getHitIds(Result result) {
        Set<String> ids = new HashSet<>();
        for (Hit hit : result.hits().asList()) {
            if (hit.isMeta()) continue;
            ids.add(hit.getDisplayId());
        }
        return ids;
    }

    /**
     * Computes the fraction of {@code expected} ids that are also in {@code actual}.
     *
     * @param expected the reference id set
     * @param actual   the id set being evaluated
     * @return a value in [0, 1]; 1.0 when {@code expected} is empty, since there is
     *         nothing that could have been missed (previously this produced NaN)
     */
    double quality(Set<String> expected, Set<String> actual) {
        if (expected.isEmpty()) return 1.0;
        int found = 0;
        for (String id : expected) {
            if (actual.contains(id)) {
                ++found;
            }
        }
        return (double) found / (double) expected.size();
    }

    /** Times the query using the default number of repetitions. */
    double timeQuery(Query query, Execution execution) {
        return timeQuery(query, execution, DEFAULT_TIMING_RUNS);
    }

    /**
     * Runs the query {@code count} times and returns a low-percentile latency.
     *
     * @param query     the query to execute repeatedly
     * @param execution the execution used to run the query
     * @param count     the number of repetitions; must be at least 1
     * @return the timing at sorted index {@code 20 * count / 100} (the 20th percentile;
     *         the second fastest of 5 runs), in milliseconds
     * @throws IllegalArgumentException if {@code count} is less than 1
     */
    double timeQuery(Query query, Execution execution, int count) {
        if (count < 1) {
            throw new IllegalArgumentException("count must be at least 1, got " + count);
        }
        List<Long> timings = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
            long before = System.nanoTime();
            execution.search(query); // only the elapsed time matters, the result is discarded
            long after = System.nanoTime();
            timings.add(after - before);
        }
        Collections.sort(timings);
        int idx = 20 * count / 100;
        return timings.get(idx) * 1.0e-6; // nanoseconds -> milliseconds
    }

    /**
     * Runs all query variants and reports their quality, hit counts and timings.
     *
     * @param weakAndQuery the incoming query, used unchanged as the baseline
     * @param execution    the execution used to run every variant
     * @return a result for the incoming query holding one meta hit with the
     *         measurements, and the baseline's total hit count
     */
    @Override
    public Result search(Query weakAndQuery, Execution execution) {
        // Baseline: the incoming query as-is.
        Result weakAndResult = execution.search(weakAndQuery);
        execution.fill(weakAndResult);
        Set<String> weakAndSet = getHitIds(weakAndResult);

        // OR is the reference for quality. It is timed from this single run only,
        // unlike the other variants which are re-run through timeQuery() below.
        Query orQuery = changeRoot(weakAndQuery, new OrItem());
        long before = System.nanoTime();
        Result orResult = execution.search(orQuery);
        long after = System.nanoTime();
        double orTime = (after - before) * 1.0e-6;
        execution.fill(orResult);
        Set<String> orSet = getHitIds(orResult);

        Query andQuery = changeRoot(weakAndQuery, new AndItem());
        Result andResult = execution.search(andQuery);
        execution.fill(andResult);
        Set<String> andSet = getHitIds(andResult);

        Query weakAndQuery20 = changeRoot(weakAndQuery, new WeakAndItem());
        weakAndQuery20.properties().set(STOP_WORD_LIMIT_PROPERTY, "0.20");
        Result weakAndResult20 = execution.search(weakAndQuery20);
        execution.fill(weakAndResult20);
        Set<String> weakAndSet20 = getHitIds(weakAndResult20);

        Query weakAndQuery05 = changeRoot(weakAndQuery, new WeakAndItem());
        weakAndQuery05.properties().set(STOP_WORD_LIMIT_PROPERTY, "0.05");
        Result weakAndResult05 = execution.search(weakAndQuery05);
        execution.fill(weakAndResult05);
        Set<String> weakAndSet05 = getHitIds(weakAndResult05);

        // Timing runs happen after every variant has been executed once above.
        double weakAndTime = timeQuery(weakAndQuery, execution);
        double weakAndTime20 = timeQuery(weakAndQuery20, execution);
        double weakAndTime05 = timeQuery(weakAndQuery05, execution);
        double andTime = timeQuery(andQuery, execution);

        Hit meta = new Hit("meta");
        meta.setMeta(true);
        meta.setField("andQuality", quality(orSet, andSet));
        meta.setField("weakAndQuality", quality(orSet, weakAndSet));
        meta.setField("weakAndQuality20", quality(orSet, weakAndSet20));
        meta.setField("weakAndQuality05", quality(orSet, weakAndSet05));
        meta.setField("orHits", orResult.getTotalHitCount());
        meta.setField("andHits", andResult.getTotalHitCount());
        meta.setField("weakAndHits", weakAndResult.getTotalHitCount());
        meta.setField("weakAndHits20", weakAndResult20.getTotalHitCount());
        meta.setField("weakAndHits05", weakAndResult05.getTotalHitCount());
        meta.setField("orTime", orTime);
        meta.setField("andTime", andTime);
        meta.setField("weakAndTime", weakAndTime);
        meta.setField("weakAndTime20", weakAndTime20);
        meta.setField("weakAndTime05", weakAndTime05);

        Result result = new Result(weakAndQuery);
        result.setTotalHitCount(weakAndResult.getTotalHitCount());
        result.hits().add(meta);
        return result;
    }

}
170 changes: 170 additions & 0 deletions tests/performance/wand_stopwords/wand_stopwords.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Copyright Vespa.ai. All rights reserved.

require 'performance_test'
require 'app_generator/search_app'
require 'performance/fbench'
require 'performance/wand_performance/wand_performance_specs'
require 'pp'

# Performance test measuring recall, hit counts and latency of WeakAnd with
# different stop word limits, compared against plain AND and OR queries.
#
# The measurements are produced per query by the MicroBmSearcher container
# component, which returns them as fields on a single meta hit.
class WandStopWordsTest < PerformanceTest

  def setup
    super
    set_owner('arnej')
  end

  def initialize(*args)
    super(*args)
  end

  def prepare
    super
  end

  # Deploys the application with the benchmark searcher in the default search
  # chain and a significance model, then starts it.
  def deploy_and_start
    significance_model = "https://data.vespa-cloud.com/tests/performance/significance_model/enwiki-20240801.json.zst"
    add_bundle(selfdir + 'MicroBmSearcher.java')
    searcher = Searcher.new('com.yahoo.test.MicroBmSearcher')
    deploy_app(
      SearchApp.new.
        sd(selfdir + 'wikimedia.sd').
        threads_per_search(1).
        container(Container.new.
                    search(Searching.new.
                             significance(Significance.new.model_url(significance_model)).
                             chain(Chain.new('default', 'vespa').add(searcher))).
                    docproc(DocumentProcessing.new).
                    documentapi(ContainerDocumentApi.new)))
    start
  end

  def test_wand_with_stopwords
    set_description('Test performance and quality of Vespa Wand with stop words')
    deploy_and_start
    if File.exist?(selfdir + 'just-1k.json')
      # for faster turnaround during development:
      doc_count = 1000
      feed_and_wait_for_docs('wikimedia', doc_count,
                             { :file => selfdir + 'just-1k.json',
                               :client => :vespa_feed_client
                             })
      assert_hitcount('yql=select title from wikimedia where true', doc_count)
      measure_wand_quality
    end
    doc_count = 1000000
    feed_file('enwiki-20240801-pages.1M.jsonl.zst')
    wait_for_hitcount('yql=select title from wikimedia where true', doc_count)
    measure_wand_quality
  end

  def max(a,b)
    a>b ? a : b
  end

  def min(a,b)
    a<b ? a : b
  end

  # Returns numerator/denominator truncated to three decimals.
  # Returns 0.0 when the denominator is zero, so a query with no hits (or a
  # zero timing) cannot abort the whole run with a division error.
  def truncated_ratio(numerator, denominator)
    return 0.0 if denominator.zero?
    (1000 * numerator / denominator).to_i / 1000.0
  end

  # Runs the first 500 questions through the benchmark searcher, prints a
  # per-query summary line, and reports median and average of every metric.
  def measure_wand_quality
    andQ = []
    orQ = []
    waQ = []
    wa20Q = []
    wa05Q = []
    andH = []
    orH = []
    waH = []
    wa20H = []
    wa05H = []
    andT = []
    orT = []
    waT = []
    wa20T = []
    wa05T = []
    q_file = download_file('squad2-questions.raw.141k.txt.zst', vespa.adminserver)
    vespa.adminserver.execute("zstdcat #{q_file} | head -n 1000 > #{q_file}.raw")
    vespa.adminserver.execute("mv #{q_file} #{selfdir}", :exceptiononfailure => false)
    (1..500).each do |line_no|
      line = vespa.adminserver.execute("sed -n #{line_no}p < #{q_file}.raw", :noecho => true)
      # Replace everything that is not a word character (including the trailing newline) with a space.
      line.gsub!(/\W/, ' ')
      q = "/search/?query=#{line}&hits=100&timeout=100"
      r = search(q)
      # The searcher returns its measurements as the first (meta) hit.
      h = r.hit[0]
      andQ.append(h.field['andQuality'])
      waQ.append(h.field['weakAndQuality'])
      wa20Q.append(h.field['weakAndQuality20'])
      wa05Q.append(h.field['weakAndQuality05'])
      andH.append(h.field['andHits'])
      orH.append(h.field['orHits'])
      waH.append(h.field['weakAndHits'])
      wa20H.append(h.field['weakAndHits20'])
      wa05H.append(h.field['weakAndHits05'])
      andT.append(h.field['andTime'])
      orT.append(h.field['orTime'])
      waT.append(h.field['weakAndTime'])
      wa20T.append(h.field['weakAndTime20'])
      wa05T.append(h.field['weakAndTime05'])
      quality = h.field['weakAndQuality']
      # At least as many hits as AND gives, and up to 100 (the requested hits) if OR has that many.
      wantedHits = max(h.field['andHits'], min(100, h.field['orHits']))
      hitsFactor = truncated_ratio(h.field['weakAndHits'], wantedHits)
      orHitsFactor = truncated_ratio(h.field['orHits'], h.field['weakAndHits'])
      speedup = truncated_ratio(h.field['orTime'], h.field['weakAndTime'])
      puts "quality: #{quality} speedup: #{speedup} with #{h.field['weakAndHits']} hits, factors #{hitsFactor} / #{orHitsFactor} for query: #{line}"
    end
    sz = andQ.size
    puts "== Average and median over #{sz} results =="
    process("AND-recall", "recall@100", andQ)
    process("WeakAnd-100-recall", "recall@100", waQ)
    process("WeakAnd-20-recall", "recall@100", wa20Q)
    process("WeakAnd-5-recall", "recall@100", wa05Q)

    process("AND-hits", "hits", andH)
    process("WeakAnd-100-hits", "hits", waH)
    process("WeakAnd-20-hits", "hits", wa20H)
    process("WeakAnd-5-hits", "hits", wa05H)
    process("OR-hits", "hits", orH)

    process("AND-ms", "latency", andT)
    process("WeakAnd-100-ms", "latency", waT)
    process("WeakAnd-20-ms", "latency", wa20T)
    process("WeakAnd-5-ms", "latency", wa05T)
    process("OR-ms", "latency", orT)
  end

  # Downloads the given feed file to the admin server, feeds it, and then
  # tries to move the downloaded file into the test directory (best effort).
  def feed_file(feed_file)
    node_file = download_file(feed_file, vespa.adminserver)
    feed({:file => node_file,
          :client => :vespa_feed_client,
          :compression => 'none',
          :localfile => true,
          :silent => true,
          :disable_tls => false})
    vespa.adminserver.execute("mv #{node_file} #{selfdir}", :exceptiononfailure => false)
  end

  def download_file(file_name, vespa_node)
    download_file_from_s3(file_name, vespa_node, 'wikipedia')
  end

  # Reports the median (upper middle element) and the average of the values.
  # Nothing is reported for an empty list. The input array is left unmodified
  # and the average is computed in floating point, so integer metrics such as
  # hit counts are not truncated.
  def process(legend, type, values)
    sz = values.size
    return if sz == 0
    sorted = values.sort
    report(legend, type, sorted[sz/2], sorted.sum.fdiv(sz))
  end

  def report(legend, type, median, avg)
    puts "#{legend}: median #{median} with average #{avg}"
    write_report([parameter_filler('legend', legend),
                  parameter_filler('type', type),
                  metric_filler('median', median),
                  metric_filler('average', avg)])
  end

  def teardown
    super
  end
end
35 changes: 35 additions & 0 deletions tests/performance/wand_stopwords/wikimedia.sd
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Schema for the documents used by the WAND stop word performance test.
schema wikimedia {

document wikimedia {
# Numeric document id, kept as an attribute and included in summaries.
field id type int {
indexing: attribute | summary
}
# Title text: indexed with bm25 enabled and included in summaries.
field title type string {
indexing: index | summary
index: enable-bm25
}
# Body text: indexed with bm25 enabled, not part of any summary.
field text type string {
indexing: index
index: enable-bm25
}
}

# Minimal summary class containing only the id field.
document-summary small {
summary id {}
}

# Fieldset named default, covering both title and text.
# NOTE(review): presumably what unqualified query terms search - confirm.
fieldset default {
fields: title, text
}

rank-profile default {
# Stop word limit for this profile. MicroBmSearcher sets the same
# property per query (to 0.20 and 0.05) for the variants it compares.
# NOTE(review): 1.0 presumably means no terms are treated as stop words - confirm.
rank-properties {
vespa.matching.weakand.stop_word_limit: 1.0
}
first-phase {
# Alternative bm25-based ranking, currently disabled in favour of nativeRank:
# expression: bm25(title) + bm25(text)
expression: nativeRank
}
}

}

0 comments on commit fd3ca75

Please sign in to comment.