-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add performance test of WAND stopword tuning
- Loading branch information
1 parent
0650d6c
commit fd3ca75
Showing
3 changed files
with
338 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
// Copyright Vespa.ai. All rights reserved. | ||
package com.yahoo.test; | ||
|
||
import com.yahoo.prelude.query.AndItem; | ||
import com.yahoo.prelude.query.CompositeItem; | ||
import com.yahoo.prelude.query.Item; | ||
import com.yahoo.prelude.query.OrItem; | ||
import com.yahoo.prelude.query.WeakAndItem; | ||
import com.yahoo.search.*; | ||
import com.yahoo.search.result.*; | ||
import com.yahoo.search.searchchain.*; | ||
import com.yahoo.yolean.chain.After; | ||
import com.yahoo.yolean.chain.Before; | ||
import com.yahoo.data.access.*; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
@After(PhaseNames.TRANSFORMED_QUERY) | ||
@Before(PhaseNames.BLENDED_RESULT) | ||
public class MicroBmSearcher extends Searcher { | ||
|
||
Query changeRoot(Query query, CompositeItem newRoot) { | ||
Query newQuery = query.clone(); | ||
Item oldRoot = newQuery.getModel().getQueryTree().getRoot(); | ||
if (oldRoot instanceof CompositeItem old) { | ||
for (Item child : old.items()) { | ||
newRoot.addItem(child); | ||
} | ||
} | ||
newQuery.getModel().getQueryTree().setRoot(newRoot); | ||
return newQuery; | ||
} | ||
|
||
Set<String> getHitIds(Result result) { | ||
Set<String> set = new HashSet<>(); | ||
for (Hit hit : result.hits().asList()) { | ||
if (hit.isMeta()) continue; | ||
String id = hit.getDisplayId(); | ||
set.add(id); | ||
} | ||
return set; | ||
} | ||
|
||
double quality(Set<String> expected, Set<String> actual) { | ||
int count = 0; | ||
for (String id : expected) { | ||
if (actual.contains(id)) { | ||
++count; | ||
} | ||
} | ||
return (double)count / (double)expected.size(); | ||
} | ||
|
||
double timeQuery(Query query, Execution execution) { | ||
return timeQuery(query, execution, 5); | ||
} | ||
double timeQuery(Query query, Execution execution, int count) { | ||
List<Long> timings = new ArrayList<>(); | ||
for (int i = 0; i < count; i++) { | ||
long before = System.nanoTime(); | ||
Result result = execution.search(query); | ||
long after = System.nanoTime(); | ||
timings.add(after - before); | ||
} | ||
Collections.sort(timings); | ||
int idx = 20 * count / 100; | ||
return timings.get(idx) * 1.0e-6; | ||
} | ||
|
||
@Override | ||
public Result search(Query weakAndQuery, Execution execution) { | ||
Result weakAndResult = execution.search(weakAndQuery); | ||
execution.fill(weakAndResult); | ||
var weakAndSet = getHitIds(weakAndResult); | ||
|
||
Query orQuery = changeRoot(weakAndQuery, new OrItem()); | ||
long before = System.nanoTime(); | ||
Result orResult = execution.search(orQuery); | ||
long after = System.nanoTime(); | ||
double orTime = (after - before) * 1.0e-6; | ||
execution.fill(orResult); | ||
var orSet = getHitIds(orResult); | ||
|
||
Query andQuery = changeRoot(weakAndQuery, new AndItem()); | ||
Result andResult = execution.search(andQuery); | ||
execution.fill(andResult); | ||
var andSet = getHitIds(andResult); | ||
|
||
Query weakAndQuery20 = changeRoot(weakAndQuery, new WeakAndItem()); | ||
weakAndQuery20.properties().set("rankproperty.vespa.matching.weakand.stop_word_limit", "0.20"); | ||
Result weakAndResult20 = execution.search(weakAndQuery20); | ||
execution.fill(weakAndResult20); | ||
var weakAndSet20 = getHitIds(weakAndResult20); | ||
|
||
Query weakAndQuery05 = changeRoot(weakAndQuery, new WeakAndItem()); | ||
weakAndQuery05.properties().set("rankproperty.vespa.matching.weakand.stop_word_limit", "0.05"); | ||
Result weakAndResult05 = execution.search(weakAndQuery05); | ||
execution.fill(weakAndResult05); | ||
var weakAndSet05 = getHitIds(weakAndResult05); | ||
|
||
// double orTime = timeQuery(orQuery, execution, 1); | ||
double weakAndTime = timeQuery(weakAndQuery, execution); | ||
double weakAndTime20 = timeQuery(weakAndQuery20, execution); | ||
double weakAndTime05 = timeQuery(weakAndQuery05, execution); | ||
double andTime = timeQuery(andQuery, execution); | ||
|
||
Hit meta = new Hit("meta"); | ||
meta.setMeta(true); | ||
meta.setField("andQuality", quality(orSet, andSet)); | ||
meta.setField("weakAndQuality", quality(orSet, weakAndSet)); | ||
meta.setField("weakAndQuality20", quality(orSet, weakAndSet20)); | ||
meta.setField("weakAndQuality05", quality(orSet, weakAndSet05)); | ||
meta.setField("orHits", orResult.getTotalHitCount()); | ||
meta.setField("andHits", andResult.getTotalHitCount()); | ||
meta.setField("weakAndHits", weakAndResult.getTotalHitCount()); | ||
meta.setField("weakAndHits20", weakAndResult20.getTotalHitCount()); | ||
meta.setField("weakAndHits05", weakAndResult05.getTotalHitCount()); | ||
meta.setField("orTime", orTime); | ||
meta.setField("andTime", andTime); | ||
meta.setField("weakAndTime", weakAndTime); | ||
meta.setField("weakAndTime20", weakAndTime20); | ||
meta.setField("weakAndTime05", weakAndTime05); | ||
Result result = new Result(weakAndQuery); | ||
result.setTotalHitCount(weakAndResult.getTotalHitCount()); | ||
result.hits().add(meta); | ||
return result; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Copyright Vespa.ai. All rights reserved. | ||
|
||
require 'performance_test' | ||
require 'app_generator/search_app' | ||
require 'performance/fbench' | ||
require 'performance/wand_performance/wand_performance_specs' | ||
require 'pp' | ||
|
||
# Performance test measuring recall, hit counts and latency of Vespa WeakAnd
# (with different stop word limits) compared to plain AND and OR evaluation.
# The per-query numbers are produced by the MicroBmSearcher deployed with the
# application and returned as fields on the first hit of each search result.
class WandStopWordsTest < PerformanceTest

  # Names of the meta hit fields collected for every query.
  SAMPLED_FIELDS = %w[
    andQuality weakAndQuality weakAndQuality20 weakAndQuality05
    andHits orHits weakAndHits weakAndHits20 weakAndHits05
    andTime orTime weakAndTime weakAndTime20 weakAndTime05
  ].freeze

  def setup
    super
    set_owner('arnej')
  end

  def initialize(*args)
    super(*args)
  end

  def prepare
    super
  end

  # Deploys the search application with the benchmark searcher added to the
  # default search chain and a significance model configured, then starts it.
  def deploy_and_start
    significance_model = "https://data.vespa-cloud.com/tests/performance/significance_model/enwiki-20240801.json.zst"
    add_bundle(selfdir + 'MicroBmSearcher.java')
    searcher = Searcher.new('com.yahoo.test.MicroBmSearcher')
    deploy_app(
      SearchApp.new.
        sd(selfdir + 'wikimedia.sd').
        threads_per_search(1).
        container(Container.new.
                    search(Searching.new.
                             significance(Significance.new.model_url(significance_model)).
                             chain(Chain.new('default', 'vespa').add(searcher))).
                    docproc(DocumentProcessing.new).
                    documentapi(ContainerDocumentApi.new)))
    start
  end

  def test_wand_with_stopwords
    set_description('Test performance and quality of Vespa Wand with stop words')
    deploy_and_start
    if File.exist?(selfdir + 'just-1k.json')
      # for faster turnaround during development:
      doc_count = 1000
      feed_and_wait_for_docs('wikimedia', doc_count,
                             { :file => selfdir + 'just-1k.json',
                               :client => :vespa_feed_client
                             })
      assert_hitcount('yql=select title from wikimedia where true', doc_count)
      measure_wand_quality
    end
    doc_count = 1000000
    feed_file('enwiki-20240801-pages.1M.jsonl.zst')
    wait_for_hitcount('yql=select title from wikimedia where true', doc_count)
    measure_wand_quality
  end

  # Returns the larger of a and b.
  def max(a,b)
    a>b ? a : b
  end

  # Returns the smaller of a and b.
  def min(a,b)
    a<b ? a : b
  end

  # Returns numerator / denominator truncated to three decimals.
  # Returns 0.0 when the denominator is zero, so that a query without hits
  # (or with a zero timing) does not abort the whole run.
  def truncated_ratio(numerator, denominator)
    return 0.0 if denominator.zero?
    ((1000 * numerator) / denominator).to_i / 1000.0
  end

  # Runs the first 500 questions from the query file through the benchmark
  # searcher, prints a per-query summary line, and reports median and average
  # for every collected series.
  def measure_wand_quality
    samples = Hash.new { |hash, name| hash[name] = [] }
    q_file = download_file('squad2-questions.raw.141k.txt.zst', vespa.adminserver)
    vespa.adminserver.execute("zstdcat #{q_file} | head -n 1000 > #{q_file}.raw")
    vespa.adminserver.execute("mv #{q_file} #{selfdir}", :exceptiononfailure => false)
    (1..500).each do |line_no|
      line = vespa.adminserver.execute("sed -n #{line_no}p < #{q_file}.raw", :noecho => true)
      # Replace everything that is not a word character (including the trailing newline) with a space.
      line.gsub!(/\W/, ' ')
      q = "/search/?query=#{line}&hits=100&timeout=100"
      r = search(q)
      h = r.hit[0]
      SAMPLED_FIELDS.each { |name| samples[name].append(h.field[name]) }

      quality = h.field['weakAndQuality']
      wantedHits = max(h.field['andHits'], min(100, h.field['orHits']))
      hitsFactor = truncated_ratio(h.field['weakAndHits'], wantedHits)
      orHitsFactor = truncated_ratio(h.field['orHits'], h.field['weakAndHits'])
      speedup = truncated_ratio(h.field['orTime'], h.field['weakAndTime'])
      puts "quality: #{quality} speedup: #{speedup} with #{h.field['weakAndHits']} hits, factors #{hitsFactor} / #{orHitsFactor} for query: #{line}"
    end
    sz = samples['andQuality'].size
    puts "== Average and median over #{sz} results =="
    process("AND-recall", "recall@100", samples['andQuality'])
    process("WeakAnd-100-recall", "recall@100", samples['weakAndQuality'])
    process("WeakAnd-20-recall", "recall@100", samples['weakAndQuality20'])
    process("WeakAnd-5-recall", "recall@100", samples['weakAndQuality05'])

    process("AND-hits", "hits", samples['andHits'])
    process("WeakAnd-100-hits", "hits", samples['weakAndHits'])
    process("WeakAnd-20-hits", "hits", samples['weakAndHits20'])
    process("WeakAnd-5-hits", "hits", samples['weakAndHits05'])
    process("OR-hits", "hits", samples['orHits'])

    process("AND-ms", "latency", samples['andTime'])
    process("WeakAnd-100-ms", "latency", samples['weakAndTime'])
    process("WeakAnd-20-ms", "latency", samples['weakAndTime20'])
    process("WeakAnd-5-ms", "latency", samples['weakAndTime05'])
    process("OR-ms", "latency", samples['orTime'])
  end

  # Downloads the given feed file to the admin server, feeds it, and then
  # tries to move the downloaded file into the test directory (failure ignored).
  def feed_file(feed_file)
    node_file = download_file(feed_file, vespa.adminserver)
    feed({:file => node_file,
          :client => :vespa_feed_client,
          :compression => 'none',
          :localfile => true,
          :silent => true,
          :disable_tls => false})
    vespa.adminserver.execute("mv #{node_file} #{selfdir}", :exceptiononfailure => false)
  end

  def download_file(file_name, vespa_node)
    download_file_from_s3(file_name, vespa_node, 'wikipedia')
  end

  # Sorts values in place and reports their median (the element at index
  # size/2, i.e. the upper median for even sizes) and arithmetic mean.
  # The mean is computed in floating point so integer series such as hit
  # counts are not truncated. Nothing is reported for an empty series.
  def process(legend, type, values)
    sz = values.size
    if sz.zero?
      puts "#{legend}: no values collected, skipping report"
      return
    end
    values.sort!
    report(legend, type, values[sz/2], values.sum.to_f / sz)
  end

  # Prints one summary line and writes the median and average as metrics,
  # tagged with the given legend and type.
  def report(legend, type, median, avg)
    puts "#{legend}: median #{median} with average #{avg}"
    write_report([parameter_filler('legend', legend),
                  parameter_filler('type', type),
                  metric_filler('median', median),
                  metric_filler('average', avg)])
  end

  def teardown
    super
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Schema for the documents fed by the WAND stop word performance test.
schema wikimedia {

    document wikimedia {

        # Integer id, kept as an attribute and returned in summaries.
        field id type int {
            indexing: attribute | summary
        }

        # Title text: indexed with BM25 support and returned in summaries.
        field title type string {
            indexing: index | summary
            index: enable-bm25
        }

        # Body text: indexed with BM25 support, not part of any summary.
        field text type string {
            indexing: index
            index: enable-bm25
        }

    }

    # Minimal summary class containing only the id field.
    document-summary small {
        summary id {}
    }

    # Queries without an explicit field search both title and text.
    fieldset default {
        fields: title, text
    }

    rank-profile default {
        # Stop word limit used by WeakAnd unless a query overrides it through
        # the rankproperty.vespa.matching.weakand.stop_word_limit property.
        rank-properties {
            vespa.matching.weakand.stop_word_limit: 1.0
        }
        first-phase {
            # expression: bm25(title) + bm25(text)
            expression: nativeRank
        }
    }

}