Skip to content

Commit

Permalink
Merge pull request #4233 from vespa-engine/vekterli/add-mmap-vs-direc…
Browse files Browse the repository at this point in the history
…tio-search-test

Add WIP of test that compares search performance with mmap and Direct IO
  • Loading branch information
vekterli authored Oct 22, 2024
2 parents 130be08 + 1b43f92 commit 65fd5ec
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 3 deletions.
6 changes: 3 additions & 3 deletions lib/performance/stat.rb
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,8 @@ def printable_result(params={})
rb.open_group('System')
rb.single_metric('CPU utilization', m[:cpu_util] * 100.0, :suffix => '%')
rb.avg_metric('Number of forks done', m[:fork])
rb.avg_metric('Pages paged out', m[:swap][:paged_out])
rb.avg_metric('Pages swapped out', m[:swap][:swapped_out],:warn_if_exceeding => 0)
rb.avg_metric('Pages swapped out', m[:swap][:swapped_out], :warn_if_exceeding => 0)
rb.avg_metric('Pages swapped in', m[:swap][:swapped_in], :warn_if_exceeding => 0)
rb.close_group

rb.open_group('Network')
Expand All @@ -351,7 +351,7 @@ def printable_result(params={})
rb.avg_metric('Connections timed out', m[:network][:tcp][:conn_timeout], :warn_if_exceeding => 0)
rb.avg_metric('Segments sent', m[:network][:tcp][:out_segs])
rb.avg_metric('Segments received', m[:network][:tcp][:in_segs])
rb.avg_metric('Segments retransmitted', m[:network][:tcp][:retrans_segs], :warn_if_exceeding => 0)
rb.avg_metric('Segments retransmitted', m[:network][:tcp][:retrans_segs])
rb.avg_metric('Listen overflows', m[:network][:tcp][:listen_overflow], :warn_if_exceeding => 0)
rb.close_group
m[:network][:if].each do |ni, ni_m|
Expand Down
2 changes: 2 additions & 0 deletions tests/performance/mmap_vs_directio/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
enwiki-20240801-pages.6819k.jsonl.zst
squad2-questions.fbench.141k.txt
130 changes: 130 additions & 0 deletions tests/performance/mmap_vs_directio/mmap_vs_directio.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Copyright Vespa.ai. All rights reserved.

require 'performance_test'
require 'app_generator/search_app'
require 'performance/stat'

# Performance test that feeds an English Wikipedia corpus once and then replays
# the same fbench query set against the content node twice: first with the
# proton `search.io` config set to MMAP, then redeployed with DIRECTIO.
# System stat deltas are printed for the feed and for every benchmark pass.
class MmapVsDirectIoTest < PerformanceTest

  # Timeout for this test, in seconds (20 minutes).
  def timeout_seconds
    20 * 60
  end

  # Runs the base class setup, then registers the test owner.
  def setup
    super
    set_owner('vekterli')
  end

  # No test-specific cleanup; defers entirely to the base class.
  def teardown
    super
  end

  # Feeds the corpus, flushes it to disk, and benchmarks queries at several
  # client counts, first with mmap-backed and then with Direct IO-backed search.
  def test_wikipedia_corpus_search_performance
    set_description('Test search performance on English Wikipedia corpus and query set ' \
                    'when file reading is done via either mmap or Direct IO')

    corpus_file_name = 'enwiki-20240801-pages.6819k.jsonl.zst'
    query_file_name = 'squad2-questions.fbench.141k.txt'
    client_counts = [8, 16, 32, 64]

    # (Re)binds the remote node handles; these must be looked up again after
    # every deployment since the model changes underneath us.
    bind_nodes = lambda do
      @search_node = vespa.search['search'].first
      @container = vespa.container.values.first
    end

    # Runs one profiled benchmark pass per client count, reporting stat deltas for each.
    sweep = lambda do |type|
      client_counts.each do |n_clients|
        report_io_stat_deltas { benchmark_queries(query_file_name, type, n_clients, false) }
      end
    end

    deploy_app(make_app(search_direct_io: false))
    bind_nodes.call
    start

    report_io_stat_deltas { feed_file(corpus_file_name) }

    # Push everything into a disk index, then print the on-disk footprint.
    @search_node.trigger_flush
    index_dir = "#{Environment.instance.vespa_home}/var/db/vespa/search/cluster.search/"
    @search_node.execute("du -hS #{index_dir}")

    # Single warmup round with many clients, which helps measure contention for
    # paging in data. It is deliberately not flagged as warmup so that profiling
    # stays enabled for it too.
    puts "Warming up mmap'ed region with 64 clients"
    report_io_stat_deltas { benchmark_queries(query_file_name, 'mmap_warmup', 64, false) }

    puts "Searching with mmap-backed search stores"
    sweep.call('mmap')

    vespa.stop_content_node('search', 0)

    puts "Redeploying with Direct IO for searches"
    deploy_app(make_app(search_direct_io: true))
    bind_nodes.call

    vespa.start_content_node('search', 0)
    sleep 2 # Give container health pings time to catch up

    puts "Searching with Direct IO-backed search stores"
    sweep.call('directio')

    stop
  end

  # Downloads +feed_file+ to the admin server node and stream-feeds its
  # zstd-decompressed contents. When +n_docs+ is positive, only the first
  # +n_docs+ lines (one document per line, as the source is JSONL) are fed.
  def feed_file(feed_file, n_docs = -1)
    node_file = download_file(feed_file, vespa.adminserver)
    feed_cmd = "zstdcat #{node_file}"
    # `head` is a cheap way to cap the number of documents in a JSONL stream.
    feed_cmd += " | head -#{n_docs}" unless n_docs <= 0

    feeder_opts = { client: :vespa_feed_client,
                    compression: 'none',
                    localfile: true,
                    silent: true,
                    disable_tls: false }
    run_stream_feeder(feed_cmd, [], feeder_opts)
  end

  # Fetches +file_name+ from the 'wikipedia' S3 location onto +vespa_node+.
  def download_file(file_name, vespa_node)
    download_file_from_s3(file_name, vespa_node, 'wikipedia')
  end

  # Builds the application under test. +search_direct_io+ selects whether the
  # proton config override sets search io to 'DIRECTIO' (true) or 'MMAP' (false).
  def make_app(search_direct_io:)
    app = SearchApp.new.sd(selfdir + 'wikimedia.sd')
    default_container = Container.new('default')
                                 .search(Searching.new)
                                 .docproc(DocumentProcessing.new)
                                 .documentapi(ContainerDocumentApi.new)
    app = app.container(default_container)
             .indexing_cluster('default')
             .indexing_chain('indexing')

    io_mode = search_direct_io ? 'DIRECTIO' : 'MMAP'
    proton_override = ConfigOverride.new('vespa.config.search.core.proton')
                                    .add('search', ConfigValue.new('io', io_mode))
    app.config(proton_override)
  end

  # Takes a performance snapshot of the search node before and after running
  # the given block, then prints the printable result for that period.
  def report_io_stat_deltas
    snapshot_start = @search_node.performance_snapshot
    yield
    snapshot_end = @search_node.performance_snapshot
    period = Perf::Stat.snapshot_period(snapshot_start, snapshot_end)
    puts period.printable_result
  end

  # Runs fbench with +clients+ clients for +runtime+ against the container using
  # +query_file+, labelling results "<type>_<clients>". Profiling is started and
  # reported around the run unless +warmup+ is set. Finally prints the first 12
  # lines of the result file.
  # TODO dedupe
  def benchmark_queries(query_file, type, clients, warmup = false, runtime = 20)
    node_file = download_file(query_file, @container)
    label = "#{type}_#{clients}"
    result_file = dirs.tmpdir + "result_#{label}.txt" # TODO don't include?
    fillers = [['label', label], ['type', type], ['clients', clients]].map do |name, value|
      parameter_filler(name, value)
    end
    fbench_opts = { clients: clients,
                    use_post: false,
                    runtime: runtime,
                    result_file: result_file }

    profiler_start unless warmup
    run_fbench2(@container, node_file, fbench_opts, fillers)
    profiler_report(label) unless warmup
    @container.execute("head -12 #{result_file}")
  end

end

16 changes: 16 additions & 0 deletions tests/performance/mmap_vs_directio/prepare_fbench_queries.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright Vespa.ai. All rights reserved.
require 'erb'

# Simple script which takes in newline-separated raw queries on stdin (or a file)
# and outputs an fbench-formatted HTTP GET query per line. Only `yql` and `query`
# parameters are populated; other parameters should be appended explicitly by tests.

# Percent-encodes +str+ with ERB::Util.url_encode so that it can be embedded
# as a parameter value in a URL query string. Returns the encoded string.
def uri_enc(str)
ERB::Util.url_encode(str)
end

# The YQL statement is identical for every query, so encode it once up front.
yql = uri_enc('select * from wikimedia where userQuery()')

# Emit one fbench GET line per raw query read from stdin (or the files named in ARGV).
ARGF.each_line do |line|
  # each_line yields the line *including* its terminator; drop it so the query
  # parameter does not end in an encoded newline (%0A, or %0D%0A for CRLF input).
  query = line.chomp
  # Blank lines would only produce requests with an empty `query` parameter.
  next if query.strip.empty?

  puts "/search/?yql=#{yql}&query=#{uri_enc(query)}"
end
24 changes: 24 additions & 0 deletions tests/performance/mmap_vs_directio/wikimedia.sd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Schema for the documents fed by the mmap vs. Direct IO search performance test.
schema wikimedia {

document wikimedia {
# Title field: indexed with BM25 enabled, and also included in the summary.
field title type string {
indexing: index | summary
index: enable-bm25
}
# Text field: indexed with BM25 enabled; not included in the summary.
field text type string {
indexing: index
index: enable-bm25
}
}

# Groups both fields under the name `default`.
# NOTE(review): presumably this is what lets the unscoped userQuery() used by
# the generated fbench queries match against both fields - confirm.
fieldset default {
fields: title, text
}

# Default ranking: the sum of the BM25 scores of the two indexed fields.
rank-profile default {
first-phase {
expression: bm25(title) + bm25(text)
}
}

}

0 comments on commit 65fd5ec

Please sign in to comment.