From 3896470d81fbb813fcede43c72cf7ed1107196d1 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:53:32 +0100 Subject: [PATCH 01/80] feat: add cached peer book with higher ttls --- main.go | 6 +++++ server.go | 21 ++++++++++++++- server_addr_book.go | 64 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 server_addr_book.go diff --git a/main.go b/main.go index 8b0fc55..7c33147 100644 --- a/main.go +++ b/main.go @@ -38,6 +38,12 @@ func main() { EnvVars: []string{"SOMEGUY_ACCELERATED_DHT"}, Usage: "run the accelerated DHT client", }, + &cli.BoolFlag{ + Name: "cached-addr-book", + Value: true, + EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK"}, + Usage: "use separate cached address book instead of the one provided by the libp2p host", + }, &cli.StringSliceFlag{ Name: "provider-endpoints", Value: cli.NewStringSlice(cidContactEndpoint), diff --git a/server.go b/server.go index 2a50ec8..064ab32 100644 --- a/server.go +++ b/server.go @@ -22,7 +22,9 @@ import ( "github.com/libp2p/go-libp2p" dht "github.com/libp2p/go-libp2p-kad-dht" "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/core/routing" + "github.com/libp2p/go-libp2p/p2p/host/peerstore/pstoremem" "github.com/libp2p/go-libp2p/p2p/net/connmgr" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -42,6 +44,7 @@ func withRequestLogger(next http.Handler) http.Handler { type config struct { listenAddress string acceleratedDHTClient bool + cachedAddrBook bool contentEndpoints []string peerEndpoints []string @@ -80,6 +83,14 @@ func start(ctx context.Context, cfg *config) error { dhtRouting = standardDHT } + var cachedAddrBook peerstore.AddrBook + + if cfg.cachedAddrBook { + fmt.Println("Using cached address book to speed up peer discovery") + cachedAddrBook = pstoremem.NewAddrBook() + go manageAddrBook(ctx, cachedAddrBook, h) + } + crRouters, err := getCombinedRouting(cfg.contentEndpoints, dhtRouting) if err != nil { return err @@ -109,11 +120,19 @@ func start(ctx context.Context, cfg *config) error { _ = tp.Shutdown(ctx) }() + handlerOpts := []server.Option{ + server.WithPrometheusRegistry(prometheus.DefaultRegisterer), + } + + if cachedAddrBook != nil { + handlerOpts = append(handlerOpts, server.WithCachedAddrBook(cachedAddrBook)) + } + handler := server.Handler(&composableRouter{ providers: crRouters, peers: prRouters, ipns: ipnsRouters, - }, server.WithPrometheusRegistry(prometheus.DefaultRegisterer)) + }, handlerOpts...) // Add CORS. 
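// The server_addr_book.go file added below hinges on a two-tier TTL scheme:
// addresses of currently connected peers are pinned with an effectively
// infinite TTL, and on disconnect they are demoted to a 48-hour window in
// which they can still be probed. A minimal sketch of that demotion, using
// the pstoremem API and the constants this patch defines (the peer ID and
// addresses are placeholders):

func ttlSketch(p peer.ID, addrs []ma.Multiaddr) {
	ab := pstoremem.NewAddrBook()
	// While the peer is connected, pin its addresses so they never expire.
	ab.AddAddrs(p, addrs, ConnectedAddrTTL) // math.MaxInt64
	// Once EvtPeerConnectednessChanged reports a disconnect, start the clock:
	// the same entries now expire after RecentlyConnectedAddrTTL (48h).
	ab.UpdateAddrs(p, ConnectedAddrTTL, RecentlyConnectedAddrTTL)
}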
handler = cors.New(cors.Options{ diff --git a/server_addr_book.go b/server_addr_book.go new file mode 100644 index 0000000..b982455 --- /dev/null +++ b/server_addr_book.go @@ -0,0 +1,64 @@ +package main + +import ( + "context" + "io" + "math" + "time" + + "github.com/libp2p/go-libp2p/core/event" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/network" + "github.com/libp2p/go-libp2p/core/peerstore" +) + +// By default, we keep recently connected peers for 48 hours, which leaves enough to probe +const RecentlyConnectedAddrTTL = time.Hour * 48 +const ConnectedAddrTTL = math.MaxInt64 + +func manageAddrBook(ctx context.Context, addrBook peerstore.AddrBook, host host.Host) { + sub, err := host.EventBus().Subscribe([]interface{}{ + &event.EvtPeerIdentificationCompleted{}, + &event.EvtPeerConnectednessChanged{}, + }) + if err != nil { + logger.Errorf("failed to subscribe to peer identification events: %v", err) + return + } + defer sub.Close() + + for { + select { + case <-ctx.Done(): + cabCloser, ok := addrBook.(io.Closer) + if ok { + errClose := cabCloser.Close() + if errClose != nil { + logger.Warnf("failed to close addr book: %v", errClose) + } + } + return + case ev := <-sub.Out(): + switch ev := ev.(type) { + case event.EvtPeerIdentificationCompleted: + if ev.SignedPeerRecord != nil { + cab, ok := peerstore.GetCertifiedAddrBook(addrBook) + if ok { + ttl := RecentlyConnectedAddrTTL + if host.Network().Connectedness(ev.Peer) == network.Connected { + ttl = ConnectedAddrTTL + } + _, err := cab.ConsumePeerRecord(ev.SignedPeerRecord, ttl) + if err != nil { + logger.Warnf("failed to consume signed peer record: %v", err) + } + } + } + case event.EvtPeerConnectednessChanged: + if ev.Connectedness != network.Connected { + addrBook.UpdateAddrs(ev.Peer, ConnectedAddrTTL, RecentlyConnectedAddrTTL) + } + } + } + } +} From 7dd33ca57dbfca4a930cb2d8e1325a1544c19a7c Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:15:53 +0100 Subject: [PATCH 02/80] feat: initial implementation of active peer probing --- go.mod | 10 +++- go.sum | 12 ++++ main.go | 3 +- server.go | 12 +--- server_addr_book.go | 126 +++++++++++++++++++++++++++++++++++++--- server_cached_router.go | 88 ++++++++++++++++++++++++++++ 6 files changed, 231 insertions(+), 20 deletions(-) create mode 100644 server_cached_router.go diff --git a/go.mod b/go.mod index 71c327e..59fd59d 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/ipfs/boxo v0.24.4-0.20241119003055-e38f236348d6 github.com/ipfs/go-cid v0.4.1 github.com/ipfs/go-log/v2 v2.5.1 - github.com/libp2p/go-libp2p v0.37.0 + github.com/libp2p/go-libp2p v0.37.1 github.com/libp2p/go-libp2p-kad-dht v0.28.1 github.com/libp2p/go-libp2p-record v0.2.0 github.com/multiformats/go-multiaddr v0.13.0 @@ -34,6 +34,7 @@ require ( github.com/andybalholm/brotli v1.1.0 // indirect github.com/benbjohnson/clock v1.3.5 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/btcsuite/btcd v0.20.1-beta // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/cgroups v1.1.0 // indirect @@ -73,16 +74,20 @@ require ( github.com/libp2p/go-cidranger v1.1.0 // indirect github.com/libp2p/go-flow-metrics v0.2.0 // indirect github.com/libp2p/go-libp2p-asn-util v0.4.1 // indirect + github.com/libp2p/go-libp2p-core v0.3.0 // indirect github.com/libp2p/go-libp2p-kbucket v0.6.4 // indirect + github.com/libp2p/go-libp2p-peerstore v0.1.4 // indirect 
github.com/libp2p/go-libp2p-routing-helpers v0.7.4 // indirect github.com/libp2p/go-libp2p-xor v0.1.0 // indirect github.com/libp2p/go-msgio v0.3.0 // indirect github.com/libp2p/go-nat v0.2.0 // indirect github.com/libp2p/go-netroute v0.2.1 // indirect + github.com/libp2p/go-openssl v0.1.0 // indirect github.com/libp2p/go-reuseport v0.4.0 // indirect github.com/libp2p/go-yamux/v4 v4.0.1 // indirect github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-pointer v0.0.1 // indirect github.com/miekg/dns v1.1.62 // indirect github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b // indirect github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc // indirect @@ -93,7 +98,7 @@ require ( github.com/multiformats/go-multiaddr-dns v0.4.0 // indirect github.com/multiformats/go-multiaddr-fmt v0.1.0 // indirect github.com/multiformats/go-multicodec v0.9.0 // indirect - github.com/multiformats/go-multistream v0.5.0 // indirect + github.com/multiformats/go-multistream v0.6.0 // indirect github.com/multiformats/go-varint v0.0.7 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/onsi/ginkgo/v2 v2.20.2 // indirect @@ -129,6 +134,7 @@ require ( github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/samber/lo v1.47.0 // indirect github.com/slok/go-http-metrics v0.12.0 // indirect + github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 // indirect diff --git a/go.sum b/go.sum index 0d4312f..67fe7a3 100644 --- a/go.sum +++ b/go.sum @@ -28,6 +28,7 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bradfitz/go-smtpd v0.0.0-20170404230938-deb6d6237625/go.mod h1:HYsPBTaaSFSlLx/70C2HPIMNZpVV8+vt/A+FMnYP11g= github.com/btcsuite/btcd v0.0.0-20190824003749-130ea5bddde3/go.mod h1:3J08xEfcugPacsc34/LKRU2yO7YmuT8yt28J8k2+rrI= +github.com/btcsuite/btcd v0.20.1-beta h1:Ik4hyJqN8Jfyv3S4AGBOmyouMsYE3EdYODkMbQjwPGw= github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13Px/pDuV7OomQ= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= github.com/btcsuite/btcutil v0.0.0-20190425235716-9e5f4b9a998d/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg= @@ -275,15 +276,19 @@ github.com/libp2p/go-flow-metrics v0.2.0 h1:EIZzjmeOE6c8Dav0sNv35vhZxATIXWZg6j/C github.com/libp2p/go-flow-metrics v0.2.0/go.mod h1:st3qqfu8+pMfh+9Mzqb2GTiwrAGjIPszEjZmtksN8Jc= github.com/libp2p/go-libp2p v0.37.0 h1:8K3mcZgwTldydMCNOiNi/ZJrOB9BY+GlI3UxYzxBi9A= github.com/libp2p/go-libp2p v0.37.0/go.mod h1:GOKmSN99scDuYGTwaTbQPR8Nt6dxrK3ue7OjW2NGDg4= +github.com/libp2p/go-libp2p v0.37.1 h1:9p6fLUGmegmI1VuD9y7jgKvisMYNl44HQSiEmPUNi4c= +github.com/libp2p/go-libp2p v0.37.1/go.mod h1:K7H2RGSoEYdi6v85xlSzqW2oqGz7t98nq+b2eRdfvW8= github.com/libp2p/go-libp2p-asn-util v0.4.1 h1:xqL7++IKD9TBFMgnLPZR6/6iYhawHKHl950SO9L6n94= github.com/libp2p/go-libp2p-asn-util v0.4.1/go.mod h1:d/NI6XZ9qxw67b4e+NgpQexCIiFYJjErASrYW4PFDN8= github.com/libp2p/go-libp2p-core v0.2.4/go.mod h1:STh4fdfa5vDYr0/SzYYeqnt+E6KfEV5VxfIrm0bcI0g= +github.com/libp2p/go-libp2p-core v0.3.0 h1:F7PqduvrztDtFsAa/bcheQ3azmNo+Nq7m8hQY5GiUW8= 
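The cachedRouter added later in this patch backfills provider and peer records that arrive without multiaddrs. Its core mechanic is boxo's iter.Map over a result iterator, passing errors and nil values through untouched; a reduced sketch of that pattern for the FindPeers case, using the types this patch imports (the helper name backfillAddrs is illustrative):

	func backfillAddrs(it iter.ResultIter[*types.PeerRecord], cab *cachedAddrBook) iter.ResultIter[*types.PeerRecord] {
		return iter.Map(it, func(v iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] {
			if v.Err != nil || v.Val == nil {
				return v // propagate failures and empty results unchanged
			}
			if len(v.Val.Addrs) == 0 {
				// Only consult the cache when the router returned no addresses.
				v.Val.Addrs = cab.getCachedAddrs(v.Val.ID)
			}
			return v
		})
	}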
github.com/libp2p/go-libp2p-core v0.3.0/go.mod h1:ACp3DmS3/N64c2jDzcV429ukDpicbL6+TrrxANBjPGw= github.com/libp2p/go-libp2p-kad-dht v0.28.1 h1:DVTfzG8Ybn88g9RycIq47evWCRss5f0Wm8iWtpwyHso= github.com/libp2p/go-libp2p-kad-dht v0.28.1/go.mod h1:0wHURlSFdAC42+wF7GEmpLoARw8JuS8do2guCtc/Y/w= github.com/libp2p/go-libp2p-kbucket v0.3.1/go.mod h1:oyjT5O7tS9CQurok++ERgc46YLwEpuGoFq9ubvoUOio= github.com/libp2p/go-libp2p-kbucket v0.6.4 h1:OjfiYxU42TKQSB8t8WYd8MKhYhMJeO2If+NiuKfb6iQ= github.com/libp2p/go-libp2p-kbucket v0.6.4/go.mod h1:jp6w82sczYaBsAypt5ayACcRJi0lgsba7o4TzJKEfWA= +github.com/libp2p/go-libp2p-peerstore v0.1.4 h1:d23fvq5oYMJ/lkkbO4oTwBp/JP+I/1m5gZJobNXCE/k= github.com/libp2p/go-libp2p-peerstore v0.1.4/go.mod h1:+4BDbDiiKf4PzpANZDAT+knVdLxvqh7hXOujessqdzs= github.com/libp2p/go-libp2p-record v0.2.0 h1:oiNUOCWno2BFuxt3my4i1frNrt7PerzB3queqa1NkQ0= github.com/libp2p/go-libp2p-record v0.2.0/go.mod h1:I+3zMkvvg5m2OcSdoL0KPljyJyvNDFGKX7QdlpYUcwk= @@ -302,6 +307,8 @@ github.com/libp2p/go-netroute v0.2.1 h1:V8kVrpD8GK0Riv15/7VN6RbUQ3URNZVosw7H2v9t github.com/libp2p/go-netroute v0.2.1/go.mod h1:hraioZr0fhBjG0ZRXJJ6Zj2IVEVNx6tDTFQfSmcq7mQ= github.com/libp2p/go-openssl v0.0.3/go.mod h1:unDrJpgy3oFr+rqXsarWifmJuNnJR4chtO1HmaZjggc= github.com/libp2p/go-openssl v0.0.4/go.mod h1:unDrJpgy3oFr+rqXsarWifmJuNnJR4chtO1HmaZjggc= +github.com/libp2p/go-openssl v0.1.0 h1:LBkKEcUv6vtZIQLVTegAil8jbNpJErQ9AnT+bWV+Ooo= +github.com/libp2p/go-openssl v0.1.0/go.mod h1:OiOxwPpL3n4xlenjx2h7AwSGaFSC/KZvf6gNdOBQMtc= github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQscQm2s= github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= @@ -319,6 +326,8 @@ github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hd github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-pointer v0.0.1 h1:n+XhsuGeVO6MEAp7xyEukFINEa+Quek5psIR/ylA6o0= +github.com/mattn/go-pointer v0.0.1/go.mod h1:2zXcozF6qYGgmsG+SeTZz3oAbFLdD3OWqnUbNvJZAlc= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= github.com/microcosm-cc/bluemonday v1.0.1/go.mod h1:hsXNsILzKxV+sX77C5b8FSuKF00vh2OMYv+xgHpAMF4= @@ -373,6 +382,8 @@ github.com/multiformats/go-multihash v0.2.3 h1:7Lyc8XfX/IY2jWb/gI7JP+o7JEq9hOa7B github.com/multiformats/go-multihash v0.2.3/go.mod h1:dXgKXCXjBzdscBLk9JkjINiEsCKRVch90MdaGiKsvSM= github.com/multiformats/go-multistream v0.5.0 h1:5htLSLl7lvJk3xx3qT/8Zm9J4K8vEOf/QGkvOGQAyiE= github.com/multiformats/go-multistream v0.5.0/go.mod h1:n6tMZiwiP2wUsR8DgfDWw1dydlEqV3l6N3/GBsX6ILA= +github.com/multiformats/go-multistream v0.6.0 h1:ZaHKbsL404720283o4c/IHQXiS6gb8qAN5EIJ4PN5EA= +github.com/multiformats/go-multistream v0.6.0/go.mod h1:MOyoG5otO24cHIg8kf9QW2/NozURlkP/rvi2FQJyCPg= github.com/multiformats/go-varint v0.0.1/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= github.com/multiformats/go-varint v0.0.5/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= github.com/multiformats/go-varint v0.0.7 h1:sWSGR+f/eu5ABZA2ZpYKBILXTTs9JWpdEM/nEGOHFS8= @@ -519,6 +530,7 @@ 
github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3 github.com/smola/gocompat v0.2.0/go.mod h1:1B0MlxbmoZNo3h8guHp8HztB3BSYR5itql9qtVc0ypY= github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE= github.com/sourcegraph/syntaxhighlight v0.0.0-20170531221838-bd320f5d308e/go.mod h1:HuIsMU8RRBOtsCgI77wP899iHVBQpCmg4ErYMZB+2IA= +github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 h1:RC6RW7j+1+HkWaX/Yh71Ee5ZHaHYt7ZP4sQgUrm6cDU= github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572/go.mod h1:w0SWMsp6j9O/dk4/ZpIhL+3CkG8ofA2vuv7k+ltqUMc= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= diff --git a/main.go b/main.go index 7c33147..3f1a3da 100644 --- a/main.go +++ b/main.go @@ -42,7 +42,7 @@ func main() { Name: "cached-addr-book", Value: true, EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK"}, - Usage: "use separate cached address book instead of the one provided by the libp2p host", + Usage: "use a cached address book to improve peer routing performance", }, &cli.StringSliceFlag{ Name: "provider-endpoints", @@ -123,6 +123,7 @@ func main() { cfg := &config{ listenAddress: ctx.String("listen-address"), acceleratedDHTClient: ctx.Bool("accelerated-dht"), + cachedAddrBook: ctx.Bool("cached-addr-book"), contentEndpoints: ctx.StringSlice("provider-endpoints"), peerEndpoints: ctx.StringSlice("peer-endpoints"), diff --git a/server.go b/server.go index 064ab32..b042338 100644 --- a/server.go +++ b/server.go @@ -22,9 +22,7 @@ import ( "github.com/libp2p/go-libp2p" dht "github.com/libp2p/go-libp2p-kad-dht" "github.com/libp2p/go-libp2p/core/host" - "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/core/routing" - "github.com/libp2p/go-libp2p/p2p/host/peerstore/pstoremem" "github.com/libp2p/go-libp2p/p2p/net/connmgr" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -83,12 +81,12 @@ func start(ctx context.Context, cfg *config) error { dhtRouting = standardDHT } - var cachedAddrBook peerstore.AddrBook + var cachedAddrBook *cachedAddrBook if cfg.cachedAddrBook { fmt.Println("Using cached address book to speed up peer discovery") - cachedAddrBook = pstoremem.NewAddrBook() - go manageAddrBook(ctx, cachedAddrBook, h) + cachedAddrBook = newCachedAddrBook() + go cachedAddrBook.background(ctx, h) } crRouters, err := getCombinedRouting(cfg.contentEndpoints, dhtRouting) @@ -124,10 +122,6 @@ func start(ctx context.Context, cfg *config) error { server.WithPrometheusRegistry(prometheus.DefaultRegisterer), } - if cachedAddrBook != nil { - handlerOpts = append(handlerOpts, server.WithCachedAddrBook(cachedAddrBook)) - } - handler := server.Handler(&composableRouter{ providers: crRouters, peers: prRouters, diff --git a/server_addr_book.go b/server_addr_book.go index b982455..334ece9 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -4,19 +4,53 @@ import ( "context" "io" "math" + "sync/atomic" "time" + "github.com/ipfs/boxo/routing/http/types" + ma "github.com/multiformats/go-multiaddr" + "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" + "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/peerstore" + "github.com/libp2p/go-libp2p/p2p/host/peerstore/pstoremem" ) -// By default, we keep recently 
connected peers for 48 hours, which leaves enough to probe -const RecentlyConnectedAddrTTL = time.Hour * 48 +// The TTL to keep recently connected peers for. This should be enough time to probe +const RecentlyConnectedAddrTTL = time.Hour * 24 + +// Connected peers don't expire until they disconnect const ConnectedAddrTTL = math.MaxInt64 -func manageAddrBook(ctx context.Context, addrBook peerstore.AddrBook, host host.Host) { +// How long to wait since last connection before probing a peer again +const PeerProbeThreshold = time.Hour + +// How often to run the probe peers function +const ProbeInterval = time.Minute * 15 + +type peerState struct { + lastConnTime time.Time // time we were connected to this peer + lastConnAddr ma.Multiaddr // last address we connected to this peer on + returnCount atomic.Int32 // number of times we've returned this peer + connectFailures atomic.Int32 // number of times we've failed to connect to this peer +} + +type cachedAddrBook struct { + peers map[peer.ID]*peerState // PeerID -> peer state + addrBook peerstore.AddrBook // PeerID -> []Multiaddr with TTL expirations + isProbing bool // Whether we are currently probing peers +} + +func newCachedAddrBook() *cachedAddrBook { + return &cachedAddrBook{ + peers: make(map[peer.ID]*peerState), + addrBook: pstoremem.NewAddrBook(), + } +} + +func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { sub, err := host.EventBus().Subscribe([]interface{}{ &event.EvtPeerIdentificationCompleted{}, &event.EvtPeerConnectednessChanged{}, @@ -27,10 +61,13 @@ func manageAddrBook(ctx context.Context, addrBook peerstore.AddrBook, host host. } defer sub.Close() + probeTicker := time.NewTicker(ProbeInterval) + defer probeTicker.Stop() + for { select { case <-ctx.Done(): - cabCloser, ok := addrBook.(io.Closer) + cabCloser, ok := cab.addrBook.(io.Closer) if ok { errClose := cabCloser.Close() if errClose != nil { @@ -41,11 +78,19 @@ func manageAddrBook(ctx context.Context, addrBook peerstore.AddrBook, host host. case ev := <-sub.Out(): switch ev := ev.(type) { case event.EvtPeerIdentificationCompleted: + // Update the peer state with the last connected address and time + cab.peers[ev.Peer] = &peerState{ + lastConnTime: time.Now(), + lastConnAddr: ev.Conn.RemoteMultiaddr(), + returnCount: atomic.Int32{}, + connectFailures: atomic.Int32{}, + } if ev.SignedPeerRecord != nil { - cab, ok := peerstore.GetCertifiedAddrBook(addrBook) + logger.Debug("Caching signed peer record") + cab, ok := peerstore.GetCertifiedAddrBook(cab.addrBook) if ok { ttl := RecentlyConnectedAddrTTL - if host.Network().Connectedness(ev.Peer) == network.Connected { + if host.Network().Connectedness(ev.Peer) == network.Connected || host.Network().Connectedness(ev.Peer) == network.Limited { ttl = ConnectedAddrTTL } _, err := cab.ConsumePeerRecord(ev.SignedPeerRecord, ttl) @@ -53,12 +98,77 @@ func manageAddrBook(ctx context.Context, addrBook peerstore.AddrBook, host host. 
logger.Warnf("failed to consume signed peer record: %v", err) } } + } else { + logger.Debug("No signed peer record, caching listen addresses") + // We don't have a signed peer record, so we use the listen addresses + host.Peerstore().AddAddrs(ev.Peer, ev.ListenAddrs, ConnectedAddrTTL) } case event.EvtPeerConnectednessChanged: - if ev.Connectedness != network.Connected { - addrBook.UpdateAddrs(ev.Peer, ConnectedAddrTTL, RecentlyConnectedAddrTTL) + // If the peer is not connected or limited, we update the TTL + if ev.Connectedness != network.Connected && ev.Connectedness != network.Limited { + cab.addrBook.UpdateAddrs(ev.Peer, ConnectedAddrTTL, RecentlyConnectedAddrTTL) } } + case <-probeTicker.C: + if cab.isProbing { + logger.Debug("Skipping peer probe, still running") + continue + } + logger.Debug("Running peer probe") + cab.probePeers(ctx, host) } } } + +// Loops over all peers with addresses and probes them if they haven't been probed recently +func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { + cab.isProbing = true + defer func() { cab.isProbing = false }() + + for _, p := range cab.addrBook.PeersWithAddrs() { + if host.Network().Connectedness(p) == network.Connected || host.Network().Connectedness(p) == network.Limited { + // No need to probe connected peers + continue + } + + lastConnTime := cab.peers[p].lastConnTime + + if time.Since(lastConnTime) < PeerProbeThreshold { + // Don't probe recently connected peers + continue + } + + addrs := cab.addrBook.Addrs(p) + + if len(addrs) == 0 { + // No addresses to probe + continue + } + + // If connect succeeds and identify runs, the background loop will update the peer state and cache + // TODO: introduce some concurrency + ctx, cancel := context.WithTimeout(ctx, time.Second*10) + defer cancel() + err := host.Connect(ctx, peer.AddrInfo{ + ID: p, + // TODO: Should we should probe the last connected address or all addresses? 
+ Addrs: addrs, + }) + if err != nil { + logger.Warnf("failed to connect to peer %s: %v", p, err) + cab.peers[p].connectFailures.Add(1) + } + } +} + +// Returns the cached addresses for a peer, incrementing the return count +func (cab *cachedAddrBook) getCachedAddrs(p *peer.ID) []types.Multiaddr { + addrs := cab.addrBook.Addrs(*p) + cab.peers[*p].returnCount.Add(1) // increment the return count + + var cachedAddrs []types.Multiaddr + for _, addr := range addrs { + cachedAddrs = append(cachedAddrs, types.Multiaddr{Multiaddr: addr}) + } + return cachedAddrs +} diff --git a/server_cached_router.go b/server_cached_router.go new file mode 100644 index 0000000..e169577 --- /dev/null +++ b/server_cached_router.go @@ -0,0 +1,88 @@ +package main + +import ( + "context" + "reflect" + "time" + + "github.com/ipfs/boxo/routing/http/server" + "github.com/ipfs/boxo/routing/http/types" + "github.com/ipfs/boxo/routing/http/types/iter" + "github.com/ipfs/go-cid" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/core/routing" +) + +var _ server.ContentRouter = cachedRouter{} + +// cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers + +type cachedRouter struct { + router + cachedAddrBook *cachedAddrBook +} + +func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) (iter.ResultIter[types.Record], error) { + it, err := r.router.FindProviders(ctx, key, limit) + if err != nil { + return nil, err + } + + return iter.Map(it, func(v iter.Result[types.Record]) iter.Result[types.Record] { + if v.Err != nil || v.Val == nil { + return v + } + + switch v.Val.GetSchema() { + case types.SchemaPeer: + result, ok := v.Val.(*types.PeerRecord) + if !ok { + logger.Errorw("problem casting find providers result", "Schema", v.Val.GetSchema(), "Type", reflect.TypeOf(v).String()) + return v + } + if len(result.Addrs) == 0 { + result.Addrs = r.cachedAddrBook.getCachedAddrs(result.ID) + } + + v.Val = result + + //lint:ignore SA1019 // ignore staticcheck + case types.SchemaBitswap: + //lint:ignore SA1019 // ignore staticcheck + result, ok := v.Val.(*types.BitswapRecord) + if !ok { + logger.Errorw("problem casting find providers result", "Schema", v.Val.GetSchema(), "Type", reflect.TypeOf(v).String()) + return v + } + + // TODO: use cachedAddrBook to filter private addresses + v.Val = result + } + + return v + }), nil +} + +func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { + it, err := r.router.FindPeers(ctx, pid, limit) + if err != nil { + return nil, err + } + + return iter.Map(it, func(v iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { + if v.Err != nil || v.Val == nil { + return v + } + + // If no addresses were found by router, use cached addresses + if len(v.Val.Addrs) == 0 { + v.Val.Addrs = r.cachedAddrBook.getCachedAddrs(v.Val.ID) + } + return v + }), nil +} + +//lint:ignore SA1019 // ignore staticcheck +func (r cachedRouter) ProvideBitswap(ctx context.Context, req *server.BitswapWriteProvideRequest) (time.Duration, error) { + return 0, routing.ErrNotSupported +} From 0e86ea416ffc47917dd22c39eaee56dc60813490 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:26:06 +0100 Subject: [PATCH 03/80] feat: use the cached router --- server.go | 16 ++++++++-------- server_cached_router.go | 17 ++++++++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/server.go b/server.go index b042338..81f0c38 
100644 --- a/server.go +++ b/server.go @@ -89,17 +89,17 @@ func start(ctx context.Context, cfg *config) error { go cachedAddrBook.background(ctx, h) } - crRouters, err := getCombinedRouting(cfg.contentEndpoints, dhtRouting) + crRouters, err := getCombinedRouting(cfg.contentEndpoints, dhtRouting, cachedAddrBook) if err != nil { return err } - prRouters, err := getCombinedRouting(cfg.peerEndpoints, dhtRouting) + prRouters, err := getCombinedRouting(cfg.peerEndpoints, dhtRouting, cachedAddrBook) if err != nil { return err } - ipnsRouters, err := getCombinedRouting(cfg.ipnsEndpoints, dhtRouting) + ipnsRouters, err := getCombinedRouting(cfg.ipnsEndpoints, dhtRouting, cachedAddrBook) if err != nil { return err } @@ -229,9 +229,9 @@ func newHost(cfg *config) (host.Host, error) { return h, nil } -func getCombinedRouting(endpoints []string, dht routing.Routing) (router, error) { +func getCombinedRouting(endpoints []string, dht routing.Routing, cachedAddrBook *cachedAddrBook) (router, error) { if len(endpoints) == 0 { - return sanitizeRouter{libp2pRouter{routing: dht}}, nil + return cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook}, nil } var routers []router @@ -249,9 +249,9 @@ func getCombinedRouting(endpoints []string, dht routing.Routing) (router, error) routers = append(routers, clientRouter{Client: drclient}) } - return sanitizeRouter{parallelRouter{ - routers: append(routers, libp2pRouter{routing: dht}), - }}, nil + return parallelRouter{ + routers: append(routers, cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook}), + }, nil } func withTracingAndDebug(next http.Handler, authToken string) http.Handler { diff --git a/server_cached_router.go b/server_cached_router.go index e169577..8720a36 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -15,8 +15,7 @@ import ( var _ server.ContentRouter = cachedRouter{} -// cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers - +// cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers without multiaddrs type cachedRouter struct { router cachedAddrBook *cachedAddrBook @@ -41,7 +40,9 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return v } if len(result.Addrs) == 0 { - result.Addrs = r.cachedAddrBook.getCachedAddrs(result.ID) + cachedAddrs := r.cachedAddrBook.getCachedAddrs(result.ID) + logger.Debugw("no addresses found for peer, using cached addresses", "peer", result.ID, "cachedAddrs", cachedAddrs) + result.Addrs = cachedAddrs } v.Val = result @@ -55,7 +56,11 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return v } - // TODO: use cachedAddrBook to filter private addresses + if len(result.Addrs) == 0 { + cachedAddrs := r.cachedAddrBook.getCachedAddrs(result.ID) + logger.Debugw("no addresses found for peer, using cached addresses", "peer", result.ID, "cachedAddrs", cachedAddrs) + result.Addrs = cachedAddrs + } v.Val = result } @@ -76,7 +81,9 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it // If no addresses were found by router, use cached addresses if len(v.Val.Addrs) == 0 { - v.Val.Addrs = r.cachedAddrBook.getCachedAddrs(v.Val.ID) + cachedAddrs := r.cachedAddrBook.getCachedAddrs(v.Val.ID) + logger.Debugw("no addresses found for peer, using cached addresses", "peer", v.Val.ID, "cachedAddrs", cachedAddrs) + v.Val.Addrs = cachedAddrs } return v }), nil From ec2a67ae2e895c71de59f371995aa8c5a27de755 Mon Sep 17 
00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:27:48 +0100 Subject: [PATCH 04/80] chore: go mod tidy --- go.mod | 6 ------ go.sum | 12 ------------ 2 files changed, 18 deletions(-) diff --git a/go.mod b/go.mod index 59fd59d..5510198 100644 --- a/go.mod +++ b/go.mod @@ -34,7 +34,6 @@ require ( github.com/andybalholm/brotli v1.1.0 // indirect github.com/benbjohnson/clock v1.3.5 // indirect github.com/beorn7/perks v1.0.1 // indirect - github.com/btcsuite/btcd v0.20.1-beta // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/cgroups v1.1.0 // indirect @@ -74,20 +73,16 @@ require ( github.com/libp2p/go-cidranger v1.1.0 // indirect github.com/libp2p/go-flow-metrics v0.2.0 // indirect github.com/libp2p/go-libp2p-asn-util v0.4.1 // indirect - github.com/libp2p/go-libp2p-core v0.3.0 // indirect github.com/libp2p/go-libp2p-kbucket v0.6.4 // indirect - github.com/libp2p/go-libp2p-peerstore v0.1.4 // indirect github.com/libp2p/go-libp2p-routing-helpers v0.7.4 // indirect github.com/libp2p/go-libp2p-xor v0.1.0 // indirect github.com/libp2p/go-msgio v0.3.0 // indirect github.com/libp2p/go-nat v0.2.0 // indirect github.com/libp2p/go-netroute v0.2.1 // indirect - github.com/libp2p/go-openssl v0.1.0 // indirect github.com/libp2p/go-reuseport v0.4.0 // indirect github.com/libp2p/go-yamux/v4 v4.0.1 // indirect github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd // indirect github.com/mattn/go-isatty v0.0.20 // indirect - github.com/mattn/go-pointer v0.0.1 // indirect github.com/miekg/dns v1.1.62 // indirect github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b // indirect github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc // indirect @@ -134,7 +129,6 @@ require ( github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/samber/lo v1.47.0 // indirect github.com/slok/go-http-metrics v0.12.0 // indirect - github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 // indirect diff --git a/go.sum b/go.sum index 67fe7a3..261b798 100644 --- a/go.sum +++ b/go.sum @@ -28,7 +28,6 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bradfitz/go-smtpd v0.0.0-20170404230938-deb6d6237625/go.mod h1:HYsPBTaaSFSlLx/70C2HPIMNZpVV8+vt/A+FMnYP11g= github.com/btcsuite/btcd v0.0.0-20190824003749-130ea5bddde3/go.mod h1:3J08xEfcugPacsc34/LKRU2yO7YmuT8yt28J8k2+rrI= -github.com/btcsuite/btcd v0.20.1-beta h1:Ik4hyJqN8Jfyv3S4AGBOmyouMsYE3EdYODkMbQjwPGw= github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13Px/pDuV7OomQ= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= github.com/btcsuite/btcutil v0.0.0-20190425235716-9e5f4b9a998d/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg= @@ -274,21 +273,17 @@ github.com/libp2p/go-flow-metrics v0.0.1/go.mod h1:Iv1GH0sG8DtYN3SVJ2eG221wMiNpZ github.com/libp2p/go-flow-metrics v0.0.3/go.mod h1:HeoSNUrOJVK1jEpDqVEiUOIXqhbnS27omG0uWU5slZs= github.com/libp2p/go-flow-metrics v0.2.0 h1:EIZzjmeOE6c8Dav0sNv35vhZxATIXWZg6j/C08XmmDw= github.com/libp2p/go-flow-metrics v0.2.0/go.mod h1:st3qqfu8+pMfh+9Mzqb2GTiwrAGjIPszEjZmtksN8Jc= 
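Patch 02's probe loop (above) reduces to a bounded dial per stale peer: connect with a timeout, and on success let libp2p's identify event, observed by the background loop, refresh lastConnTime and the cached addresses. A self-contained sketch of a single probe, assuming only the host, peer, multiaddr, and time imports already present in that patch (probeOne is an illustrative name):

	func probeOne(ctx context.Context, h host.Host, p peer.ID, addrs []ma.Multiaddr) error {
		// Bound each dial so one dead peer cannot stall the whole sweep.
		ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
		defer cancel()
		// A successful connect triggers identify; the background loop then
		// updates the peer state, so the probe itself records nothing.
		return h.Connect(ctx, peer.AddrInfo{ID: p, Addrs: addrs})
	}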
-github.com/libp2p/go-libp2p v0.37.0 h1:8K3mcZgwTldydMCNOiNi/ZJrOB9BY+GlI3UxYzxBi9A= -github.com/libp2p/go-libp2p v0.37.0/go.mod h1:GOKmSN99scDuYGTwaTbQPR8Nt6dxrK3ue7OjW2NGDg4= github.com/libp2p/go-libp2p v0.37.1 h1:9p6fLUGmegmI1VuD9y7jgKvisMYNl44HQSiEmPUNi4c= github.com/libp2p/go-libp2p v0.37.1/go.mod h1:K7H2RGSoEYdi6v85xlSzqW2oqGz7t98nq+b2eRdfvW8= github.com/libp2p/go-libp2p-asn-util v0.4.1 h1:xqL7++IKD9TBFMgnLPZR6/6iYhawHKHl950SO9L6n94= github.com/libp2p/go-libp2p-asn-util v0.4.1/go.mod h1:d/NI6XZ9qxw67b4e+NgpQexCIiFYJjErASrYW4PFDN8= github.com/libp2p/go-libp2p-core v0.2.4/go.mod h1:STh4fdfa5vDYr0/SzYYeqnt+E6KfEV5VxfIrm0bcI0g= -github.com/libp2p/go-libp2p-core v0.3.0 h1:F7PqduvrztDtFsAa/bcheQ3azmNo+Nq7m8hQY5GiUW8= github.com/libp2p/go-libp2p-core v0.3.0/go.mod h1:ACp3DmS3/N64c2jDzcV429ukDpicbL6+TrrxANBjPGw= github.com/libp2p/go-libp2p-kad-dht v0.28.1 h1:DVTfzG8Ybn88g9RycIq47evWCRss5f0Wm8iWtpwyHso= github.com/libp2p/go-libp2p-kad-dht v0.28.1/go.mod h1:0wHURlSFdAC42+wF7GEmpLoARw8JuS8do2guCtc/Y/w= github.com/libp2p/go-libp2p-kbucket v0.3.1/go.mod h1:oyjT5O7tS9CQurok++ERgc46YLwEpuGoFq9ubvoUOio= github.com/libp2p/go-libp2p-kbucket v0.6.4 h1:OjfiYxU42TKQSB8t8WYd8MKhYhMJeO2If+NiuKfb6iQ= github.com/libp2p/go-libp2p-kbucket v0.6.4/go.mod h1:jp6w82sczYaBsAypt5ayACcRJi0lgsba7o4TzJKEfWA= -github.com/libp2p/go-libp2p-peerstore v0.1.4 h1:d23fvq5oYMJ/lkkbO4oTwBp/JP+I/1m5gZJobNXCE/k= github.com/libp2p/go-libp2p-peerstore v0.1.4/go.mod h1:+4BDbDiiKf4PzpANZDAT+knVdLxvqh7hXOujessqdzs= github.com/libp2p/go-libp2p-record v0.2.0 h1:oiNUOCWno2BFuxt3my4i1frNrt7PerzB3queqa1NkQ0= github.com/libp2p/go-libp2p-record v0.2.0/go.mod h1:I+3zMkvvg5m2OcSdoL0KPljyJyvNDFGKX7QdlpYUcwk= @@ -307,8 +302,6 @@ github.com/libp2p/go-netroute v0.2.1 h1:V8kVrpD8GK0Riv15/7VN6RbUQ3URNZVosw7H2v9t github.com/libp2p/go-netroute v0.2.1/go.mod h1:hraioZr0fhBjG0ZRXJJ6Zj2IVEVNx6tDTFQfSmcq7mQ= github.com/libp2p/go-openssl v0.0.3/go.mod h1:unDrJpgy3oFr+rqXsarWifmJuNnJR4chtO1HmaZjggc= github.com/libp2p/go-openssl v0.0.4/go.mod h1:unDrJpgy3oFr+rqXsarWifmJuNnJR4chtO1HmaZjggc= -github.com/libp2p/go-openssl v0.1.0 h1:LBkKEcUv6vtZIQLVTegAil8jbNpJErQ9AnT+bWV+Ooo= -github.com/libp2p/go-openssl v0.1.0/go.mod h1:OiOxwPpL3n4xlenjx2h7AwSGaFSC/KZvf6gNdOBQMtc= github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQscQm2s= github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= @@ -326,8 +319,6 @@ github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hd github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-pointer v0.0.1 h1:n+XhsuGeVO6MEAp7xyEukFINEa+Quek5psIR/ylA6o0= -github.com/mattn/go-pointer v0.0.1/go.mod h1:2zXcozF6qYGgmsG+SeTZz3oAbFLdD3OWqnUbNvJZAlc= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= github.com/microcosm-cc/bluemonday v1.0.1/go.mod h1:hsXNsILzKxV+sX77C5b8FSuKF00vh2OMYv+xgHpAMF4= @@ -380,8 +371,6 @@ github.com/multiformats/go-multihash v0.0.10/go.mod h1:YSLudS+Pi8NHE7o6tb3D8vrpK github.com/multiformats/go-multihash v0.0.13/go.mod 
h1:VdAWLKTwram9oKAatUcLxBNUjdtcVwxObEQBtRfuyjc= github.com/multiformats/go-multihash v0.2.3 h1:7Lyc8XfX/IY2jWb/gI7JP+o7JEq9hOa7BFvVU9RSh+U= github.com/multiformats/go-multihash v0.2.3/go.mod h1:dXgKXCXjBzdscBLk9JkjINiEsCKRVch90MdaGiKsvSM= -github.com/multiformats/go-multistream v0.5.0 h1:5htLSLl7lvJk3xx3qT/8Zm9J4K8vEOf/QGkvOGQAyiE= -github.com/multiformats/go-multistream v0.5.0/go.mod h1:n6tMZiwiP2wUsR8DgfDWw1dydlEqV3l6N3/GBsX6ILA= github.com/multiformats/go-multistream v0.6.0 h1:ZaHKbsL404720283o4c/IHQXiS6gb8qAN5EIJ4PN5EA= github.com/multiformats/go-multistream v0.6.0/go.mod h1:MOyoG5otO24cHIg8kf9QW2/NozURlkP/rvi2FQJyCPg= github.com/multiformats/go-varint v0.0.1/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= @@ -530,7 +519,6 @@ github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3 github.com/smola/gocompat v0.2.0/go.mod h1:1B0MlxbmoZNo3h8guHp8HztB3BSYR5itql9qtVc0ypY= github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE= github.com/sourcegraph/syntaxhighlight v0.0.0-20170531221838-bd320f5d308e/go.mod h1:HuIsMU8RRBOtsCgI77wP899iHVBQpCmg4ErYMZB+2IA= -github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 h1:RC6RW7j+1+HkWaX/Yh71Ee5ZHaHYt7ZP4sQgUrm6cDU= github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572/go.mod h1:w0SWMsp6j9O/dk4/ZpIhL+3CkG8ofA2vuv7k+ltqUMc= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= From fe681405a006b027e40c6dc5707c071363c7b7f4 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:36:52 +0100 Subject: [PATCH 05/80] feat: log probe duration --- server_addr_book.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server_addr_book.go b/server_addr_book.go index 334ece9..4dde25a 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -115,7 +115,10 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { continue } logger.Debug("Running peer probe") + start := time.Now() cab.probePeers(ctx, host) + elapsed := time.Since(start) + logger.Debugf("Finished peer probe in %s", elapsed) } } } From 06c2d0ccdcc2d4353a1946957b04dbea36a6286f Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:49:23 +0100 Subject: [PATCH 06/80] chore: log in probe loop --- server_addr_book.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server_addr_book.go b/server_addr_book.go index 4dde25a..2ac0c81 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -128,7 +128,8 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { cab.isProbing = true defer func() { cab.isProbing = false }() - for _, p := range cab.addrBook.PeersWithAddrs() { + for i, p := range cab.addrBook.PeersWithAddrs() { + logger.Debugf("Probe %d: PeerID: %s", i+1, p) if host.Network().Connectedness(p) == network.Connected || host.Network().Connectedness(p) == network.Limited { // No need to probe connected peers continue From fc7678320f20887552e970308398c2c053baaa84 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:52:12 +0100 Subject: [PATCH 07/80] fix: update peer state if doesn't exist --- server_addr_book.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/server_addr_book.go b/server_addr_book.go index 
2ac0c81..6808e17 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -79,12 +79,18 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { switch ev := ev.(type) { case event.EvtPeerIdentificationCompleted: // Update the peer state with the last connected address and time - cab.peers[ev.Peer] = &peerState{ - lastConnTime: time.Now(), - lastConnAddr: ev.Conn.RemoteMultiaddr(), - returnCount: atomic.Int32{}, - connectFailures: atomic.Int32{}, + if _, exists := cab.peers[ev.Peer]; !exists { + cab.peers[ev.Peer] = &peerState{ + lastConnTime: time.Now(), + lastConnAddr: ev.Conn.RemoteMultiaddr(), + returnCount: atomic.Int32{}, + connectFailures: atomic.Int32{}, + } + } else { + cab.peers[ev.Peer].lastConnTime = time.Now() + cab.peers[ev.Peer].lastConnAddr = ev.Conn.RemoteMultiaddr() } + if ev.SignedPeerRecord != nil { logger.Debug("Caching signed peer record") cab, ok := peerstore.GetCertifiedAddrBook(cab.addrBook) From e904c3e2cbb447c0547230fb4dec41b740cdeb30 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 20:10:27 +0100 Subject: [PATCH 08/80] fix: add addresses to cached address book --- server_addr_book.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server_addr_book.go b/server_addr_book.go index 6808e17..143a6fd 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -107,7 +107,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { } else { logger.Debug("No signed peer record, caching listen addresses") // We don't have a signed peer record, so we use the listen addresses - host.Peerstore().AddAddrs(ev.Peer, ev.ListenAddrs, ConnectedAddrTTL) + cab.addrBook.AddAddrs(ev.Peer, ev.ListenAddrs, ConnectedAddrTTL) } case event.EvtPeerConnectednessChanged: // If the peer is not connected or limited, we update the TTL From 814ae5847d0cfa542c72fb7f828804b850621197 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 20:19:09 +0100 Subject: [PATCH 09/80] fix: wrap with cached router only if available --- server.go | 16 ++++++++++---- server_addr_book.go | 52 ++++++++++++++++++++++++--------------------- server_test.go | 10 ++++----- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/server.go b/server.go index 81f0c38..b84b6c3 100644 --- a/server.go +++ b/server.go @@ -230,11 +230,19 @@ func newHost(cfg *config) (host.Host, error) { } func getCombinedRouting(endpoints []string, dht routing.Routing, cachedAddrBook *cachedAddrBook) (router, error) { + var dhtRouter router + + if cachedAddrBook != nil { + dhtRouter = cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook} + } else { + dhtRouter = sanitizeRouter{libp2pRouter{routing: dht}} + } + if len(endpoints) == 0 { - return cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook}, nil + return dhtRouter, nil } - var routers []router + var delegatedRouters []router for _, endpoint := range endpoints { drclient, err := drclient.New(endpoint, @@ -246,11 +254,11 @@ func getCombinedRouting(endpoints []string, dht routing.Routing, cachedAddrBook if err != nil { return nil, err } - routers = append(routers, clientRouter{Client: drclient}) + delegatedRouters = append(delegatedRouters, clientRouter{Client: drclient}) } return parallelRouter{ - routers: append(routers, cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook}), + routers: append(delegatedRouters, dhtRouter), }, nil } diff --git a/server_addr_book.go 
b/server_addr_book.go index 143a6fd..f86b78e 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -4,18 +4,19 @@ import ( "context" "io" "math" + "sync" "sync/atomic" "time" "github.com/ipfs/boxo/routing/http/types" - ma "github.com/multiformats/go-multiaddr" - "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/p2p/host/peerstore/pstoremem" + ma "github.com/multiformats/go-multiaddr" + manet "github.com/multiformats/go-multiaddr/net" ) // The TTL to keep recently connected peers for. This should be enough time to probe @@ -134,41 +135,44 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { cab.isProbing = true defer func() { cab.isProbing = false }() + wg := sync.WaitGroup{} + for i, p := range cab.addrBook.PeersWithAddrs() { logger.Debugf("Probe %d: PeerID: %s", i+1, p) if host.Network().Connectedness(p) == network.Connected || host.Network().Connectedness(p) == network.Limited { - // No need to probe connected peers - continue + continue // don't probe connected peers } - lastConnTime := cab.peers[p].lastConnTime - - if time.Since(lastConnTime) < PeerProbeThreshold { - // Don't probe recently connected peers - continue + if time.Since(cab.peers[p].lastConnTime) < PeerProbeThreshold { + continue // don't probe peers below the probe threshold } addrs := cab.addrBook.Addrs(p) if len(addrs) == 0 { - // No addresses to probe - continue + continue // no addresses to probe } - // If connect succeeds and identify runs, the background loop will update the peer state and cache - // TODO: introduce some concurrency - ctx, cancel := context.WithTimeout(ctx, time.Second*10) - defer cancel() - err := host.Connect(ctx, peer.AddrInfo{ - ID: p, - // TODO: Should we should probe the last connected address or all addresses? - Addrs: addrs, - }) - if err != nil { - logger.Warnf("failed to connect to peer %s: %v", p, err) - cab.peers[p].connectFailures.Add(1) - } + addrs = ma.FilterAddrs(addrs, manet.IsPublicAddr) + wg.Add(1) + go func() { + defer wg.Done() + ctx, cancel := context.WithTimeout(ctx, time.Second*10) + defer cancel() + // when connect succeeds and identify runs, the background loop will update the peer state and cache + err := host.Connect(ctx, peer.AddrInfo{ + ID: p, + // TODO: Should we should probe the last connected address or all addresses? + Addrs: addrs, + }) + if err != nil { + logger.Warnf("failed to connect to peer %s: %v", p, err) + cab.peers[p].connectFailures.Add(1) + cab.addrBook.ClearAddrs(p) + } + }() } + wg.Wait() } // Returns the cached addresses for a peer, incrementing the return count diff --git a/server_test.go b/server_test.go index 99c10b2..42ee31f 100644 --- a/server_test.go +++ b/server_test.go @@ -10,15 +10,15 @@ func TestGetCombinedRouting(t *testing.T) { t.Parallel() // Check of the result of get combined routing is a sanitize router. 
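	// Note on the expectations below: after this patch the DHT-backed stack is
	// cachedRouter wrapping sanitizeRouter wrapping libp2pRouter. With a nil
	// cachedAddrBook the cachedRouter layer is skipped, hence the first
	// sanitizeRouter assertion; when delegated endpoints are configured the
	// stack joins a parallelRouter, hence the two parallelRouter assertions.
	// Schematically (types as defined in this repo, values illustrative):
	//
	//	dhtRouter := cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook}
	//	combined := parallelRouter{routers: append(delegatedRouters, dhtRouter)}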
- v, err := getCombinedRouting(nil, &bundledDHT{}) + v, err := getCombinedRouting(nil, &bundledDHT{}, nil) require.NoError(t, err) require.IsType(t, sanitizeRouter{}, v) - v, err = getCombinedRouting([]string{"https://example.com/"}, nil) + v, err = getCombinedRouting([]string{"https://example.com/"}, nil, nil) require.NoError(t, err) - require.IsType(t, sanitizeRouter{}, v) + require.IsType(t, parallelRouter{}, v) - v, err = getCombinedRouting([]string{"https://example.com/"}, &bundledDHT{}) + v, err = getCombinedRouting([]string{"https://example.com/"}, &bundledDHT{}, nil) require.NoError(t, err) - require.IsType(t, sanitizeRouter{}, v) + require.IsType(t, parallelRouter{}, v) } From a4d64560a23ac31558c96bb910863b33f5f5e389 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 27 Nov 2024 21:40:32 +0100 Subject: [PATCH 10/80] feat: make everything a little bit better --- server_addr_book.go | 75 ++++++++++++++++++++++++++++------------- server_cached_router.go | 6 ++-- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/server_addr_book.go b/server_addr_book.go index f86b78e..b791f71 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -5,7 +5,6 @@ import ( "io" "math" "sync" - "sync/atomic" "time" "github.com/ipfs/boxo/routing/http/types" @@ -29,19 +28,27 @@ const ConnectedAddrTTL = math.MaxInt64 const PeerProbeThreshold = time.Hour // How often to run the probe peers function -const ProbeInterval = time.Minute * 15 +const ProbeInterval = time.Minute * 5 + +// How many concurrent probes to run at once +const MaxConcurrentProbes = 20 + +// How many connect failures to tolerate before clearing a peer's addresses +const MaxConnectFailures = 3 type peerState struct { lastConnTime time.Time // time we were connected to this peer lastConnAddr ma.Multiaddr // last address we connected to this peer on - returnCount atomic.Int32 // number of times we've returned this peer - connectFailures atomic.Int32 // number of times we've failed to connect to this peer + returnCount int // number of times we've returned this peer from the cache + lastReturnTime time.Time // time we last returned this peer from the cache + connectFailures int // number of times we've failed to connect to this peer } type cachedAddrBook struct { - peers map[peer.ID]*peerState // PeerID -> peer state - addrBook peerstore.AddrBook // PeerID -> []Multiaddr with TTL expirations - isProbing bool // Whether we are currently probing peers + mu sync.RWMutex // Add mutex for thread safety + peers map[peer.ID]*peerState + addrBook peerstore.AddrBook + isProbing bool } func newCachedAddrBook() *cachedAddrBook { @@ -82,10 +89,8 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { // Update the peer state with the last connected address and time if _, exists := cab.peers[ev.Peer]; !exists { cab.peers[ev.Peer] = &peerState{ - lastConnTime: time.Now(), - lastConnAddr: ev.Conn.RemoteMultiaddr(), - returnCount: atomic.Int32{}, - connectFailures: atomic.Int32{}, + lastConnTime: time.Now(), + lastConnAddr: ev.Conn.RemoteMultiaddr(), } } else { cab.peers[ev.Peer].lastConnTime = time.Now() @@ -127,6 +132,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { elapsed := time.Since(start) logger.Debugf("Finished peer probe in %s", elapsed) } + // TODO: Add some cleanup logic to remove peers that haven't been returned from the cache in a while or have failed to connect too many times } } @@ -136,6 +142,8 @@ func (cab *cachedAddrBook) 
probePeers(ctx context.Context, host host.Host) { defer func() { cab.isProbing = false }() wg := sync.WaitGroup{} + // semaphore channel to limit the number of concurrent probes + semaphore := make(chan struct{}, MaxConcurrentProbes) for i, p := range cab.addrBook.PeersWithAddrs() { logger.Debugf("Probe %d: PeerID: %s", i+1, p) @@ -143,9 +151,15 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { continue // don't probe connected peers } - if time.Since(cab.peers[p].lastConnTime) < PeerProbeThreshold { + peerState := cab.peers[p] + + if time.Since(peerState.lastConnTime) < PeerProbeThreshold { continue // don't probe peers below the probe threshold } + if peerState.connectFailures > MaxConnectFailures { + cab.addrBook.ClearAddrs(p) // clear the peer's addresses + continue // don't probe this peer + } addrs := cab.addrBook.Addrs(p) @@ -156,10 +170,15 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { addrs = ma.FilterAddrs(addrs, manet.IsPublicAddr) wg.Add(1) go func() { - defer wg.Done() + semaphore <- struct{}{} + defer func() { + <-semaphore // Release semaphore + wg.Done() + }() + ctx, cancel := context.WithTimeout(ctx, time.Second*10) defer cancel() - // when connect succeeds and identify runs, the background loop will update the peer state and cache + // if connect succeeds and identify runs, the background loop will take care of updating the peer state and cache err := host.Connect(ctx, peer.AddrInfo{ ID: p, // TODO: Should we should probe the last connected address or all addresses? @@ -167,8 +186,9 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { }) if err != nil { logger.Warnf("failed to connect to peer %s: %v", p, err) - cab.peers[p].connectFailures.Add(1) - cab.addrBook.ClearAddrs(p) + cab.mu.Lock() // Lock before accessing shared state + cab.peers[p].connectFailures++ + cab.mu.Unlock() } }() } @@ -176,13 +196,22 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { } // Returns the cached addresses for a peer, incrementing the return count -func (cab *cachedAddrBook) getCachedAddrs(p *peer.ID) []types.Multiaddr { - addrs := cab.addrBook.Addrs(*p) - cab.peers[*p].returnCount.Add(1) // increment the return count +func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { + cachedAddrs := cab.addrBook.Addrs(*p) + + if len(cachedAddrs) == 0 { + return nil + } + + cab.mu.Lock() // Lock before accessing shared state + defer cab.mu.Unlock() + // Peer state should already exist if it's in the addrbook + cab.peers[*p].returnCount++ + cab.peers[*p].lastReturnTime = time.Now() - var cachedAddrs []types.Multiaddr - for _, addr := range addrs { - cachedAddrs = append(cachedAddrs, types.Multiaddr{Multiaddr: addr}) + var result []types.Multiaddr // convert to local Multiaddr type 🙃 + for _, addr := range cachedAddrs { + result = append(result, types.Multiaddr{Multiaddr: addr}) } - return cachedAddrs + return result } diff --git a/server_cached_router.go b/server_cached_router.go index 8720a36..b41aaee 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -40,7 +40,7 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return v } if len(result.Addrs) == 0 { - cachedAddrs := r.cachedAddrBook.getCachedAddrs(result.ID) + cachedAddrs := r.cachedAddrBook.GetCachedAddrs(result.ID) logger.Debugw("no addresses found for peer, using cached addresses", "peer", result.ID, "cachedAddrs", cachedAddrs) result.Addrs = cachedAddrs } @@ 
-57,7 +57,7 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) } if len(result.Addrs) == 0 { - cachedAddrs := r.cachedAddrBook.getCachedAddrs(result.ID) + cachedAddrs := r.cachedAddrBook.GetCachedAddrs(result.ID) logger.Debugw("no addresses found for peer, using cached addresses", "peer", result.ID, "cachedAddrs", cachedAddrs) result.Addrs = cachedAddrs } @@ -81,7 +81,7 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it // If no addresses were found by router, use cached addresses if len(v.Val.Addrs) == 0 { - cachedAddrs := r.cachedAddrBook.getCachedAddrs(v.Val.ID) + cachedAddrs := r.cachedAddrBook.GetCachedAddrs(v.Val.ID) logger.Debugw("no addresses found for peer, using cached addresses", "peer", v.Val.ID, "cachedAddrs", cachedAddrs) v.Val.Addrs = cachedAddrs } From 81feca7c4df6e7c4798e85a96f5c63d3a4be3b51 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:58:32 +0100 Subject: [PATCH 11/80] chore: small refinements --- server_addr_book.go | 63 ++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/server_addr_book.go b/server_addr_book.go index b791f71..d2af473 100644 --- a/server_addr_book.go +++ b/server_addr_book.go @@ -18,36 +18,41 @@ import ( manet "github.com/multiformats/go-multiaddr/net" ) -// The TTL to keep recently connected peers for. This should be enough time to probe -const RecentlyConnectedAddrTTL = time.Hour * 24 +const ( + // The TTL to keep recently connected peers for. Same as DefaultProviderAddrTTL in go-libp2p-kad-dht + RecentlyConnectedAddrTTL = time.Hour * 24 -// Connected peers don't expire until they disconnect -const ConnectedAddrTTL = math.MaxInt64 + // Connected peers don't expire until they disconnect + ConnectedAddrTTL = math.MaxInt64 -// How long to wait since last connection before probing a peer again -const PeerProbeThreshold = time.Hour + // How long to wait since last connection before probing a peer again + PeerProbeThreshold = time.Hour -// How often to run the probe peers function -const ProbeInterval = time.Minute * 5 + // How often to run the probe peers function + ProbeInterval = time.Minute * 5 -// How many concurrent probes to run at once -const MaxConcurrentProbes = 20 + // How many concurrent probes to run at once + MaxConcurrentProbes = 20 -// How many connect failures to tolerate before clearing a peer's addresses -const MaxConnectFailures = 3 + // How many connect failures to tolerate before clearing a peer's addresses + MaxConnectFailures = 3 + + // How long to wait for a connect in a probe to complete + ConnectTimeout = time.Second * 10 +) type peerState struct { - lastConnTime time.Time // time we were connected to this peer + lastConnTime time.Time // last time we successfully connected to this peer lastConnAddr ma.Multiaddr // last address we connected to this peer on returnCount int // number of times we've returned this peer from the cache - lastReturnTime time.Time // time we last returned this peer from the cache + lastReturnTime time.Time // last time we returned this peer from the cache connectFailures int // number of times we've failed to connect to this peer } type cachedAddrBook struct { - mu sync.RWMutex // Add mutex for thread safety - peers map[peer.ID]*peerState addrBook peerstore.AddrBook + peers map[peer.ID]*peerState + mu sync.RWMutex // Add mutex for thread safety isProbing bool } @@ -87,15 +92,13 @@ func (cab *cachedAddrBook) background(ctx 
context.Context, host host.Host) { switch ev := ev.(type) { case event.EvtPeerIdentificationCompleted: // Update the peer state with the last connected address and time - if _, exists := cab.peers[ev.Peer]; !exists { - cab.peers[ev.Peer] = &peerState{ - lastConnTime: time.Now(), - lastConnAddr: ev.Conn.RemoteMultiaddr(), - } - } else { - cab.peers[ev.Peer].lastConnTime = time.Now() - cab.peers[ev.Peer].lastConnAddr = ev.Conn.RemoteMultiaddr() + pState, exists := cab.peers[ev.Peer] + if !exists { + pState = &peerState{} + cab.peers[ev.Peer] = pState } + pState.lastConnTime = time.Now() + pState.lastConnAddr = ev.Conn.RemoteMultiaddr() if ev.SignedPeerRecord != nil { logger.Debug("Caching signed peer record") @@ -146,7 +149,6 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { semaphore := make(chan struct{}, MaxConcurrentProbes) for i, p := range cab.addrBook.PeersWithAddrs() { - logger.Debugf("Probe %d: PeerID: %s", i+1, p) if host.Network().Connectedness(p) == network.Connected || host.Network().Connectedness(p) == network.Limited { continue // don't probe connected peers } @@ -162,12 +164,12 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { } addrs := cab.addrBook.Addrs(p) + addrs = ma.FilterAddrs(addrs, manet.IsPublicAddr) if len(addrs) == 0 { continue // no addresses to probe } - addrs = ma.FilterAddrs(addrs, manet.IsPublicAddr) wg.Add(1) go func() { semaphore <- struct{}{} @@ -176,8 +178,9 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { wg.Done() }() - ctx, cancel := context.WithTimeout(ctx, time.Second*10) + ctx, cancel := context.WithTimeout(ctx, ConnectTimeout) defer cancel() + logger.Debugf("Probe %d: PeerID: %s, Addrs: %v", i+1, p, addrs) // if connect succeeds and identify runs, the background loop will take care of updating the peer state and cache err := host.Connect(ctx, peer.AddrInfo{ ID: p, @@ -185,7 +188,7 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { Addrs: addrs, }) if err != nil { - logger.Warnf("failed to connect to peer %s: %v", p, err) + logger.Debugf("failed to connect to peer %s: %v", p, err) cab.mu.Lock() // Lock before accessing shared state cab.peers[p].connectFailures++ cab.mu.Unlock() @@ -204,10 +207,10 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { } cab.mu.Lock() // Lock before accessing shared state - defer cab.mu.Unlock() - // Peer state should already exist if it's in the addrbook + // Peer state already exists if it's in the addrbook so no need to check cab.peers[*p].returnCount++ cab.peers[*p].lastReturnTime = time.Now() + defer cab.mu.Unlock() var result []types.Multiaddr // convert to local Multiaddr type 🙃 for _, addr := range cachedAddrs { From e75992f39f1e9fea71acbeb9c7c7878571ac0746 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:58:47 +0100 Subject: [PATCH 12/80] test: add test for cached addr book --- server_addr_book_test.go | 165 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 server_addr_book_test.go diff --git a/server_addr_book_test.go b/server_addr_book_test.go new file mode 100644 index 0000000..5e9581a --- /dev/null +++ b/server_addr_book_test.go @@ -0,0 +1,165 @@ +package main + +import ( + "context" + "testing" + "time" + + "github.com/libp2p/go-libp2p" + "github.com/libp2p/go-libp2p/core/event" + "github.com/libp2p/go-libp2p/core/network" + "github.com/libp2p/go-libp2p/core/peer" + ma 
"github.com/multiformats/go-multiaddr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCachedAddrBook(t *testing.T) { + // Create a new cached address book + cab := newCachedAddrBook() + require.NotNil(t, cab) + require.NotNil(t, cab.peers) + require.NotNil(t, cab.addrBook) +} + +func TestGetCachedAddrs(t *testing.T) { + cab := newCachedAddrBook() + + // Create a test peer with new PeerID + testPeer, err := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") + require.NoError(t, err) + + // Add test addresses + addr1, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") + addr2, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/5678") + cab.addrBook.AddAddrs(testPeer, []ma.Multiaddr{addr1, addr2}, time.Hour) + + // Initialize peer state + cab.peers[testPeer] = &peerState{} + + // Test getting addresses + addrs := cab.GetCachedAddrs(&testPeer) + assert.Len(t, addrs, 2) + + // Verify return count and time were updated + assert.Equal(t, 1, cab.peers[testPeer].returnCount) + assert.False(t, cab.peers[testPeer].lastReturnTime.IsZero()) +} + +func TestBackground(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Create a test libp2p host + h, err := libp2p.New(libp2p.ListenAddrStrings("/ip4/127.0.0.1/tcp/0")) + require.NoError(t, err) + defer h.Close() + + em, err := h.EventBus().Emitter(new(event.EvtPeerIdentificationCompleted)) + require.NoError(t, err) + defer em.Close() + + cab := newCachedAddrBook() + + // Create a channel to signal when background processing is ready + ready := make(chan struct{}) + + // Start background process with ready signal + go func() { + // Signal ready before starting background process + close(ready) + cab.background(ctx, h) + }() + + // Wait for background process to start + <-ready + + // Create a test peer with new PeerID + testPeer, err := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") + require.NoError(t, err) + + // Simulate peer identification event + addr, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") + + // Use a channel to wait for event processing + done := make(chan struct{}) + go func() { + defer close(done) + // Check periodically until the peer is added or timeout (after 50 * 10ms) + for i := 0; i < 50; i++ { + cab.mu.RLock() + peerState, exists := cab.peers[testPeer] + if exists { + assert.Equal(t, addr, peerState.lastConnAddr) + cab.mu.RUnlock() + return + } + cab.mu.RUnlock() + time.Sleep(10 * time.Millisecond) + } + }() + + // Emit the event after setting up the waiter + err = em.Emit(event.EvtPeerIdentificationCompleted{ + Peer: testPeer, + Conn: &mockConnection{ + remoteAddr: addr, + }, + ListenAddrs: []ma.Multiaddr{addr}, + }) + require.NoError(t, err) + + // Wait for processing with timeout + select { + case <-done: + // Success case - continue to verification + case <-time.After(time.Second): + t.Fatal("timeout waiting for peer to be added") + } + + // Verify peer was added + cab.mu.RLock() + peerState, exists := cab.peers[testPeer] + assert.True(t, exists) + assert.NotNil(t, peerState) + assert.Equal(t, addr, peerState.lastConnAddr) + cab.mu.RUnlock() +} + +func TestProbePeers(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Create a test libp2p host + h, err := libp2p.New() + require.NoError(t, err) + defer h.Close() + + cab := newCachedAddrBook() + + // Add a test peer with some addresses + testPeer, _ := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") 
+ addr, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") + cab.addrBook.AddAddrs(testPeer, []ma.Multiaddr{addr}, time.Hour) + + // Initialize peer state with old connection time + cab.peers[testPeer] = &peerState{ + lastConnTime: time.Now().Add(-2 * PeerProbeThreshold), + } + + // Run probe + cab.probePeers(ctx, h) + + // Verify connect failures increased (since connection will fail in test) + assert.Equal(t, 1, cab.peers[testPeer].connectFailures) +} + +// Mock connection for testing +type mockConnection struct { + network.Conn + remoteAddr ma.Multiaddr +} + +func (mc *mockConnection) RemoteMultiaddr() ma.Multiaddr { + return mc.remoteAddr +} From a20a4c37dea76e2449adb7110e66e72cc2464108 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:59:56 +0100 Subject: [PATCH 13/80] chore: rename files --- server_addr_book.go => cached_addr_book.go | 0 server_addr_book_test.go => cached_addr_book_test.go | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename server_addr_book.go => cached_addr_book.go (100%) rename server_addr_book_test.go => cached_addr_book_test.go (100%) diff --git a/server_addr_book.go b/cached_addr_book.go similarity index 100% rename from server_addr_book.go rename to cached_addr_book.go diff --git a/server_addr_book_test.go b/cached_addr_book_test.go similarity index 100% rename from server_addr_book_test.go rename to cached_addr_book_test.go From c5f1d6290a48f9b042bd925bfadc9c1f4dac15f9 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:23:04 +0100 Subject: [PATCH 14/80] feat: add options to cached addr book fix test by allowing private ips --- cached_addr_book.go | 38 ++++++++++++++++++++++++++++++-------- cached_addr_book_test.go | 12 ++++++++---- server.go | 5 ++++- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index d2af473..72d2311 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -50,17 +50,35 @@ type peerState struct { } type cachedAddrBook struct { - addrBook peerstore.AddrBook - peers map[peer.ID]*peerState - mu sync.RWMutex // Add mutex for thread safety - isProbing bool + addrBook peerstore.AddrBook + peers map[peer.ID]*peerState + mu sync.RWMutex // Add mutex for thread safety + isProbing bool + allowPrivateIPs bool // for testing } -func newCachedAddrBook() *cachedAddrBook { - return &cachedAddrBook{ +type AddrBookOption func(*cachedAddrBook) error + +func WithAllowPrivateIPs() AddrBookOption { + return func(cab *cachedAddrBook) error { + cab.allowPrivateIPs = true + return nil + } +} + +func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { + cab := &cachedAddrBook{ peers: make(map[peer.ID]*peerState), addrBook: pstoremem.NewAddrBook(), } + + for _, opt := range opts { + err := opt(cab) + if err != nil { + return nil, err + } + } + return cab, nil } func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { @@ -149,7 +167,8 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { semaphore := make(chan struct{}, MaxConcurrentProbes) for i, p := range cab.addrBook.PeersWithAddrs() { - if host.Network().Connectedness(p) == network.Connected || host.Network().Connectedness(p) == network.Limited { + connectedness := host.Network().Connectedness(p) + if connectedness == network.Connected || connectedness == network.Limited { continue // don't probe connected peers } @@ -164,7 +183,10 @@ func (cab *cachedAddrBook) probePeers(ctx 
context.Context, host host.Host) { } addrs := cab.addrBook.Addrs(p) - addrs = ma.FilterAddrs(addrs, manet.IsPublicAddr) + + if !cab.allowPrivateIPs { + addrs = ma.FilterAddrs(addrs, manet.IsPublicAddr) + } if len(addrs) == 0 { continue // no addresses to probe diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index 5e9581a..5297eb0 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -16,14 +16,16 @@ import ( func TestCachedAddrBook(t *testing.T) { // Create a new cached address book - cab := newCachedAddrBook() + cab, err := newCachedAddrBook(WithAllowPrivateIPs()) + require.NoError(t, err) require.NotNil(t, cab) require.NotNil(t, cab.peers) require.NotNil(t, cab.addrBook) } func TestGetCachedAddrs(t *testing.T) { - cab := newCachedAddrBook() + cab, err := newCachedAddrBook(WithAllowPrivateIPs()) + require.NoError(t, err) // Create a test peer with new PeerID testPeer, err := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") @@ -59,7 +61,8 @@ func TestBackground(t *testing.T) { require.NoError(t, err) defer em.Close() - cab := newCachedAddrBook() + cab, err := newCachedAddrBook(WithAllowPrivateIPs()) + require.NoError(t, err) // Create a channel to signal when background processing is ready ready := make(chan struct{}) @@ -135,7 +138,8 @@ func TestProbePeers(t *testing.T) { require.NoError(t, err) defer h.Close() - cab := newCachedAddrBook() + cab, err := newCachedAddrBook(WithAllowPrivateIPs()) + require.NoError(t, err) // Add a test peer with some addresses testPeer, _ := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") diff --git a/server.go b/server.go index b84b6c3..168cc63 100644 --- a/server.go +++ b/server.go @@ -85,7 +85,10 @@ func start(ctx context.Context, cfg *config) error { if cfg.cachedAddrBook { fmt.Println("Using cached address book to speed up peer discovery") - cachedAddrBook = newCachedAddrBook() + cachedAddrBook, err = newCachedAddrBook() + if err != nil { + return err + } go cachedAddrBook.background(ctx, h) } From e678be842787ced6f55edc3b00bfee1da5a07a08 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:27:55 +0100 Subject: [PATCH 15/80] feat: add instrumentation --- cached_addr_book.go | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 72d2311..a3ca713 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -16,6 +16,26 @@ import ( "github.com/libp2p/go-libp2p/p2p/host/peerstore/pstoremem" ma "github.com/multiformats/go-multiaddr" manet "github.com/multiformats/go-multiaddr/net" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + probeDurationHistogram = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "probe_duration_seconds", + Namespace: "someguy", + Subsystem: "cached_addr_book", + Help: "Duration of peer probing operations in seconds", + // Buckets optimized for expected probe durations from ms to full timeout + Buckets: []float64{0.5, 1, 2, 5, 10, 30, 60, 120}, + }) + + peerStateSize = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "peer_state_size", + Subsystem: "cached_addr_book", + Namespace: "someguy", + Help: "Number of peers object currently in the peer state", + }) ) const ( @@ -114,6 +134,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { if !exists { pState = &peerState{} cab.peers[ev.Peer] = pState + 
peerStateSize.Set(float64(len(cab.peers))) // update metric
 		}
 		pState.lastConnTime = time.Now()
 		pState.lastConnAddr = ev.Conn.RemoteMultiaddr()
@@ -147,11 +168,10 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) {
 				logger.Debug("Skipping peer probe, still running")
 				continue
 			}
-			logger.Debug("Running peer probe")
-			start := time.Now()
+			logger.Debug("Starting to probe peers")
+			cab.isProbing = true
 			cab.probePeers(ctx, host)
-			elapsed := time.Since(start)
-			logger.Debugf("Finished peer probe in %s", elapsed)
+			cab.isProbing = false
 		}
 		// TODO: Add some cleanup logic to remove peers that haven't been returned from the cache in a while or have failed to connect too many times
 	}
@@ -159,8 +179,12 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) {

 // Loops over all peers with addresses and probes them if they haven't been probed recently
 func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) {
-	cab.isProbing = true
-	defer func() { cab.isProbing = false }()
+	start := time.Now()
+	defer func() {
+		duration := time.Since(start).Seconds()
+		probeDurationHistogram.Observe(duration)
+		logger.Debugf("Finished probing peers in %.2fs", duration)
+	}()

From a0965bcfc5902bf49dc79aefe2eab4b2fcbcd213 Mon Sep 17 00:00:00 2001
From: Daniel N <2color@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:29:13 +0100
Subject: [PATCH 16/80] fix: thread safety

---
 cached_addr_book.go | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/cached_addr_book.go b/cached_addr_book.go
index a3ca713..dc6d212 100644
--- a/cached_addr_book.go
+++ b/cached_addr_book.go
@@ -5,6 +5,7 @@ import (
 	"io"
 	"math"
 	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/ipfs/boxo/routing/http/types"
@@ -73,7 +74,7 @@ type cachedAddrBook struct {
 	addrBook        peerstore.AddrBook
 	peers           map[peer.ID]*peerState
 	mu              sync.RWMutex // Add mutex for thread safety
-	isProbing       bool
+	isProbing       atomic.Bool
 	allowPrivateIPs bool // for testing
 }
@@ -129,15 +130,16 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) {
 		case ev := <-sub.Out():
 			switch ev := ev.(type) {
 			case event.EvtPeerIdentificationCompleted:
-				// Update the peer state with the last connected address and time
+				cab.mu.Lock()
 				pState, exists := cab.peers[ev.Peer]
 				if !exists {
 					pState = &peerState{}
 					cab.peers[ev.Peer] = pState
-					peerStateSize.Set(float64(len(cab.peers))) // update metric
+					peerStateSize.Set(float64(len(cab.peers)))
 				}
 				pState.lastConnTime = time.Now()
 				pState.lastConnAddr = ev.Conn.RemoteMultiaddr()
+				cab.mu.Unlock()

 				if ev.SignedPeerRecord != nil {
 					logger.Debug("Caching signed peer record")
@@ -164,14 +166,12 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) {
 			}
 		case <-probeTicker.C:
-			if cab.isProbing {
+			if cab.isProbing.Load() {
 				logger.Debug("Skipping peer probe, still running")
 				continue
 			}
 			logger.Debug("Starting to probe peers")
-			cab.isProbing = true
-			cab.probePeers(ctx, host)
-			cab.isProbing = false
+			go cab.probePeers(ctx, host)
 		}
 		// TODO: Add some cleanup logic to remove peers that haven't been returned from the cache in a while or have failed to connect too many times
 	}
@@ -179,6 +179,9 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) {

 // Loops over all peers with addresses and probes them if they haven't been probed recently
 func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) {
+
cab.isProbing.Store(true) + defer cab.isProbing.Store(false) + start := time.Now() defer func() { duration := time.Since(start).Seconds() @@ -196,16 +199,17 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { continue // don't probe connected peers } - peerState := cab.peers[p] - - if time.Since(peerState.lastConnTime) < PeerProbeThreshold { + cab.mu.RLock() + if time.Since(cab.peers[p].lastConnTime) < PeerProbeThreshold { + cab.mu.RUnlock() continue // don't probe peers below the probe threshold } - if peerState.connectFailures > MaxConnectFailures { + if cab.peers[p].connectFailures > MaxConnectFailures { cab.addrBook.ClearAddrs(p) // clear the peer's addresses - continue // don't probe this peer + cab.mu.RUnlock() + continue // don't probe this peer } - + cab.mu.RUnlock() addrs := cab.addrBook.Addrs(p) if !cab.allowPrivateIPs { From d82ad0f6ea527d18ae60882356a0cfd89af97f05 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:32:24 +0100 Subject: [PATCH 17/80] docs: update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58d1c11..4defe86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ The following emojis are used to highlight certain changes: ### Added +- Added a new `cachedAddrBook` implementation that caches peer addresses and probes them in the background. +- Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs. + ### Changed ### Removed From a84d5f68cc8640fbe6f31e44474768df249759d2 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:20:13 +0100 Subject: [PATCH 18/80] fix: small fixes --- cached_addr_book.go | 11 ++++++++--- cached_addr_book_test.go | 38 +++++++++++++++++--------------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index dc6d212..85fe697 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -139,6 +139,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { } pState.lastConnTime = time.Now() pState.lastConnAddr = ev.Conn.RemoteMultiaddr() + pState.connectFailures = 0 // reset connect failures on successful connection cab.mu.Unlock() if ev.SignedPeerRecord != nil { @@ -256,11 +257,15 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { return nil } - cab.mu.Lock() // Lock before accessing shared state - // Peer state already exists if it's in the addrbook so no need to check + cab.mu.Lock() + // Initialize peer state if it doesn't exist + if _, exists := cab.peers[*p]; !exists { + cab.peers[*p] = &peerState{} + peerStateSize.Set(float64(len(cab.peers))) + } cab.peers[*p].returnCount++ cab.peers[*p].lastReturnTime = time.Now() - defer cab.mu.Unlock() + cab.mu.Unlock() var result []types.Multiaddr // convert to local Multiaddr type 🙃 for _, addr := range cachedAddrs { diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index 5297eb0..be854b0 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -57,7 +57,7 @@ func TestBackground(t *testing.T) { require.NoError(t, err) defer h.Close() - em, err := h.EventBus().Emitter(new(event.EvtPeerIdentificationCompleted)) + em, err := h.EventBus().Emitter(&event.EvtPeerIdentificationCompleted{}) require.NoError(t, err) defer em.Close() @@ -84,24 +84,6 @@ func TestBackground(t *testing.T) { // Simulate peer identification 
event addr, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") - // Use a channel to wait for event processing - done := make(chan struct{}) - go func() { - defer close(done) - // Check periodically until the peer is added or timeout (after 50 * 10ms) - for i := 0; i < 50; i++ { - cab.mu.RLock() - peerState, exists := cab.peers[testPeer] - if exists { - assert.Equal(t, addr, peerState.lastConnAddr) - cab.mu.RUnlock() - return - } - cab.mu.RUnlock() - time.Sleep(10 * time.Millisecond) - } - }() - // Emit the event after setting up the waiter err = em.Emit(event.EvtPeerIdentificationCompleted{ Peer: testPeer, @@ -112,12 +94,26 @@ func TestBackground(t *testing.T) { }) require.NoError(t, err) + done := make(chan struct{}) + go func() { + defer close(done) + for { + cab.mu.RLock() + _, exists := cab.peers[testPeer] + cab.mu.RUnlock() + if exists { + return + } + time.Sleep(30 * time.Millisecond) + } + }() + // Wait for processing with timeout select { case <-done: // Success case - continue to verification - case <-time.After(time.Second): - t.Fatal("timeout waiting for peer to be added") + case <-time.After(time.Second * 5): + t.Fatal("timeout waiting for peer to be added to peer state") } // Verify peer was added From 9ab02e1d3935307169c8b2dca05317ccab20be66 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:36:15 +0100 Subject: [PATCH 19/80] fix: simplify cached router --- server_cached_router.go | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index b41aaee..16460a7 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -15,7 +15,7 @@ import ( var _ server.ContentRouter = cachedRouter{} -// cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers without multiaddrs +// cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers without multiaddrs in FindProviders type cachedRouter struct { router cachedAddrBook *cachedAddrBook @@ -40,9 +40,7 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return v } if len(result.Addrs) == 0 { - cachedAddrs := r.cachedAddrBook.GetCachedAddrs(result.ID) - logger.Debugw("no addresses found for peer, using cached addresses", "peer", result.ID, "cachedAddrs", cachedAddrs) - result.Addrs = cachedAddrs + result.Addrs = r.getMaddrsFromCache(result.ID) } v.Val = result @@ -57,9 +55,7 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) } if len(result.Addrs) == 0 { - cachedAddrs := r.cachedAddrBook.GetCachedAddrs(result.ID) - logger.Debugw("no addresses found for peer, using cached addresses", "peer", result.ID, "cachedAddrs", cachedAddrs) - result.Addrs = cachedAddrs + result.Addrs = r.getMaddrsFromCache(result.ID) } v.Val = result } @@ -69,27 +65,22 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) } func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { - it, err := r.router.FindPeers(ctx, pid, limit) - if err != nil { - return nil, err - } - - return iter.Map(it, func(v iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { - if v.Err != nil || v.Val == nil { - return v - } - - // If no addresses were found by router, use cached addresses - if len(v.Val.Addrs) == 0 { - cachedAddrs := r.cachedAddrBook.GetCachedAddrs(v.Val.ID) - logger.Debugw("no 
addresses found for peer, using cached addresses", "peer", v.Val.ID, "cachedAddrs", cachedAddrs)
-			v.Val.Addrs = cachedAddrs
-		}
-		return v
-	}), nil
+	// If FindPeers fails, it seems like there's no point returning results from the cache?
+	return r.router.FindPeers(ctx, pid, limit)
 }

 //lint:ignore SA1019 // ignore staticcheck
 func (r cachedRouter) ProvideBitswap(ctx context.Context, req *server.BitswapWriteProvideRequest) (time.Duration, error) {
 	return 0, routing.ErrNotSupported
 }
+
+// getMaddrsFromCache returns cached multiaddrs for the given peer ID, or nil if none are cached
+func (r cachedRouter) getMaddrsFromCache(pid *peer.ID) []types.Multiaddr {
+	cachedAddrs := r.cachedAddrBook.GetCachedAddrs(pid)
+	if len(cachedAddrs) > 0 {
+		logger.Debugw("found cached addresses", "peer", pid, "cachedAddrs", cachedAddrs)
+		return cachedAddrs
+	} else {
+		return nil
+	}
+}

From 9658af822a8cdd24cabe21a9e4179daa4f5c479f Mon Sep 17 00:00:00 2001
From: Marcin Rataj
Date: Thu, 28 Nov 2024 23:55:10 +0100
Subject: [PATCH 20/80] feat(metric): cached_router_peer_addr_lookups

This adds a metric for evaluating all addr lookups:

someguy_cached_router_peer_addr_lookups{cache="unused|hit|miss",origin="providers|peers"}

I've also wired up FindPeers for completeness.
---
 server_cached_router.go | 83 +++++++++++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 16 deletions(-)

diff --git a/server_cached_router.go b/server_cached_router.go
index 16460a7..afb6e28 100644
--- a/server_cached_router.go
+++ b/server_cached_router.go
@@ -11,9 +11,37 @@ import (
 	"github.com/ipfs/go-cid"
 	"github.com/libp2p/go-libp2p/core/peer"
 	"github.com/libp2p/go-libp2p/core/routing"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
 )

-var _ server.ContentRouter = cachedRouter{}
+var (
+	_ server.ContentRouter = cachedRouter{}
+
+	// peerAddrLookups allows us to reason about how effective the peer addr cache is
+	peerAddrLookups = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name:      "peer_addr_lookups",
+		Subsystem: "cached_router",
+		Namespace: "someguy",
+		Help:      "Number of peer addr info lookups per origin and cache state",
+	},
+		[]string{addrCacheStateLabel, addrQueryOriginLabel},
+	)
+)
+
+const (
+	// cache=unused|hit|miss, indicates how effective cache is
+	addrCacheStateLabel  = "cache"
+	addrCacheStateUnused = "unused"
+	addrCacheStateHit    = "hit"
+	addrCacheStateMiss   = "miss"
+
+	// source=providers|peers indicates if query originated from provider or peer endpoint
+	addrQueryOriginLabel     = "origin"
+	addrQueryOriginProviders = "providers"
+	addrQueryOriginPeers     = "peers"
+	addrQueryOriginUnknown   = "unknown"
+)

 // cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers without multiaddrs in FindProviders
 type cachedRouter struct {
@@ -26,12 +54,10 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int)
 	if err != nil {
 		return nil, err
 	}
-
 	return iter.Map(it, func(v iter.Result[types.Record]) iter.Result[types.Record] {
 		if v.Err != nil || v.Val == nil {
 			return v
 		}
-
 		switch v.Val.GetSchema() {
 		case types.SchemaPeer:
 			result, ok := v.Val.(*types.PeerRecord)
@@ -39,12 +65,8 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int)
 				logger.Errorw("problem casting find providers result", "Schema", v.Val.GetSchema(), "Type", reflect.TypeOf(v).String())
 				return v
 			}
-
-			if len(result.Addrs) == 0 {
-				result.Addrs = r.getMaddrsFromCache(result.ID)
-			}
-
+			result.Addrs =
r.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs)
 			v.Val = result
-
 		//lint:ignore SA1019 // ignore staticcheck
 		case types.SchemaBitswap:
 			//lint:ignore SA1019 // ignore staticcheck
 			result, ok := v.Val.(*types.BitswapRecord)
 			if !ok {
 				logger.Errorw("problem casting find providers result", "Schema", v.Val.GetSchema(), "Type", reflect.TypeOf(v).String())
 				return v
 			}
-
-			if len(result.Addrs) == 0 {
-				result.Addrs = r.getMaddrsFromCache(result.ID)
-			}
+			result.Addrs = r.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs)
 			v.Val = result
 		}
@@ -65,8 +84,31 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) {
-	// If FindPeers fails, it seems like there's no point returning results from the cache?
-	return r.router.FindPeers(ctx, pid, limit)
+	it, err := r.router.FindPeers(ctx, pid, limit)
+	if err != nil {
+		// check cache, if peer is unknown, return original error
+		cachedAddrs := r.withAddrsFromCache(addrQueryOriginPeers, &pid, nil)
+		if len(cachedAddrs) == 0 {
+			return nil, err
+		}
+		// if found in cache, return synthetic peer result based on cached addrs
+		var sliceIt iter.Iter[*types.PeerRecord] = iter.FromSlice([]*types.PeerRecord{&types.PeerRecord{
+			Schema: types.SchemaPeer,
+			ID:     &pid,
+			Addrs:  cachedAddrs,
+		}})
+		it = iter.ToResultIter(sliceIt)
+	}
+	return iter.Map(it, func(v iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] {
+		if v.Err != nil || v.Val == nil {
+			return v
+		}
+		switch v.Val.GetSchema() {
+		case types.SchemaPeer:
+			v.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, v.Val.ID, v.Val.Addrs)
+		}
+		return v
+	}), nil
 }

 //lint:ignore SA1019 // ignore staticcheck
@@ -74,13 +116,22 @@ func (r cachedRouter) ProvideBitswap(ctx context.Context, req *server.BitswapWri
 	return 0, routing.ErrNotSupported
 }

-// getMaddrsFromCache returns cached multiaddrs for the given peer ID, or nil if none are cached
-func (r cachedRouter) getMaddrsFromCache(pid *peer.ID) []types.Multiaddr {
+// withAddrsFromCache returns the best list of addrs for specified [peer.ID].
+// It will consult cache only if the addrs slice passed to it is empty.
+func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid *peer.ID, addrs []types.Multiaddr) []types.Multiaddr {
+	// skip cache if we already have addrs
+	if len(addrs) > 0 {
+		peerAddrLookups.WithLabelValues(addrCacheStateUnused, queryOrigin).Inc()
+		return addrs
+	}
+
 	cachedAddrs := r.cachedAddrBook.GetCachedAddrs(pid)
 	if len(cachedAddrs) > 0 {
 		logger.Debugw("found cached addresses", "peer", pid, "cachedAddrs", cachedAddrs)
+		peerAddrLookups.WithLabelValues(addrCacheStateHit, queryOrigin).Inc()
 		return cachedAddrs
 	} else {
+		peerAddrLookups.WithLabelValues(addrCacheStateMiss, queryOrigin).Inc()
 		return nil
 	}
 }

From 7cdb5be36d8c5e19c185f2ee4c65be138500d566 Mon Sep 17 00:00:00 2001
From: Daniel Norman <1992255+2color@users.noreply.github.com>
Date: Fri, 29 Nov 2024 16:33:42 +0100
Subject: [PATCH 21/80] Apply suggestions from code review

Co-authored-by: Marcin Rataj
---
 cached_addr_book.go | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cached_addr_book.go b/cached_addr_book.go
index 85fe697..0b4091c 100644
--- a/cached_addr_book.go
+++ b/cached_addr_book.go
@@ -40,17 +40,17 @@ const (
-	// The TTL to keep recently connected peers for. Same as DefaultProviderAddrTTL in go-libp2p-kad-dht
-	RecentlyConnectedAddrTTL = time.Hour * 24
+	// The TTL to keep recently connected peers for. Same as [amino.DefaultProvideValidity] in go-libp2p-kad-dht
+	RecentlyConnectedAddrTTL = amino.DefaultProvideValidity

 	// Connected peers don't expire until they disconnect
-	ConnectedAddrTTL = math.MaxInt64
+	ConnectedAddrTTL = peerstore.ConnectedAddrTTL

 	// How long to wait since last connection before probing a peer again
 	PeerProbeThreshold = time.Hour

 	// How often to run the probe peers function
-	ProbeInterval = time.Minute * 5
+	ProbeInterval = peerstore.RecentlyConnectedAddrTTL

 	// How many concurrent probes to run at once
 	MaxConcurrentProbes = 20
@@ -58,8 +58,9 @@ const (
 	// How many connect failures to tolerate before clearing a peer's addresses
 	MaxConnectFailures = 3

-	// How long to wait for a connect in a probe to complete
-	ConnectTimeout = time.Second * 10
+	// How long to wait for a connect in a probe to complete.
+	// The worst case is a peer behind Relay.
+	ConnectTimeout = relay.ConnectTimeout
 )

 type peerState struct {
@@ -190,7 +191,7 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) {
 		logger.Debugf("Finished probing peers in %.2fs", duration)
 	}()

-	wg := sync.WaitGroup{}
+	var wg sync.WaitGroup
 	// semaphore channel to limit the number of concurrent probes
 	semaphore := make(chan struct{}, MaxConcurrentProbes)

From 762136ec56b7493acfaf68afc54cc3e40de8bfbe Mon Sep 17 00:00:00 2001
From: Daniel Norman <1992255+2color@users.noreply.github.com>
Date: Fri, 29 Nov 2024 16:36:22 +0100
Subject: [PATCH 22/80] Update CHANGELOG.md

Co-authored-by: Marcin Rataj
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4defe86..de98046 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ The following emojis are used to highlight certain changes:

 ### Added

+- By default, cache discovered peer addresses for up to 48h, matching [provider record expiration on Amino DHT](https://github.com/libp2p/go-libp2p-kad-dht/blob/v0.28.1/amino/defaults.go#L40-L43). Someguy will use cached addresses when the default peerstore from go-libp2p has no information at hand. This can be controlled via `SOMEGUY_CACHED_ADDR_BOOK=true|false` (enabled by default)
 - Added a new `cachedAddrBook` implementation that caches peer addresses and probes them in the background.
 - Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs.
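To make the `cache` label introduced in server_cached_router.go easier to read, here is a minimal Go sketch of how the three states combine for a single lookup. The helper name cacheStateFor is hypothetical and not part of the patch series; the addrCacheState* constants and types.Multiaddr are the ones defined in the diffs above.

	// cacheStateFor is a hypothetical helper, shown only to illustrate the
	// accounting done in withAddrsFromCache: it names the cache state that
	// the peerAddrLookups counter records for one lookup.
	func cacheStateFor(routerAddrs, cachedAddrs []types.Multiaddr) string {
		if len(routerAddrs) > 0 {
			// the router already returned addrs, so the cache was never consulted
			return addrCacheStateUnused
		}
		if len(cachedAddrs) > 0 {
			// the router returned no addrs, but the cache filled the gap
			return addrCacheStateHit
		}
		// neither the router nor the cache had addrs for this peer
		return addrCacheStateMiss
	}

In other words, a rising "hit" rate means the cached address book is doing real work for provider and peer responses that would otherwise ship without multiaddrs.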
From 2cf46d433bfadd86b610aedae38d32d115512963 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 29 Nov 2024 16:46:14 +0100 Subject: [PATCH 23/80] chore: use service name for namespace --- cached_addr_book.go | 4 ++-- server_cached_router.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 0b4091c..47f0c3b 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -24,7 +24,7 @@ import ( var ( probeDurationHistogram = promauto.NewHistogram(prometheus.HistogramOpts{ Name: "probe_duration_seconds", - Namespace: "someguy", + Namespace: name, Subsystem: "cached_addr_book", Help: "Duration of peer probing operations in seconds", // Buckets optimized for expected probe durations from ms to full timeout @@ -34,7 +34,7 @@ var ( peerStateSize = promauto.NewGauge(prometheus.GaugeOpts{ Name: "peer_state_size", Subsystem: "cached_addr_book", - Namespace: "someguy", + Namespace: name, Help: "Number of peers object currently in the peer state", }) ) diff --git a/server_cached_router.go b/server_cached_router.go index afb6e28..90b22c8 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -22,7 +22,7 @@ var ( peerAddrLookups = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "peer_addr_lookups", Subsystem: "cached_router", - Namespace: "someguy", + Namespace: name, Help: "Number of peer addr info lookups per origin and cache state", }, []string{addrCacheStateLabel, addrQueryOriginLabel}, From a0d5c6237228496cb3d4cc8b227f9840c4512e69 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 29 Nov 2024 16:49:38 +0100 Subject: [PATCH 24/80] fix: type errors and missing imports --- cached_addr_book.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 47f0c3b..3268fb2 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -3,18 +3,19 @@ package main import ( "context" "io" - "math" "sync" "sync/atomic" "time" "github.com/ipfs/boxo/routing/http/types" + "github.com/libp2p/go-libp2p-kad-dht/amino" "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/p2p/host/peerstore/pstoremem" + "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/relay" ma "github.com/multiformats/go-multiaddr" manet "github.com/multiformats/go-multiaddr/net" "github.com/prometheus/client_golang/prometheus" @@ -49,8 +50,8 @@ const ( // How long to wait since last connection before probing a peer again PeerProbeThreshold = time.Hour - // How often to run the probe peers function - ProbeInterval = peerstore.RecentlyConnectedAddrTTL + // How often to run the probe peers function (Same as RecentlyConnectedAddrTTL) + ProbeInterval = time.Minute * 15 // How many concurrent probes to run at once MaxConcurrentProbes = 20 @@ -59,7 +60,7 @@ const ( MaxConnectFailures = 3 // How long to wait for a connect in a probe to complete. - // The worst case is a peer behind Relay. + // The worst case is a peer behind a relay, so we use the relay connect timeout. 
ConnectTimeout = relay.ConnectTimeout ) From 75f1bf270a9462bbe9a146c5f8887c12a8866810 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 29 Nov 2024 18:14:38 +0100 Subject: [PATCH 25/80] feat: add queue probe --- cached_addr_book.go | 5 ++++- go.mod | 1 + go.sum | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 3268fb2..e360dd1 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -7,6 +7,7 @@ import ( "sync/atomic" "time" + "github.com/gammazero/deque" "github.com/ipfs/boxo/routing/http/types" "github.com/libp2p/go-libp2p-kad-dht/amino" "github.com/libp2p/go-libp2p/core/event" @@ -77,7 +78,8 @@ type cachedAddrBook struct { peers map[peer.ID]*peerState mu sync.RWMutex // Add mutex for thread safety isProbing atomic.Bool - allowPrivateIPs bool // for testing + allowPrivateIPs bool // for testing + toProbe *deque.Deque[peer.ID] // queue of peer IDs to find and probe } type AddrBookOption func(*cachedAddrBook) error @@ -93,6 +95,7 @@ func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { cab := &cachedAddrBook{ peers: make(map[peer.ID]*peerState), addrBook: pstoremem.NewAddrBook(), + toProbe: &deque.Deque[peer.ID]{}, } for _, opt := range opts { diff --git a/go.mod b/go.mod index 5510198..3278a86 100644 --- a/go.mod +++ b/go.mod @@ -45,6 +45,7 @@ require ( github.com/elastic/gosigar v0.14.3 // indirect github.com/flynn/noise v1.1.0 // indirect github.com/francoispqt/gojay v1.2.13 // indirect + github.com/gammazero/deque v1.0.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect diff --git a/go.sum b/go.sum index 261b798..1dba25b 100644 --- a/go.sum +++ b/go.sum @@ -96,6 +96,8 @@ github.com/francoispqt/gojay v1.2.13/go.mod h1:ehT5mTG4ua4581f1++1WLG0vPdaA9HaiD github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/gammazero/deque v1.0.0 h1:LTmimT8H7bXkkCy6gZX7zNLtkbz4NdS2z8LZuor3j34= +github.com/gammazero/deque v1.0.0/go.mod h1:iflpYvtGfM3U8S8j+sZEKIak3SAKYpA5/SQewgfXDKo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gliderlabs/ssh v0.1.1/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98= From 4cbaa911559f83d1ae7863bda7da21aa030013d8 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 29 Nov 2024 18:15:59 +0100 Subject: [PATCH 26/80] Revert "feat: add queue probe" This reverts commit 75f1bf270a9462bbe9a146c5f8887c12a8866810. 
--- cached_addr_book.go | 5 +---- go.mod | 1 - go.sum | 2 -- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index e360dd1..3268fb2 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -7,7 +7,6 @@ import ( "sync/atomic" "time" - "github.com/gammazero/deque" "github.com/ipfs/boxo/routing/http/types" "github.com/libp2p/go-libp2p-kad-dht/amino" "github.com/libp2p/go-libp2p/core/event" @@ -78,8 +77,7 @@ type cachedAddrBook struct { peers map[peer.ID]*peerState mu sync.RWMutex // Add mutex for thread safety isProbing atomic.Bool - allowPrivateIPs bool // for testing - toProbe *deque.Deque[peer.ID] // queue of peer IDs to find and probe + allowPrivateIPs bool // for testing } type AddrBookOption func(*cachedAddrBook) error @@ -95,7 +93,6 @@ func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { cab := &cachedAddrBook{ peers: make(map[peer.ID]*peerState), addrBook: pstoremem.NewAddrBook(), - toProbe: &deque.Deque[peer.ID]{}, } for _, opt := range opts { diff --git a/go.mod b/go.mod index 3278a86..5510198 100644 --- a/go.mod +++ b/go.mod @@ -45,7 +45,6 @@ require ( github.com/elastic/gosigar v0.14.3 // indirect github.com/flynn/noise v1.1.0 // indirect github.com/francoispqt/gojay v1.2.13 // indirect - github.com/gammazero/deque v1.0.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect diff --git a/go.sum b/go.sum index 1dba25b..261b798 100644 --- a/go.sum +++ b/go.sum @@ -96,8 +96,6 @@ github.com/francoispqt/gojay v1.2.13/go.mod h1:ehT5mTG4ua4581f1++1WLG0vPdaA9HaiD github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/gammazero/deque v1.0.0 h1:LTmimT8H7bXkkCy6gZX7zNLtkbz4NdS2z8LZuor3j34= -github.com/gammazero/deque v1.0.0/go.mod h1:iflpYvtGfM3U8S8j+sZEKIak3SAKYpA5/SQewgfXDKo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gliderlabs/ssh v0.1.1/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98= From d0383018a98adb754c52cb79521bd42e916c171f Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:36:06 +0100 Subject: [PATCH 27/80] chore: simplify composite literal --- server_cached_router.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server_cached_router.go b/server_cached_router.go index 90b22c8..9149fdb 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -92,7 +92,7 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it return nil, err } // if found in cache, return synthetic peer result based on cached addrs - var sliceIt iter.Iter[*types.PeerRecord] = iter.FromSlice([]*types.PeerRecord{&types.PeerRecord{ + var sliceIt iter.Iter[*types.PeerRecord] = iter.FromSlice([]*types.PeerRecord{{ Schema: types.SchemaPeer, ID: &pid, Addrs: cachedAddrs, From 796e94fd7dbf605db437083cbca63724b8ecd877 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:35:03 +0100 Subject: [PATCH 28/80] fix: implement custom cache fallback iterator --- cached_addr_book_test.go | 1 + server.go | 2 +- 
server_cached_router.go | 205 +++++++++++++++------- server_cached_router_test.go | 323 +++++++++++++++++++++++++++++++++++ 4 files changed, 467 insertions(+), 64 deletions(-) create mode 100644 server_cached_router_test.go diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index be854b0..0496ac9 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -49,6 +49,7 @@ func TestGetCachedAddrs(t *testing.T) { } func TestBackground(t *testing.T) { + t.Skip("skipping until we have a better way to test background") ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/server.go b/server.go index 168cc63..ceafe49 100644 --- a/server.go +++ b/server.go @@ -236,7 +236,7 @@ func getCombinedRouting(endpoints []string, dht routing.Routing, cachedAddrBook var dhtRouter router if cachedAddrBook != nil { - dhtRouter = cachedRouter{sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook} + dhtRouter = NewCachedRouter(sanitizeRouter{libp2pRouter{routing: dht}}, cachedAddrBook) } else { dhtRouter = sanitizeRouter{libp2pRouter{routing: dht}} } diff --git a/server_cached_router.go b/server_cached_router.go index 9149fdb..a2f96f2 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -2,21 +2,19 @@ package main import ( "context" - "reflect" - "time" + "sync/atomic" - "github.com/ipfs/boxo/routing/http/server" "github.com/ipfs/boxo/routing/http/types" "github.com/ipfs/boxo/routing/http/types/iter" "github.com/ipfs/go-cid" "github.com/libp2p/go-libp2p/core/peer" - "github.com/libp2p/go-libp2p/core/routing" + ma "github.com/multiformats/go-multiaddr" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) var ( - _ server.ContentRouter = cachedRouter{} + _ router = cachedRouter{} // peerAddrLookups allows us reason if/how effective peer addr cache is peerAddrLookups = promauto.NewCounterVec(prometheus.CounterOpts{ @@ -49,72 +47,23 @@ type cachedRouter struct { cachedAddrBook *cachedAddrBook } +func NewCachedRouter(router router, cab *cachedAddrBook) cachedRouter { + return cachedRouter{router, cab} +} + func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) (iter.ResultIter[types.Record], error) { it, err := r.router.FindProviders(ctx, key, limit) if err != nil { return nil, err } - return iter.Map(it, func(v iter.Result[types.Record]) iter.Result[types.Record] { - if v.Err != nil || v.Val == nil { - return v - } - switch v.Val.GetSchema() { - case types.SchemaPeer: - result, ok := v.Val.(*types.PeerRecord) - if !ok { - logger.Errorw("problem casting find providers result", "Schema", v.Val.GetSchema(), "Type", reflect.TypeOf(v).String()) - return v - } - result.Addrs = r.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs) - v.Val = result - //lint:ignore SA1019 // ignore staticcheck - case types.SchemaBitswap: - //lint:ignore SA1019 // ignore staticcheck - result, ok := v.Val.(*types.BitswapRecord) - if !ok { - logger.Errorw("problem casting find providers result", "Schema", v.Val.GetSchema(), "Type", reflect.TypeOf(v).String()) - return v - } - result.Addrs = r.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs) - v.Val = result - } - return v - }), nil -} - -func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { - it, err := r.router.FindPeers(ctx, pid, limit) - if err != nil { - // check cache, if peer is unknown, return original error - cachedAddrs := 
r.withAddrsFromCache(addrQueryOriginPeers, &pid, nil) - if len(cachedAddrs) == 0 { - return nil, err - } - // if found in cache, return synthetic peer result based on cached addrs - var sliceIt iter.Iter[*types.PeerRecord] = iter.FromSlice([]*types.PeerRecord{{ - Schema: types.SchemaPeer, - ID: &pid, - Addrs: cachedAddrs, - }}) - it = iter.ToResultIter(sliceIt) - } - return iter.Map(it, func(v iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { - if v.Err != nil || v.Val == nil { - return v - } - switch v.Val.GetSchema() { - case types.SchemaPeer: - v.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, v.Val.ID, v.Val.Addrs) - } - return v - }), nil + return NewCacheFallbackIter(it, r, ctx), nil // create a new iterator that will use cache if available and fallback to `FindPeer` if no addresses are cached } -//lint:ignore SA1019 // ignore staticcheck -func (r cachedRouter) ProvideBitswap(ctx context.Context, req *server.BitswapWriteProvideRequest) (time.Duration, error) { - return 0, routing.ErrNotSupported -} +// TODO: Open question: should we implement FindPeers to look up cache? If a FindPeer fails to return any peers, the peer is likely long offline. +// func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { +// return r.router.FindPeers(ctx, pid, limit) +// } // withAddrsFromCache returns the best list of addrs for specified [peer.ID]. // It will consult cache only if the addrs slice passed to it is empty. @@ -131,7 +80,137 @@ func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid *peer.ID, addrs peerAddrLookups.WithLabelValues(addrCacheStateHit, queryOrigin).Inc() return cachedAddrs } else { + // Cache miss. Queue peer for lookup. peerAddrLookups.WithLabelValues(addrCacheStateMiss, queryOrigin).Inc() return nil } } + +var _ iter.ResultIter[types.Record] = &cacheFallbackIter{} + +// cacheFallbackIter is a custom iterator that will resolve peers with no addresses from cache and if no cached addresses, will look them up via FindPeers. 
+type cacheFallbackIter struct { + sourceIter iter.ResultIter[types.Record] + current iter.Result[types.Record] + findPeersResult chan *types.PeerRecord + router cachedRouter + ctx context.Context + ongoingLookups atomic.Int32 +} + +func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cachedRouter, ctx context.Context) *cacheFallbackIter { + return &cacheFallbackIter{ + sourceIter: sourceIter, + router: router, + ctx: ctx, + + findPeersResult: make(chan *types.PeerRecord, 1), + ongoingLookups: atomic.Int32{}, + } +} + +func (it *cacheFallbackIter) Next() bool { + select { + case <-it.ctx.Done(): + return false + case foundPeer := <-it.findPeersResult: + // read from channel if available + it.current = iter.Result[types.Record]{Val: foundPeer} + return true + default: + // load up current val from source iterator and avoid blocking on channel + if it.sourceIter.Next() { + val := it.sourceIter.Val() + switch val.Val.GetSchema() { + case types.SchemaBitswap: + result, ok := val.Val.(*types.BitswapRecord) + if !ok { + it.current = val + return true // pass these through + } + result.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs) + if result.Addrs != nil { + it.current = iter.Result[types.Record]{Val: result} + return true + } else { + // no cached addrs, queue for lookup and try to get the next value from the source iterator + go it.dispatchFindPeer(*result.ID) + if it.sourceIter.Next() { + it.current = it.sourceIter.Val() + return true + } else { + return it.ongoingLookups.Load() > 0 // if the source iterator is exhausted, check if there are any peers left to look up + } + } + + case types.SchemaPeer: + result, ok := val.Val.(*types.PeerRecord) + if !ok { + it.current = val + return true // pass these through + } + result.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs) + if result.Addrs != nil { + it.current = iter.Result[types.Record]{Val: result} + return true + } else { + // no cached addrs, queue for lookup and try to get the next value from the source iterator + go it.dispatchFindPeer(*result.ID) + if it.sourceIter.Next() { + it.current = it.sourceIter.Val() + return true + } else { + return it.ongoingLookups.Load() > 0 // if the source iterator is exhausted, check if there are any peers left to look up + } + } + } + } + // source iterator is exhausted, check if there are any peers left to look up + if it.ongoingLookups.Load() > 0 { + // if there are any ongoing lookups, return true to keep iterating + return true + } + // if there are no ongoing lookups and the source iterator is exhausted, we're done + return false + } +} + +func (it *cacheFallbackIter) dispatchFindPeer(pid peer.ID) { + it.ongoingLookups.Add(1) + defer it.ongoingLookups.Add(-1) + // FindPeers is weird in that it accepts a limit. But we only want one result, ideally from the libp2p router. 
+ peersIt, err := it.router.FindPeers(it.ctx, pid, 1) + + if err != nil { + logger.Errorw("error looking up peer", "peer", pid, "error", err) + return + } + peers, err := iter.ReadAllResults(peersIt) + if err != nil { + logger.Errorw("error reading find peers results", "peer", pid, "error", err) + return + } + if len(peers) > 0 { + it.findPeersResult <- peers[0] + } else { + logger.Errorw("no peer was found in cachedFallbackIter", "peer", pid) + } +} + +func (it *cacheFallbackIter) Val() iter.Result[types.Record] { + return it.current +} + +func (it *cacheFallbackIter) Close() error { + it.ctx.Cancel() + close(it.findPeersResult) + return it.sourceIter.Close() +} + +func ToMultiaddrs(addrs []ma.Multiaddr) []types.Multiaddr { + var result []types.Multiaddr + for _, addr := range addrs { + result = append(result, types.Multiaddr{Multiaddr: addr}) + } + return result +} diff --git a/server_cached_router_test.go b/server_cached_router_test.go new file mode 100644 index 0000000..ad52eb4 --- /dev/null +++ b/server_cached_router_test.go @@ -0,0 +1,323 @@ +package main + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/ipfs/boxo/routing/http/types" + "github.com/ipfs/boxo/routing/http/types/iter" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/multiformats/go-multiaddr" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +func TestCachedRouter(t *testing.T) { + t.Parallel() + + t.Run("FindProviders with cached addresses", func(t *testing.T) { + ctx := context.Background() + c := makeCID() + pid := peer.ID("test-peer") + + // Create mock router + mr := &mockRouter{} + mockIter := newMockIter[types.Record](ctx) + mr.On("FindProviders", mock.Anything, c, 10).Return(mockIter, nil) + + // Create cached address book with test addresses + cab, err := newCachedAddrBook() + require.NoError(t, err) + + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) + + // Create cached router + cr := NewCachedRouter(mr, cab) + + // Simulate provider response without addresses + go func() { + mockIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ + Schema: "peer", + ID: &pid, + Addrs: nil, // No addresses in response + }} + close(mockIter.ch) + }() + + it, err := cr.FindProviders(ctx, c, 10) + require.NoError(t, err) + + results, err := iter.ReadAllResults(it) + require.NoError(t, err) + require.Len(t, results, 1) + + // Verify cached addresses were added + peerRecord := results[0].(*types.PeerRecord) + require.Equal(t, pid, *peerRecord.ID) + require.Len(t, peerRecord.Addrs, 1) + require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) + }) + + t.Run("FindPeers with cache hit", func(t *testing.T) { + t.Skip("skipping until we decide if FindPeers should look up cache") + ctx := context.Background() + pid := peer.ID("test-peer") + + // Create mock router that returns error + mr := &mockRouter{} + mr.On("FindPeers", mock.Anything, pid, 10).Return(nil, errors.New("peer not found")) + + // Create cached address book with test addresses + cab, err := newCachedAddrBook() + require.NoError(t, err) + + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) + + // Create cached router + cr := NewCachedRouter(mr, cab) + + it, err := cr.FindPeers(ctx, pid, 10) + require.NoError(t, err) + + results, err := iter.ReadAllResults(it) + require.NoError(t, err) + require.Len(t, 
results, 1) + + // Verify cached addresses were returned + require.Equal(t, pid, *results[0].ID) + require.Len(t, results[0].Addrs, 1) + require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) + }) + + t.Run("FindPeers with cache miss", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + + // Create mock router + mr := &mockRouter{} + mockIter := newMockIter[*types.PeerRecord](ctx) + mr.On("FindPeers", mock.Anything, pid, 10).Return(mockIter, nil) + + // Create empty cached address book + cab, err := newCachedAddrBook() + require.NoError(t, err) + + // Create cached router + cr := NewCachedRouter(mr, cab) + + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Simulate peer response with addresses + go func() { + mockIter.ch <- iter.Result[*types.PeerRecord]{Val: &types.PeerRecord{ + Schema: "peer", + ID: &pid, + Addrs: []types.Multiaddr{publicAddr}, + }} + close(mockIter.ch) + }() + + it, err := cr.FindPeers(ctx, pid, 10) + require.NoError(t, err) + + results, err := iter.ReadAllResults(it) + require.NoError(t, err) + require.Len(t, results, 1) + + // Verify addresses from response were returned + require.Equal(t, pid, *results[0].ID) + require.Len(t, results[0].Addrs, 1) + require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) + }) + +} + +func TestCacheFallbackIter(t *testing.T) { + t.Parallel() + + t.Run("handles source iterator with no fallback needed", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create source iterator with addresses + sourceIter := newMockIter[types.Record](ctx) + go func() { + sourceIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ + Schema: "peer", + ID: &pid, + Addrs: []types.Multiaddr{publicAddr}, + }} + close(sourceIter.ch) + }() + + // Create cached router + mr := &mockRouter{} + cab, err := newCachedAddrBook() + require.NoError(t, err) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Read all results + results, err := iter.ReadAllResults(fallbackIter) + require.NoError(t, err) + require.Len(t, results, 1) + + peerRecord := results[0].(*types.PeerRecord) + require.Equal(t, pid, *peerRecord.ID) + require.Len(t, peerRecord.Addrs, 1) + require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) + }) + + t.Run("uses cache when source has no addresses", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create source iterator without addresses + sourceIter := newMockIter[types.Record](ctx) + go func() { + sourceIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ + Schema: "peer", + ID: &pid, + Addrs: nil, + }} + close(sourceIter.ch) + }() + + // Create cached router with cached addresses + mr := &mockRouter{} + cab, err := newCachedAddrBook() + require.NoError(t, err) + cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Read all results + results, err := iter.ReadAllResults(fallbackIter) + require.NoError(t, err) + require.Len(t, results, 1) + + peerRecord := results[0].(*types.PeerRecord) + require.Equal(t, pid, *peerRecord.ID) + require.Len(t, peerRecord.Addrs, 1) + require.Equal(t, publicAddr.String(), 
peerRecord.Addrs[0].String()) + }) + + t.Run("falls back to FindPeers when cache misses", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create source iterator without addresses + sourceIter := newMockIter[types.Record](ctx) + go func() { + sourceIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ + Schema: "peer", + ID: &pid, + Addrs: nil, + }} + close(sourceIter.ch) + }() + + // Create mock router that returns addresses via FindPeers + mr := &mockRouter{} + findPeersIter := newMockIter[*types.PeerRecord](ctx) + mr.On("FindPeers", mock.Anything, pid, 1).Return(findPeersIter, nil) + go func() { + findPeersIter.ch <- iter.Result[*types.PeerRecord]{Val: &types.PeerRecord{ + Schema: "peer", + ID: &pid, + Addrs: []types.Multiaddr{publicAddr}, + }} + close(findPeersIter.ch) + }() + + // Create cached router with empty cache + cab, err := newCachedAddrBook() + require.NoError(t, err) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Read all results + results, err := iter.ReadAllResults(fallbackIter) + require.NoError(t, err) + require.Len(t, results, 1) + + peerRecord := results[0].(*types.PeerRecord) + require.Equal(t, pid, *peerRecord.ID) + require.Len(t, peerRecord.Addrs, 1) + require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) + }) + + t.Run("handles bitswap records", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create source iterator with bitswap record + sourceIter := newMockIter[types.Record](ctx) + go func() { + sourceIter.ch <- iter.Result[types.Record]{Val: &types.BitswapRecord{ + Schema: types.SchemaBitswap, + ID: &pid, + Addrs: nil, + }} + close(sourceIter.ch) + }() + + // Create cached router with cached addresses + mr := &mockRouter{} + cab, err := newCachedAddrBook() + require.NoError(t, err) + cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Read all results + results, err := iter.ReadAllResults(fallbackIter) + require.NoError(t, err) + require.Len(t, results, 1) + + bitswapRecord := results[0].(*types.BitswapRecord) + require.Equal(t, pid, *bitswapRecord.ID) + require.Len(t, bitswapRecord.Addrs, 1) + require.Equal(t, publicAddr.String(), bitswapRecord.Addrs[0].String()) + }) + + t.Run("handles context cancellation", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + // pid := peer.ID("test-peer") + + // Create source iterator that will block + sourceIter := newMockIter[types.Record](ctx) + + // Create cached router + mr := &mockRouter{} + cab, err := newCachedAddrBook() + require.NoError(t, err) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Cancel context before sending any values + cancel() + + // Verify iterator stops + require.False(t, fallbackIter.Next()) + require.NoError(t, fallbackIter.Close()) + }) +} From 2e4d12caa0eff70a07b3896866d0d15ad91d4261 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:03:00 +0100 Subject: [PATCH 29/80] fix: add cancel and simplify --- server_cached_router.go | 82 +++++++++++++++++------------------- 
server_cached_router_test.go | 8 ++-- 2 files changed, 42 insertions(+), 48 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index a2f96f2..09e82ac 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -3,6 +3,7 @@ package main import ( "context" "sync/atomic" + "time" "github.com/ipfs/boxo/routing/http/types" "github.com/ipfs/boxo/routing/http/types/iter" @@ -88,22 +89,24 @@ func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid *peer.ID, addrs var _ iter.ResultIter[types.Record] = &cacheFallbackIter{} -// cacheFallbackIter is a custom iterator that will resolve peers with no addresses from cache and if no cached addresses, will look them up via FindPeers. +// cacheFallbackIter is a wrapper around a results iterator that will resolve peers with no addresses from cache and if no cached addresses, will look them up via FindPeers. type cacheFallbackIter struct { sourceIter iter.ResultIter[types.Record] current iter.Result[types.Record] findPeersResult chan *types.PeerRecord router cachedRouter ctx context.Context + cancel context.CancelFunc ongoingLookups atomic.Int32 } func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cachedRouter, ctx context.Context) *cacheFallbackIter { + ctx, cancel := context.WithCancel(ctx) return &cacheFallbackIter{ - sourceIter: sourceIter, - router: router, - ctx: ctx, - + sourceIter: sourceIter, + router: router, + ctx: ctx, + cancel: cancel, findPeersResult: make(chan *types.PeerRecord, 1), ongoingLookups: atomic.Int32{}, } @@ -121,48 +124,34 @@ func (it *cacheFallbackIter) Next() bool { // load up current val from source iterator and avoid blocking on channel if it.sourceIter.Next() { val := it.sourceIter.Val() - switch val.Val.GetSchema() { - case types.SchemaBitswap: - result, ok := val.Val.(*types.BitswapRecord) - if !ok { - it.current = val - return true // pass these through + handleRecord := func(id *peer.ID, record *types.PeerRecord) bool { + record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, id, record.Addrs) + if record.Addrs != nil { // if we have addrs, return them + it.current = iter.Result[types.Record]{Val: record} + return true } - result.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs) - if result.Addrs != nil { - it.current = iter.Result[types.Record]{Val: result} + // If a record has no addrs, we need to look it up. + go it.dispatchFindPeer(*id) + if it.sourceIter.Next() { // In the meantime, we continue reading from source iterator if we have more results + it.current = it.sourceIter.Val() return true - } else { - // no cached addrs, queue for lookup and try to get the next value from the source iterator - go it.dispatchFindPeer(*result.ID) - if it.sourceIter.Next() { - it.current = it.sourceIter.Val() - return true - } else { - return it.ongoingLookups.Load() > 0 // if the source iterator is exhausted, check if there are any peers left to look up - } } - - case types.SchemaPeer: - result, ok := val.Val.(*types.PeerRecord) - if !ok { - it.current = val - return true // pass these through + return it.ongoingLookups.Load() > 0 // If there are no more results from the source iterator, and no ongoing lookups, we're done. 
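+			// Note: returning true here keeps the iterator alive while a lookup is
+			// in flight; the pending FindPeers result is picked up by the select at
+			// the top of Next() on a subsequent call.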
+ } + switch val.Val.GetSchema() { + case types.SchemaBitswap: + if record, ok := val.Val.(*types.BitswapRecord); ok { + // we convert to peer record to handle uniformly + return handleRecord(record.ID, types.FromBitswapRecord(record)) } - result.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, result.ID, result.Addrs) - if result.Addrs != nil { - it.current = iter.Result[types.Record]{Val: result} - return true - } else { - // no cached addrs, queue for lookup and try to get the next value from the source iterator - go it.dispatchFindPeer(*result.ID) - if it.sourceIter.Next() { - it.current = it.sourceIter.Val() - return true - } else { - return it.ongoingLookups.Load() > 0 // if the source iterator is exhausted, check if there are any peers left to look up - } + case types.SchemaPeer: + if record, ok := val.Val.(*types.PeerRecord); ok { + return handleRecord(record.ID, record) } + default: + // we don't know how to handle this schema, so we just return the record as is + it.current = val + return true } } // source iterator is exhausted, check if there are any peers left to look up @@ -202,8 +191,13 @@ func (it *cacheFallbackIter) Val() iter.Result[types.Record] { } func (it *cacheFallbackIter) Close() error { - it.ctx.Cancel() - close(it.findPeersResult) + it.cancel() + go func() { + for it.ongoingLookups.Load() > 0 { + time.Sleep(time.Millisecond * 10) + } + close(it.findPeersResult) + }() return it.sourceIter.Close() } diff --git a/server_cached_router_test.go b/server_cached_router_test.go index ad52eb4..1d09a59 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -291,10 +291,10 @@ func TestCacheFallbackIter(t *testing.T) { require.NoError(t, err) require.Len(t, results, 1) - bitswapRecord := results[0].(*types.BitswapRecord) - require.Equal(t, pid, *bitswapRecord.ID) - require.Len(t, bitswapRecord.Addrs, 1) - require.Equal(t, publicAddr.String(), bitswapRecord.Addrs[0].String()) + peerRecord := results[0].(*types.PeerRecord) + require.Equal(t, pid, *peerRecord.ID) + require.Len(t, peerRecord.Addrs, 1) + require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) }) t.Run("handles context cancellation", func(t *testing.T) { From 811dce820906a7765265482af726c7dcd9aa0789 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:22:07 +0100 Subject: [PATCH 30/80] fix: move select to Val function --- server_cached_router.go | 91 ++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 09e82ac..60bcf57 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -93,7 +93,7 @@ var _ iter.ResultIter[types.Record] = &cacheFallbackIter{} type cacheFallbackIter struct { sourceIter iter.ResultIter[types.Record] current iter.Result[types.Record] - findPeersResult chan *types.PeerRecord + findPeersResult chan types.PeerRecord router cachedRouter ctx context.Context cancel context.CancelFunc @@ -107,61 +107,52 @@ func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cache router: router, ctx: ctx, cancel: cancel, - findPeersResult: make(chan *types.PeerRecord, 1), + findPeersResult: make(chan types.PeerRecord, 1), ongoingLookups: atomic.Int32{}, } } func (it *cacheFallbackIter) Next() bool { - select { - case <-it.ctx.Done(): - return false - case foundPeer := <-it.findPeersResult: - // read from channel if available - it.current = iter.Result[types.Record]{Val: foundPeer} - 
return true - default: - // load up current val from source iterator and avoid blocking on channel - if it.sourceIter.Next() { - val := it.sourceIter.Val() - handleRecord := func(id *peer.ID, record *types.PeerRecord) bool { - record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, id, record.Addrs) - if record.Addrs != nil { // if we have addrs, return them - it.current = iter.Result[types.Record]{Val: record} - return true - } - // If a record has no addrs, we need to look it up. - go it.dispatchFindPeer(*id) - if it.sourceIter.Next() { // In the meantime, we continue reading from source iterator if we have more results - it.current = it.sourceIter.Val() - return true - } - return it.ongoingLookups.Load() > 0 // If there are no more results from the source iterator, and no ongoing lookups, we're done. + // load up current val from source iterator and avoid blocking on channel + if it.sourceIter.Next() { + val := it.sourceIter.Val() + handleRecord := func(id *peer.ID, record *types.PeerRecord) bool { + record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, id, record.Addrs) + if record.Addrs != nil { // if we have addrs, return them + it.current = iter.Result[types.Record]{Val: record} + return true } - switch val.Val.GetSchema() { - case types.SchemaBitswap: - if record, ok := val.Val.(*types.BitswapRecord); ok { - // we convert to peer record to handle uniformly - return handleRecord(record.ID, types.FromBitswapRecord(record)) - } - case types.SchemaPeer: - if record, ok := val.Val.(*types.PeerRecord); ok { - return handleRecord(record.ID, record) - } - default: - // we don't know how to handle this schema, so we just return the record as is - it.current = val + // If a record has no addrs, we need to look it up. + go it.dispatchFindPeer(*id) + if it.sourceIter.Next() { // In the meantime, we continue reading from source iterator if we have more results + it.current = it.sourceIter.Val() return true } + return it.ongoingLookups.Load() > 0 // If there are no more results from the source iterator, and no ongoing lookups, we're done. 
} - // source iterator is exhausted, check if there are any peers left to look up - if it.ongoingLookups.Load() > 0 { - // if there are any ongoing lookups, return true to keep iterating + switch val.Val.GetSchema() { + case types.SchemaBitswap: + if record, ok := val.Val.(*types.BitswapRecord); ok { + // we convert to peer record to handle uniformly + return handleRecord(record.ID, types.FromBitswapRecord(record)) + } + case types.SchemaPeer: + if record, ok := val.Val.(*types.PeerRecord); ok { + return handleRecord(record.ID, record) + } + default: + // we don't know how to handle this schema, so we just return the record as is + it.current = val return true } - // if there are no ongoing lookups and the source iterator is exhausted, we're done - return false } + // source iterator is exhausted, check if there are any peers left to look up + if it.ongoingLookups.Load() > 0 { + // if there are any ongoing lookups, return true to keep iterating + return true + } + // if there are no ongoing lookups and the source iterator is exhausted, we're done + return false } func (it *cacheFallbackIter) dispatchFindPeer(pid peer.ID) { @@ -180,14 +171,22 @@ func (it *cacheFallbackIter) dispatchFindPeer(pid peer.ID) { return } if len(peers) > 0 { - it.findPeersResult <- peers[0] + it.findPeersResult <- *peers[0] } else { logger.Errorw("no peer was found in cachedFallbackIter", "peer", pid) } } func (it *cacheFallbackIter) Val() iter.Result[types.Record] { - return it.current + select { + case <-it.ctx.Done(): + return iter.Result[types.Record]{Err: it.ctx.Err()} + case foundPeer := <-it.findPeersResult: + // read from channel if available + return iter.Result[types.Record]{Val: &foundPeer} + default: + return it.current + } } func (it *cacheFallbackIter) Close() error { From b4da9cd38a4b51fcf75505045f592d010fd208a0 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:01:03 +0100 Subject: [PATCH 31/80] fix: concurrency bug from the ongoingLookups --- server_cached_router.go | 79 ++++++----- server_cached_router_test.go | 257 +++++++++++++++++++++++++++-------- 2 files changed, 240 insertions(+), 96 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 60bcf57..f7ecf25 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "sync/atomic" "time" @@ -26,6 +27,8 @@ var ( }, []string{addrCacheStateLabel, addrQueryOriginLabel}, ) + + errNoValueAvailable = errors.New("no value available") ) const ( @@ -107,85 +110,87 @@ func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cache router: router, ctx: ctx, cancel: cancel, - findPeersResult: make(chan types.PeerRecord, 1), + findPeersResult: make(chan types.PeerRecord), ongoingLookups: atomic.Int32{}, } } func (it *cacheFallbackIter) Next() bool { - // load up current val from source iterator and avoid blocking on channel + // Try to get the next value from the source iterator first if it.sourceIter.Next() { val := it.sourceIter.Val() handleRecord := func(id *peer.ID, record *types.PeerRecord) bool { record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, id, record.Addrs) - if record.Addrs != nil { // if we have addrs, return them + if record.Addrs != nil { it.current = iter.Result[types.Record]{Val: record} return true } - // If a record has no addrs, we need to look it up. 
- go it.dispatchFindPeer(*id) - if it.sourceIter.Next() { // In the meantime, we continue reading from source iterator if we have more results - it.current = it.sourceIter.Val() - return true - } - return it.ongoingLookups.Load() > 0 // If there are no more results from the source iterator, and no ongoing lookups, we're done. + logger.Infow("no cached addresses found in cacheFallbackIter, dispatching find peers", "peer", id) + // If a record has no addrs, we dispatch a lookup to find addresses + go it.dispatchFindPeer(*record) + // important to increment here since Next() may be called again synchronously + it.ongoingLookups.Add(1) + + return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } + switch val.Val.GetSchema() { case types.SchemaBitswap: if record, ok := val.Val.(*types.BitswapRecord); ok { - // we convert to peer record to handle uniformly return handleRecord(record.ID, types.FromBitswapRecord(record)) } case types.SchemaPeer: if record, ok := val.Val.(*types.PeerRecord); ok { return handleRecord(record.ID, record) } - default: - // we don't know how to handle this schema, so we just return the record as is - it.current = val - return true } + it.current = val // pass through unknown schemas + return true } - // source iterator is exhausted, check if there are any peers left to look up + + // If there are still ongoing lookups, wait for them if it.ongoingLookups.Load() > 0 { - // if there are any ongoing lookups, return true to keep iterating - return true + logger.Infow("waiting for ongoing find peers result") + select { + case result, ok := <-it.findPeersResult: + if ok { + it.current = iter.Result[types.Record]{Val: &result} + return true + } + case <-it.ctx.Done(): + return false + } } - // if there are no ongoing lookups and the source iterator is exhausted, we're done + return false } -func (it *cacheFallbackIter) dispatchFindPeer(pid peer.ID) { - it.ongoingLookups.Add(1) +func (it *cacheFallbackIter) Val() iter.Result[types.Record] { + if it.current.Val != nil || it.current.Err != nil { + return it.current + } + return iter.Result[types.Record]{Err: errNoValueAvailable} +} + +func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { defer it.ongoingLookups.Add(-1) // FindPeers is weird in that it accepts a limit. But we only want one result, ideally from the libp2p router. 
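+	// Every exit path below sends exactly one value on findPeersResult, so the
+	// consumer can match one channel receive per dispatched lookup and
+	// ongoingLookups stays balanced with the sends.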
- peersIt, err := it.router.FindPeers(it.ctx, pid, 1) + peersIt, err := it.router.FindPeers(it.ctx, *record.ID, 1) if err != nil { - logger.Errorw("error looking up peer", "peer", pid, "error", err) + it.findPeersResult <- record // pass back the record with no addrs return } peers, err := iter.ReadAllResults(peersIt) if err != nil { - logger.Errorw("error reading find peers results", "peer", pid, "error", err) + it.findPeersResult <- record // pass back the record with no addrs return } if len(peers) > 0 { + // If we found the peer, pass back it.findPeersResult <- *peers[0] } else { - logger.Errorw("no peer was found in cachedFallbackIter", "peer", pid) - } -} - -func (it *cacheFallbackIter) Val() iter.Result[types.Record] { - select { - case <-it.ctx.Done(): - return iter.Result[types.Record]{Err: it.ctx.Err()} - case foundPeer := <-it.findPeersResult: - // read from channel if available - return iter.Result[types.Record]{Val: &foundPeer} - default: - return it.current + it.findPeersResult <- record // pass back the record with no addrs } } diff --git a/server_cached_router_test.go b/server_cached_router_test.go index 1d09a59..f291c66 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -2,18 +2,52 @@ package main import ( "context" - "errors" "testing" "time" "github.com/ipfs/boxo/routing/http/types" "github.com/ipfs/boxo/routing/http/types/iter" "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/core/routing" "github.com/multiformats/go-multiaddr" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" ) +type mockResultIter[T any] struct { + results []iter.Result[T] + current int + closed bool +} + +func newMockResultIter[T any](results []iter.Result[T]) *mockResultIter[T] { + return &mockResultIter[T]{ + results: results, + current: -1, + closed: false, + } +} + +func (m *mockResultIter[T]) Next() bool { + if m.closed { + return false + } + m.current++ + return m.current < len(m.results) +} + +func (m *mockResultIter[T]) Val() iter.Result[T] { + if m.current < 0 || m.current >= len(m.results) { + panic("Val() called without calling Next() or after Next() returned false") + } + return m.results[m.current] +} + +func (m *mockResultIter[T]) Close() error { + m.closed = true + return nil +} + func TestCachedRouter(t *testing.T) { t.Parallel() @@ -24,7 +58,9 @@ func TestCachedRouter(t *testing.T) { // Create mock router mr := &mockRouter{} - mockIter := newMockIter[types.Record](ctx) + mockIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, + }) mr.On("FindProviders", mock.Anything, c, 10).Return(mockIter, nil) // Create cached address book with test addresses @@ -37,16 +73,6 @@ func TestCachedRouter(t *testing.T) { // Create cached router cr := NewCachedRouter(mr, cab) - // Simulate provider response without addresses - go func() { - mockIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ - Schema: "peer", - ID: &pid, - Addrs: nil, // No addresses in response - }} - close(mockIter.ch) - }() - it, err := cr.FindProviders(ctx, c, 10) require.NoError(t, err) @@ -68,7 +94,7 @@ func TestCachedRouter(t *testing.T) { // Create mock router that returns error mr := &mockRouter{} - mr.On("FindPeers", mock.Anything, pid, 10).Return(nil, errors.New("peer not found")) + mr.On("FindPeers", mock.Anything, pid, 10).Return(nil, routing.ErrNotFound) // Create cached address book with test addresses cab, err := newCachedAddrBook() @@ -145,15 +171,9 @@ func 
TestCacheFallbackIter(t *testing.T) { publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") // Create source iterator with addresses - sourceIter := newMockIter[types.Record](ctx) - go func() { - sourceIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ - Schema: "peer", - ID: &pid, - Addrs: []types.Multiaddr{publicAddr}, - }} - close(sourceIter.ch) - }() + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: []types.Multiaddr{publicAddr}}}, + }) // Create cached router mr := &mockRouter{} @@ -181,15 +201,9 @@ func TestCacheFallbackIter(t *testing.T) { publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") // Create source iterator without addresses - sourceIter := newMockIter[types.Record](ctx) - go func() { - sourceIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ - Schema: "peer", - ID: &pid, - Addrs: nil, - }} - close(sourceIter.ch) - }() + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, + }) // Create cached router with cached addresses mr := &mockRouter{} @@ -218,28 +232,16 @@ func TestCacheFallbackIter(t *testing.T) { publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") // Create source iterator without addresses - sourceIter := newMockIter[types.Record](ctx) - go func() { - sourceIter.ch <- iter.Result[types.Record]{Val: &types.PeerRecord{ - Schema: "peer", - ID: &pid, - Addrs: nil, - }} - close(sourceIter.ch) - }() + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, + }) // Create mock router that returns addresses via FindPeers mr := &mockRouter{} - findPeersIter := newMockIter[*types.PeerRecord](ctx) + findPeersIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: []types.Multiaddr{publicAddr}}}, + }) mr.On("FindPeers", mock.Anything, pid, 1).Return(findPeersIter, nil) - go func() { - findPeersIter.ch <- iter.Result[*types.PeerRecord]{Val: &types.PeerRecord{ - Schema: "peer", - ID: &pid, - Addrs: []types.Multiaddr{publicAddr}, - }} - close(findPeersIter.ch) - }() // Create cached router with empty cache cab, err := newCachedAddrBook() @@ -266,15 +268,9 @@ func TestCacheFallbackIter(t *testing.T) { publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") // Create source iterator with bitswap record - sourceIter := newMockIter[types.Record](ctx) - go func() { - sourceIter.ch <- iter.Result[types.Record]{Val: &types.BitswapRecord{ - Schema: types.SchemaBitswap, - ID: &pid, - Addrs: nil, - }} - close(sourceIter.ch) - }() + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.BitswapRecord{Schema: types.SchemaBitswap, ID: &pid, Addrs: nil}}, + }) // Create cached router with cached addresses mr := &mockRouter{} @@ -299,7 +295,6 @@ func TestCacheFallbackIter(t *testing.T) { t.Run("handles context cancellation", func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) - // pid := peer.ID("test-peer") // Create source iterator that will block sourceIter := newMockIter[types.Record](ctx) @@ -320,4 +315,148 @@ func TestCacheFallbackIter(t *testing.T) { require.False(t, fallbackIter.Next()) require.NoError(t, fallbackIter.Close()) }) + + t.Run("handles multiple Val() calls correctly", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, 
"/ip4/137.21.14.12/tcp/4001") + + // Create source iterator with a single record + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: []types.Multiaddr{publicAddr}}}, + }) + + // Create cached router + mr := &mockRouter{} + cab, err := newCachedAddrBook() + require.NoError(t, err) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // First Next() should succeed + require.True(t, fallbackIter.Next()) + + // Multiple Val() calls should return the same value + val1 := fallbackIter.Val() + val2 := fallbackIter.Val() + require.Equal(t, val1, val2) + + // Value should be correct + peerRecord := val1.Val.(*types.PeerRecord) + require.Equal(t, pid, *peerRecord.ID) + require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) + + // After consuming the only value, Next() should return false + require.False(t, fallbackIter.Next()) + }) + + t.Run("handles context cancellation during lookup", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + pid := peer.ID("test-peer") + + // Create source iterator with record without addresses + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, + }) + + // Create mock router with FindPeers that returns ErrNotFound + mr := &mockRouter{} + mr.On("FindPeers", mock.Anything, pid, 1).Return(nil, routing.ErrNotFound) + + // Create cached router + cab, err := newCachedAddrBook() + require.NoError(t, err) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // First Next() should trigger lookup + require.True(t, fallbackIter.Next()) + + // Cancel context during lookup + cancel() + + // Next() should return false + require.False(t, fallbackIter.Next()) + + // Val() should return the record with no addrs + result := fallbackIter.Val() + require.Equal(t, pid, *result.Val.(*types.PeerRecord).ID) + require.Len(t, result.Val.(*types.PeerRecord).Addrs, 0) + }) + + t.Run("handles FindPeers error gracefully", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + + // Create source iterator without addresses + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, + }) + + // Create mock router that returns error from FindPeers + mr := &mockRouter{} + mr.On("FindPeers", mock.Anything, pid, 1).Return(nil, routing.ErrNotFound) + + // Create cached router with empty cache + cab, err := newCachedAddrBook() + require.NoError(t, err) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Should still get a result, but with no addresses + results, err := iter.ReadAllResults(fallbackIter) + require.NoError(t, err) + require.Len(t, results, 1) + require.Empty(t, results[0].(*types.PeerRecord).Addrs) + }) + + t.Run("handles multiple records with mixed address states", func(t *testing.T) { + ctx := context.Background() + pid1 := peer.ID("test-peer-1") + pid2 := peer.ID("test-peer-2") + pid3 := peer.ID("test-peer-3") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create source iterator with multiple records + sourceIter := newMockResultIter([]iter.Result[types.Record]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid1, Addrs: []types.Multiaddr{publicAddr}}}, // Has 
address + {Val: &types.PeerRecord{Schema: "peer", ID: &pid2, Addrs: nil}}, // No address, will use cache + {Val: &types.PeerRecord{Schema: "peer", ID: &pid3, Addrs: nil}}, // No address, will need FindPeers + }) + + // Create mock router + mr := &mockRouter{} + findPeersIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid3, Addrs: []types.Multiaddr{publicAddr}}}, + }) + mr.On("FindPeers", mock.Anything, pid3, 1).Return(findPeersIter, nil) + + // Create cached router with some cached addresses + cab, err := newCachedAddrBook() + require.NoError(t, err) + cab.addrBook.AddAddrs(pid2, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) + cr := NewCachedRouter(mr, cab) + + // Create fallback iterator + fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) + + // Should get all records with addresses + results, err := iter.ReadAllResults(fallbackIter) + require.NoError(t, err) + require.Len(t, results, 3) + + // Verify each record has the expected addresses + for _, result := range results { + record := result.(*types.PeerRecord) + require.Len(t, record.Addrs, 1) + require.Equal(t, publicAddr.String(), record.Addrs[0].String()) + } + }) + } From d00fcb43978fe10a70876e4646fb23f5ea03b5a5 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:07:32 +0100 Subject: [PATCH 32/80] chore: clean up comments --- cached_addr_book_test.go | 2 +- server_cached_router.go | 10 ++++++---- server_cached_router_test.go | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index 0496ac9..a90682f 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -49,7 +49,7 @@ func TestGetCachedAddrs(t *testing.T) { } func TestBackground(t *testing.T) { - t.Skip("skipping until we have a better way to test background") + t.Skip("skipping until this test is less flaky") ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/server_cached_router.go b/server_cached_router.go index f7ecf25..a9bfa66 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -61,16 +61,17 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return nil, err } - return NewCacheFallbackIter(it, r, ctx), nil // create a new iterator that will use cache if available and fallback to `FindPeer` if no addresses are cached + return NewCacheFallbackIter(it, r, ctx), nil } -// TODO: Open question: should we implement FindPeers to look up cache? If a FindPeer fails to return any peers, the peer is likely long offline. +// TODO: Open question: should FindPeers look up cache? If a FindPeer fails to return any peers, the peer is likely long offline. +// So it's not clear that we gain anything by returning a cached peer that is likely offline. // func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { // return r.router.FindPeers(ctx, pid, limit) // } // withAddrsFromCache returns the best list of addrs for specified [peer.ID]. -// It will consult cache only if the addrs slice passed to it is empty. +// It will consult cache ONLY if the addrs slice passed to it is empty. 
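+// Addresses returned by the underlying router therefore always win; the cache
+// only fills in gaps for peers that came back without any multiaddrs.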
func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid *peer.ID, addrs []types.Multiaddr) []types.Multiaddr { // skip cache if we already have addrs if len(addrs) > 0 { @@ -92,7 +93,6 @@ func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid *peer.ID, addrs var _ iter.ResultIter[types.Record] = &cacheFallbackIter{} -// cacheFallbackIter is a wrapper around a results iterator that will resolve peers with no addresses from cache and if no cached addresses, will look them up via FindPeers. type cacheFallbackIter struct { sourceIter iter.ResultIter[types.Record] current iter.Result[types.Record] @@ -103,6 +103,8 @@ type cacheFallbackIter struct { ongoingLookups atomic.Int32 } +// NewCacheFallbackIter is a wrapper around a results iterator that will resolve peers with no addresses from cache and if no cached addresses, will look them up via FindPeers. +// It's a bit complex because it ensures we continue iterating without blocking on the FindPeers call. func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cachedRouter, ctx context.Context) *cacheFallbackIter { ctx, cancel := context.WithCancel(ctx) return &cacheFallbackIter{ diff --git a/server_cached_router_test.go b/server_cached_router_test.go index f291c66..95a5010 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -20,6 +20,7 @@ type mockResultIter[T any] struct { closed bool } +// Simple mock results iter that doesn't use channels func newMockResultIter[T any](results []iter.Result[T]) *mockResultIter[T] { return &mockResultIter[T]{ results: results, From 6219804546753898ffd6d6772d898d36e557cca1 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:13:20 +0100 Subject: [PATCH 33/80] fix: add lint ignores --- server_cached_router.go | 2 ++ server_cached_router_test.go | 1 + 2 files changed, 3 insertions(+) diff --git a/server_cached_router.go b/server_cached_router.go index a9bfa66..d919ba8 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -137,7 +137,9 @@ func (it *cacheFallbackIter) Next() bool { } switch val.Val.GetSchema() { + //lint:ignore SA1019 // ignore staticcheck case types.SchemaBitswap: + //lint:ignore SA1019 // ignore staticcheck if record, ok := val.Val.(*types.BitswapRecord); ok { return handleRecord(record.ID, types.FromBitswapRecord(record)) } diff --git a/server_cached_router_test.go b/server_cached_router_test.go index 95a5010..045afa4 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -270,6 +270,7 @@ func TestCacheFallbackIter(t *testing.T) { // Create source iterator with bitswap record sourceIter := newMockResultIter([]iter.Result[types.Record]{ + //lint:ignore SA1019 // ignore staticcheck {Val: &types.BitswapRecord{Schema: types.SchemaBitswap, ID: &pid, Addrs: nil}}, }) From 662f0d442a35fcb70fe8affd2fd46917ed1203be Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:52:28 +0100 Subject: [PATCH 34/80] docs: update changelog --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de98046..316e4d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,9 +15,9 @@ The following emojis are used to highlight certain changes: ### Added -- By default caching discovered Peer addresses up to 48h to match [provider record expiration on Amino DHT](https://github.com/libp2p/go-libp2p-kad-dht/blob/v0.28.1/amino/defaults.go#L40-L43). 
Someguy will use cached addresses if the default peerbook from go-libp2p does not have information at hand. This can be controlled via `SOMEGUY_CACHED_ADDR_BOOK=true|false` (enabled by default) -- Added a new `cachedAddrBook` implementation that caches peer addresses and probes them in the background. -- Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs. +- Default caching of peer addresses for 48h to match [provider record expiration on Amino DHT](https://github.com/libp2p/go-libp2p-kad-dht/blob/v0.28.1/amino/defaults.go#L40-L43). Someguy will return cached addresses for peers without multiaddrs in `FindProviders` if there are no addresses for a provider. This can be enabled via `SOMEGUY_CACHED_ADDR_BOOK=true|false` (enabled by default) +- Added a new `cachedAddrBook` implementation that caches peer addresses by subscribing to Identify events and probes those peers in the background. +- Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs. If a Peer is encountered with no cached addresses, `FindPeer` is dispatched in the background. ### Changed From c812cf4bc8463d5e26fbffdb387397ee96c47a2d Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:52:38 +0100 Subject: [PATCH 35/80] fix: increase bucket sizes for probe duration --- cached_addr_book.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 3268fb2..88ca98d 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -28,8 +28,8 @@ var ( Namespace: name, Subsystem: "cached_addr_book", Help: "Duration of peer probing operations in seconds", - // Buckets optimized for expected probe durations from ms to full timeout - Buckets: []float64{0.5, 1, 2, 5, 10, 30, 60, 120}, + // Buckets probe durations from 1s to 5 minutes + Buckets: []float64{1, 2, 5, 10, 30, 60, 120, 300}, }) peerStateSize = promauto.NewGauge(prometheus.GaugeOpts{ @@ -236,8 +236,7 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { logger.Debugf("Probe %d: PeerID: %s, Addrs: %v", i+1, p, addrs) // if connect succeeds and identify runs, the background loop will take care of updating the peer state and cache err := host.Connect(ctx, peer.AddrInfo{ - ID: p, - // TODO: Should we should probe the last connected address or all addresses? 
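+			// Dialing with all cached addresses lets the swarm attempt each of
+			// them and keep whichever connection succeeds.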
+ ID: p, Addrs: addrs, }) if err != nil { From 8646f38964dbc2153d5b0f4b991b5791f10cddfc Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:55:48 +0100 Subject: [PATCH 36/80] chore: remove unused peer state fields save some memory --- cached_addr_book.go | 10 +++------- cached_addr_book_test.go | 2 -- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 88ca98d..96e1dd3 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -65,11 +65,9 @@ const ( ) type peerState struct { - lastConnTime time.Time // last time we successfully connected to this peer - lastConnAddr ma.Multiaddr // last address we connected to this peer on - returnCount int // number of times we've returned this peer from the cache - lastReturnTime time.Time // last time we returned this peer from the cache - connectFailures int // number of times we've failed to connect to this peer + lastConnTime time.Time // last time we successfully connected to this peer + returnCount int // number of times we've returned this peer from the cache + connectFailures int // number of times we've failed to connect to this peer } type cachedAddrBook struct { @@ -140,7 +138,6 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { peerStateSize.Set(float64(len(cab.peers))) } pState.lastConnTime = time.Now() - pState.lastConnAddr = ev.Conn.RemoteMultiaddr() pState.connectFailures = 0 // reset connect failures on successful connection cab.mu.Unlock() @@ -265,7 +262,6 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { peerStateSize.Set(float64(len(cab.peers))) } cab.peers[*p].returnCount++ - cab.peers[*p].lastReturnTime = time.Now() cab.mu.Unlock() var result []types.Multiaddr // convert to local Multiaddr type 🙃 diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index a90682f..f64216a 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -45,7 +45,6 @@ func TestGetCachedAddrs(t *testing.T) { // Verify return count and time were updated assert.Equal(t, 1, cab.peers[testPeer].returnCount) - assert.False(t, cab.peers[testPeer].lastReturnTime.IsZero()) } func TestBackground(t *testing.T) { @@ -122,7 +121,6 @@ func TestBackground(t *testing.T) { peerState, exists := cab.peers[testPeer] assert.True(t, exists) assert.NotNil(t, peerState) - assert.Equal(t, addr, peerState.lastConnAddr) cab.mu.RUnlock() } From 46a74a31f317e4a003d523ce59711044fc53711d Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:09:29 +0100 Subject: [PATCH 37/80] feat: enable caching for FindPeer in cached router --- server_cached_router.go | 18 +++++++--- server_cached_router_test.go | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index d919ba8..314271c 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -64,11 +64,19 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return NewCacheFallbackIter(it, r, ctx), nil } -// TODO: Open question: should FindPeers look up cache? If a FindPeer fails to return any peers, the peer is likely long offline. -// So it's not clear that we gain anything by returning a cached peer that is likely offline. 
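 	// Conceptually, the fallback chain for a peer record sketched above is
 	// (names as used in this series; `record` is the incoming PeerRecord):
 	//
 	//   1. record.Addrs          — use upstream addrs as-is
 	//   2. cachedAddrBook        — withAddrsFromCache fills empty Addrs
 	//   3. FindPeers(pid, 1)     — dispatched async by cacheFallbackIter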
-// func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { -// return r.router.FindPeers(ctx, pid, limit) -// } +// FindPeers uses a simpler approach than FindProviders because we're dealing with a single PeerRecord, and there's +// no point in trying to dispatch an additional FindPeer call. +func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { + it, err := r.router.FindPeers(ctx, pid, limit) + if err != nil { + return nil, err + } + + return iter.Map(it, func(record iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { + record.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, record.Val.ID, record.Val.Addrs) + return record + }), nil +} // withAddrsFromCache returns the best list of addrs for specified [peer.ID]. // It will consult cache ONLY if the addrs slice passed to it is empty. diff --git a/server_cached_router_test.go b/server_cached_router_test.go index 045afa4..dab6b68 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -161,6 +161,71 @@ func TestCachedRouter(t *testing.T) { require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) }) + t.Run("FindPeers handles records with and without addresses", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create mock router that returns a record without addresses + mr := &mockRouter{} + mockIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, + }) + mr.On("FindPeers", mock.Anything, pid, 10).Return(mockIter, nil) + + // Create cached address book with test addresses + cab, err := newCachedAddrBook() + require.NoError(t, err) + cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) + + // Create cached router + cr := NewCachedRouter(mr, cab) + + it, err := cr.FindPeers(ctx, pid, 10) + require.NoError(t, err) + + results, err := iter.ReadAllResults(it) + require.NoError(t, err) + require.Len(t, results, 1) + + // Verify cached addresses were added to the record + require.Equal(t, pid, *results[0].ID) + require.Len(t, results[0].Addrs, 1) + require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) + }) + + t.Run("FindPeers returns same addresses as underlying router", func(t *testing.T) { + ctx := context.Background() + pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") + + // Create mock router that returns a record with addresses + mr := &mockRouter{} + mockIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: []types.Multiaddr{publicAddr}}}, + }) + mr.On("FindPeers", mock.Anything, pid, 10).Return(mockIter, nil) + + // Create cached address book without any addresses + cab, err := newCachedAddrBook() + require.NoError(t, err) + + // Create cached router + cr := NewCachedRouter(mr, cab) + + it, err := cr.FindPeers(ctx, pid, 10) + require.NoError(t, err) + + results, err := iter.ReadAllResults(it) + require.NoError(t, err) + require.Len(t, results, 1) + + // Verify the addresses returned are the same as those from the underlying router + require.Equal(t, pid, *results[0].ID) + require.Len(t, results[0].Addrs, 1) + require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) + }) + } func TestCacheFallbackIter(t *testing.T) { From 
d9601e4b9fc59dee6666710ff9a8179170ceaa3b Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:24:38 +0100 Subject: [PATCH 38/80] fix: handle peer not found case --- server_cached_router.go | 18 ++++++++++ server_cached_router_test.go | 66 ------------------------------------ 2 files changed, 18 insertions(+), 66 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 314271c..bcc8523 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -10,6 +10,7 @@ import ( "github.com/ipfs/boxo/routing/http/types/iter" "github.com/ipfs/go-cid" "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/core/routing" ma "github.com/multiformats/go-multiaddr" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -68,10 +69,27 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) // no point in trying to dispatch an additional FindPeer call. func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (iter.ResultIter[*types.PeerRecord], error) { it, err := r.router.FindPeers(ctx, pid, limit) + + if err == routing.ErrNotFound { + // If we didn't find the peer, try the cache + cachedAddrs := r.withAddrsFromCache(addrQueryOriginPeers, &pid, nil) + if len(cachedAddrs) > 0 { + return iter.ToResultIter(iter.FromSlice([]*types.PeerRecord{ + { + Schema: types.SchemaPeer, + ID: &pid, + Addrs: cachedAddrs, + }, + })), nil + } + return nil, routing.ErrNotFound + } + if err != nil { return nil, err } + // If the peer was found, there is likely no point in looking up the cache (because kad-dht will connect to it as part of FindPeers), but we'll do it just in case. return iter.Map(it, func(record iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { record.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, record.Val.ID, record.Val.Addrs) return record diff --git a/server_cached_router_test.go b/server_cached_router_test.go index dab6b68..232e164 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -89,7 +89,6 @@ func TestCachedRouter(t *testing.T) { }) t.Run("FindPeers with cache hit", func(t *testing.T) { - t.Skip("skipping until we decide if FindPeers should look up cache") ctx := context.Background() pid := peer.ID("test-peer") @@ -161,71 +160,6 @@ func TestCachedRouter(t *testing.T) { require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) }) - t.Run("FindPeers handles records with and without addresses", func(t *testing.T) { - ctx := context.Background() - pid := peer.ID("test-peer") - publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") - - // Create mock router that returns a record without addresses - mr := &mockRouter{} - mockIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ - {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, - }) - mr.On("FindPeers", mock.Anything, pid, 10).Return(mockIter, nil) - - // Create cached address book with test addresses - cab, err := newCachedAddrBook() - require.NoError(t, err) - cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) - - // Create cached router - cr := NewCachedRouter(mr, cab) - - it, err := cr.FindPeers(ctx, pid, 10) - require.NoError(t, err) - - results, err := iter.ReadAllResults(it) - require.NoError(t, err) - require.Len(t, results, 1) - - // Verify cached addresses were added to the record - require.Equal(t, pid, *results[0].ID) - require.Len(t, 
results[0].Addrs, 1) - require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) - }) - - t.Run("FindPeers returns same addresses as underlying router", func(t *testing.T) { - ctx := context.Background() - pid := peer.ID("test-peer") - publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") - - // Create mock router that returns a record with addresses - mr := &mockRouter{} - mockIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ - {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: []types.Multiaddr{publicAddr}}}, - }) - mr.On("FindPeers", mock.Anything, pid, 10).Return(mockIter, nil) - - // Create cached address book without any addresses - cab, err := newCachedAddrBook() - require.NoError(t, err) - - // Create cached router - cr := NewCachedRouter(mr, cab) - - it, err := cr.FindPeers(ctx, pid, 10) - require.NoError(t, err) - - results, err := iter.ReadAllResults(it) - require.NoError(t, err) - require.Len(t, results, 1) - - // Verify the addresses returned are the same as those from the underlying router - require.Equal(t, pid, *results[0].ID) - require.Len(t, results[0].Addrs, 1) - require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) - }) - } func TestCacheFallbackIter(t *testing.T) { From 986b010528bde2bae0cc825180e29902c27a8a1e Mon Sep 17 00:00:00 2001 From: Daniel Norman <1992255+2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 10:50:40 +0100 Subject: [PATCH 39/80] Apply suggestions from code review Co-authored-by: Marcin Rataj --- CHANGELOG.md | 4 ++-- server_cached_router.go | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 316e4d6..0c09a26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,8 @@ The following emojis are used to highlight certain changes: ### Added - Default caching of peer addresses for 48h to match [provider record expiration on Amino DHT](https://github.com/libp2p/go-libp2p-kad-dht/blob/v0.28.1/amino/defaults.go#L40-L43). Someguy will return cached addresses for peers without multiaddrs in `FindProviders` if there are no addresses for a provider. This can be enabled via `SOMEGUY_CACHED_ADDR_BOOK=true|false` (enabled by default) -- Added a new `cachedAddrBook` implementation that caches peer addresses by subscribing to Identify events and probes those peers in the background. -- Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs. If a Peer is encountered with no cached addresses, `FindPeer` is dispatched in the background. + - Added a new `cachedAddrBook` implementation that caches peer addresses by subscribing to Identify events and probes those peers in the background. + - Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs. If a Peer is encountered with no cached addresses, `FindPeer` is dispatched in the background. 
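
Taken together, the new pieces compose roughly as follows (a sketch only; `h`, `underlying` and `c` stand in for the libp2p host, the preexisting composable router and a target CID, all of which are wired up elsewhere in server.go):

```go
// Sketch: enable the cached address book and wrap the existing router.
cab, err := newCachedAddrBook()
if err != nil {
	return err
}
go cab.background(ctx, h) // caches addrs from identify events, probes peers

router := NewCachedRouter(underlying, cab)

// Provider records that arrive without multiaddrs are now backfilled from
// the cache first, with a FindPeers lookup dispatched as a last resort.
recordsIter, err := router.FindProviders(ctx, c, 20)
```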
### Changed diff --git a/server_cached_router.go b/server_cached_router.go index bcc8523..615f476 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -163,12 +163,6 @@ func (it *cacheFallbackIter) Next() bool { } switch val.Val.GetSchema() { - //lint:ignore SA1019 // ignore staticcheck - case types.SchemaBitswap: - //lint:ignore SA1019 // ignore staticcheck - if record, ok := val.Val.(*types.BitswapRecord); ok { - return handleRecord(record.ID, types.FromBitswapRecord(record)) - } case types.SchemaPeer: if record, ok := val.Val.(*types.PeerRecord); ok { return handleRecord(record.ID, record) From ecd075795fe6cd337e9b34f9478173207291c248 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 10:51:15 +0100 Subject: [PATCH 40/80] fix: wait longer during cleanup function --- server_cached_router.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server_cached_router.go b/server_cached_router.go index 615f476..40b9aff 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -222,7 +222,7 @@ func (it *cacheFallbackIter) Close() error { it.cancel() go func() { for it.ongoingLookups.Load() > 0 { - time.Sleep(time.Millisecond * 10) + time.Sleep(time.Millisecond * 100) } close(it.findPeersResult) }() From a0443d050e96a08194f6c5f35146269f63c5887d Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 10:54:34 +0100 Subject: [PATCH 41/80] test: remove bitswap record test --- server_cached_router_test.go | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/server_cached_router_test.go b/server_cached_router_test.go index 232e164..3119e3b 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -262,38 +262,6 @@ func TestCacheFallbackIter(t *testing.T) { require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) }) - t.Run("handles bitswap records", func(t *testing.T) { - ctx := context.Background() - pid := peer.ID("test-peer") - publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") - - // Create source iterator with bitswap record - sourceIter := newMockResultIter([]iter.Result[types.Record]{ - //lint:ignore SA1019 // ignore staticcheck - {Val: &types.BitswapRecord{Schema: types.SchemaBitswap, ID: &pid, Addrs: nil}}, - }) - - // Create cached router with cached addresses - mr := &mockRouter{} - cab, err := newCachedAddrBook() - require.NoError(t, err) - cab.addrBook.AddAddrs(pid, []multiaddr.Multiaddr{publicAddr.Multiaddr}, time.Hour) - cr := NewCachedRouter(mr, cab) - - // Create fallback iterator - fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) - - // Read all results - results, err := iter.ReadAllResults(fallbackIter) - require.NoError(t, err) - require.Len(t, results, 1) - - peerRecord := results[0].(*types.PeerRecord) - require.Equal(t, pid, *peerRecord.ID) - require.Len(t, peerRecord.Addrs, 1) - require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) - }) - t.Run("handles context cancellation", func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) From 22aacd7b0a5478af7e198cea866c4b9960e0a30d Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 10:58:28 +0100 Subject: [PATCH 42/80] refactor: extract connectedness checks to a func --- cached_addr_book.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 96e1dd3..8c4a225 100644 --- 
a/cached_addr_book.go +++ b/cached_addr_book.go @@ -146,7 +146,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { cab, ok := peerstore.GetCertifiedAddrBook(cab.addrBook) if ok { ttl := RecentlyConnectedAddrTTL - if host.Network().Connectedness(ev.Peer) == network.Connected || host.Network().Connectedness(ev.Peer) == network.Limited { + if hasValidConnectedness(host.Network().Connectedness(ev.Peer)) { ttl = ConnectedAddrTTL } _, err := cab.ConsumePeerRecord(ev.SignedPeerRecord, ttl) @@ -161,7 +161,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { } case event.EvtPeerConnectednessChanged: // If the peer is not connected or limited, we update the TTL - if ev.Connectedness != network.Connected && ev.Connectedness != network.Limited { + if !hasValidConnectedness(ev.Connectedness) { cab.addrBook.UpdateAddrs(ev.Peer, ConnectedAddrTTL, RecentlyConnectedAddrTTL) } } @@ -270,3 +270,7 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { } return result } + +func hasValidConnectedness(connectedness network.Connectedness) bool { + return connectedness == network.Connected || connectedness == network.Limited +} From fe372ac7eb19a47510754e307843169c9c7dc26d Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:10:58 +0100 Subject: [PATCH 43/80] fix: set ttl for both signed and unsigned addrs --- cached_addr_book.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 8c4a225..dd90013 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -141,14 +141,11 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { pState.connectFailures = 0 // reset connect failures on successful connection cab.mu.Unlock() + ttl := getTTL(host.Network().Connectedness(ev.Peer)) if ev.SignedPeerRecord != nil { logger.Debug("Caching signed peer record") cab, ok := peerstore.GetCertifiedAddrBook(cab.addrBook) if ok { - ttl := RecentlyConnectedAddrTTL - if hasValidConnectedness(host.Network().Connectedness(ev.Peer)) { - ttl = ConnectedAddrTTL - } _, err := cab.ConsumePeerRecord(ev.SignedPeerRecord, ttl) if err != nil { logger.Warnf("failed to consume signed peer record: %v", err) @@ -157,7 +154,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { } else { logger.Debug("No signed peer record, caching listen addresses") // We don't have a signed peer record, so we use the listen addresses - cab.addrBook.AddAddrs(ev.Peer, ev.ListenAddrs, ConnectedAddrTTL) + cab.addrBook.AddAddrs(ev.Peer, ev.ListenAddrs, ttl) } case event.EvtPeerConnectednessChanged: // If the peer is not connected or limited, we update the TTL @@ -274,3 +271,10 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { func hasValidConnectedness(connectedness network.Connectedness) bool { return connectedness == network.Connected || connectedness == network.Limited } + +func getTTL(connectedness network.Connectedness) time.Duration { + if hasValidConnectedness(connectedness) { + return ConnectedAddrTTL + } + return RecentlyConnectedAddrTTL +} From 03a4078b817bd3c401d8912ac9535dc3aa1d658f Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:13:17 +0100 Subject: [PATCH 44/80] fix: prevent race condition --- server_cached_router.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server_cached_router.go 
b/server_cached_router.go index 40b9aff..187a05f 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -154,10 +154,10 @@ func (it *cacheFallbackIter) Next() bool { return true } logger.Infow("no cached addresses found in cacheFallbackIter, dispatching find peers", "peer", id) + + it.ongoingLookups.Add(1) // important to increment here since Next() may be called again synchronously // If a record has no addrs, we dispatch a lookup to find addresses go it.dispatchFindPeer(*record) - // important to increment here since Next() may be called again synchronously - it.ongoingLookups.Add(1) return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } From 84393fd34ff92e232266ae64243d9f85d896e218 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:04:16 +0100 Subject: [PATCH 45/80] feat: use 2q-lru cache for peer state 2q-lru tracks both frequently and recently used entries separately --- cached_addr_book.go | 85 +++++++++++++++++++++++----------------- cached_addr_book_test.go | 64 ++++++++++++++++++++---------- go.mod | 1 + go.sum | 2 + 4 files changed, 97 insertions(+), 55 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index dd90013..e695e36 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -7,6 +7,7 @@ import ( "sync/atomic" "time" + lru "github.com/hashicorp/golang-lru/v2" "github.com/ipfs/boxo/routing/http/types" "github.com/libp2p/go-libp2p-kad-dht/amino" "github.com/libp2p/go-libp2p/core/event" @@ -62,18 +63,22 @@ const ( // How long to wait for a connect in a probe to complete. // The worst case is a peer behind a relay, so we use the relay connect timeout. ConnectTimeout = relay.ConnectTimeout + + // How many peers to cache in the peer state cache + // 100_000 is also the default number of signed peer records cached by the memory address book. 
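+	// A 2Q LRU of this size tracks recently and frequently used peers in
+	// separate queues, so a burst of one-off peers cannot evict the hot set.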
+ PeerCacheSize = 100_000 ) type peerState struct { - lastConnTime time.Time // last time we successfully connected to this peer - returnCount int // number of times we've returned this peer from the cache - connectFailures int // number of times we've failed to connect to this peer + lastConnTime time.Time // last time we successfully connected to this peer + lastFailedConnTime time.Time // last time we failed to find or connect to this peer + connectFailures int // number of times we've failed to connect to this peer + returnCount int // number of times we've returned this peer from the cache //TODO: remove } type cachedAddrBook struct { - addrBook peerstore.AddrBook - peers map[peer.ID]*peerState - mu sync.RWMutex // Add mutex for thread safety + addrBook peerstore.AddrBook // memory address book + peerCache *lru.TwoQueueCache[peer.ID, peerState] // LRU cache with additional metadata about peer isProbing atomic.Bool allowPrivateIPs bool // for testing } @@ -88,9 +93,14 @@ func WithAllowPrivateIPs() AddrBookOption { } func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { + peerCache, err := lru.New2Q[peer.ID, peerState](PeerCacheSize) + if err != nil { + return nil, err + } + cab := &cachedAddrBook{ - peers: make(map[peer.ID]*peerState), - addrBook: pstoremem.NewAddrBook(), + peerCache: peerCache, + addrBook: pstoremem.NewAddrBook(), } for _, opt := range opts { @@ -130,16 +140,15 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { case ev := <-sub.Out(): switch ev := ev.(type) { case event.EvtPeerIdentificationCompleted: - cab.mu.Lock() - pState, exists := cab.peers[ev.Peer] + pState, exists := cab.peerCache.Get(ev.Peer) if !exists { - pState = &peerState{} - cab.peers[ev.Peer] = pState - peerStateSize.Set(float64(len(cab.peers))) + pState = peerState{} } pState.lastConnTime = time.Now() - pState.connectFailures = 0 // reset connect failures on successful connection - cab.mu.Unlock() + pState.lastFailedConnTime = time.Time{} // reset failed connection time + pState.connectFailures = 0 // reset connect failures on successful connection + cab.peerCache.Add(ev.Peer, pState) + peerStateSize.Set(float64(cab.peerCache.Len())) // update metric ttl := getTTL(host.Network().Connectedness(ev.Peer)) if ev.SignedPeerRecord != nil { @@ -170,7 +179,6 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { logger.Debug("Starting to probe peers") go cab.probePeers(ctx, host) } - // TODO: Add some cleanup logic to remove peers that haven't been returned from the cache in a while or have failed to connect too many times } } @@ -191,22 +199,25 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { semaphore := make(chan struct{}, MaxConcurrentProbes) for i, p := range cab.addrBook.PeersWithAddrs() { - connectedness := host.Network().Connectedness(p) - if connectedness == network.Connected || connectedness == network.Limited { + if hasValidConnectedness(host.Network().Connectedness(p)) { continue // don't probe connected peers } - cab.mu.RLock() - if time.Since(cab.peers[p].lastConnTime) < PeerProbeThreshold { - cab.mu.RUnlock() + pState, exists := cab.peerCache.Get(p) + if !exists { + logger.Errorf("peer %s not in peer cache but found in cached address book. This should not happen. ", p) + continue // TODO: maybe we should still probe them? 
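Aside: probePeers above bounds its dial fan-out with a buffered channel used as a counting semaphore. A runnable sketch of the pattern, with the dial replaced by a sleep (limits and names are illustrative). Acquiring the slot before spawning keeps the goroutine count itself bounded, which a later patch in this series ("fix: prevent race conditions") also enforces:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const maxConcurrent = 3
	semaphore := make(chan struct{}, maxConcurrent)
	var wg sync.WaitGroup

	for i := 0; i < 10; i++ {
		wg.Add(1)
		semaphore <- struct{}{} // blocks while maxConcurrent probes are in flight
		go func(id int) {
			defer func() {
				<-semaphore // release the slot
				wg.Done()
			}()
			time.Sleep(50 * time.Millisecond) // stand-in for a connect attempt
			fmt.Println("probed peer", id)
		}(i)
	}
	wg.Wait()
}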
+ } + + if time.Since(pState.lastConnTime) < PeerProbeThreshold { continue // don't probe peers below the probe threshold } - if cab.peers[p].connectFailures > MaxConnectFailures { + + if pState.connectFailures > MaxConnectFailures { + // TODO: maybe implement a backoff strategy instead of clearing the peer's addresses cab.addrBook.ClearAddrs(p) // clear the peer's addresses - cab.mu.RUnlock() - continue // don't probe this peer + continue // don't probe this peer } - cab.mu.RUnlock() addrs := cab.addrBook.Addrs(p) if !cab.allowPrivateIPs { @@ -235,9 +246,14 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { }) if err != nil { logger.Debugf("failed to connect to peer %s: %v", p, err) - cab.mu.Lock() // Lock before accessing shared state - cab.peers[p].connectFailures++ - cab.mu.Unlock() + pState, exists := cab.peerCache.Get(p) + if !exists { + logger.Errorf("peer %s not in peer cache but found in cached address book. This should not happen. ", p) + pState = peerState{} + } + pState.connectFailures++ + pState.lastFailedConnTime = time.Now() + cab.peerCache.Add(p, pState) } }() } @@ -252,14 +268,13 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { return nil } - cab.mu.Lock() - // Initialize peer state if it doesn't exist - if _, exists := cab.peers[*p]; !exists { - cab.peers[*p] = &peerState{} - peerStateSize.Set(float64(len(cab.peers))) + pState, exists := cab.peerCache.Get(*p) + if !exists { + pState = peerState{} } - cab.peers[*p].returnCount++ - cab.mu.Unlock() + pState.returnCount++ + cab.peerCache.Add(*p, pState) + peerStateSize.Set(float64(cab.peerCache.Len())) var result []types.Multiaddr // convert to local Multiaddr type 🙃 for _, addr := range cachedAddrs { diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index f64216a..37ed1d0 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -2,11 +2,13 @@ package main import ( "context" + "fmt" "testing" "time" "github.com/libp2p/go-libp2p" "github.com/libp2p/go-libp2p/core/event" + "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" ma "github.com/multiformats/go-multiaddr" @@ -19,7 +21,7 @@ func TestCachedAddrBook(t *testing.T) { cab, err := newCachedAddrBook(WithAllowPrivateIPs()) require.NoError(t, err) require.NotNil(t, cab) - require.NotNil(t, cab.peers) + require.NotNil(t, cab.peerCache) require.NotNil(t, cab.addrBook) } @@ -37,14 +39,16 @@ func TestGetCachedAddrs(t *testing.T) { cab.addrBook.AddAddrs(testPeer, []ma.Multiaddr{addr1, addr2}, time.Hour) // Initialize peer state - cab.peers[testPeer] = &peerState{} + cab.peerCache.Add(testPeer, peerState{}) // Test getting addresses addrs := cab.GetCachedAddrs(&testPeer) assert.Len(t, addrs, 2) - // Verify return count and time were updated - assert.Equal(t, 1, cab.peers[testPeer].returnCount) + // Verify return count was updated + pState, exists := cab.peerCache.Get(testPeer) + assert.True(t, exists) + assert.Equal(t, 1, pState.returnCount) } func TestBackground(t *testing.T) { @@ -98,9 +102,7 @@ func TestBackground(t *testing.T) { go func() { defer close(done) for { - cab.mu.RLock() - _, exists := cab.peers[testPeer] - cab.mu.RUnlock() + _, exists := cab.peerCache.Get(testPeer) if exists { return } @@ -117,11 +119,9 @@ func TestBackground(t *testing.T) { } // Verify peer was added - cab.mu.RLock() - peerState, exists := cab.peers[testPeer] + pState, exists := cab.peerCache.Get(testPeer) assert.True(t, exists) - 
assert.NotNil(t, peerState) - cab.mu.RUnlock() + assert.NotNil(t, pState) } func TestProbePeers(t *testing.T) { @@ -129,9 +129,7 @@ func TestProbePeers(t *testing.T) { defer cancel() // Create a test libp2p host - h, err := libp2p.New() - require.NoError(t, err) - defer h.Close() + mockHost := &mockHost{} cab, err := newCachedAddrBook(WithAllowPrivateIPs()) require.NoError(t, err) @@ -142,15 +140,17 @@ func TestProbePeers(t *testing.T) { cab.addrBook.AddAddrs(testPeer, []ma.Multiaddr{addr}, time.Hour) // Initialize peer state with old connection time - cab.peers[testPeer] = &peerState{ + cab.peerCache.Add(testPeer, peerState{ lastConnTime: time.Now().Add(-2 * PeerProbeThreshold), - } + }) - // Run probe - cab.probePeers(ctx, h) + // Run probe with mockHost instead of h + cab.probePeers(ctx, mockHost) - // Verify connect failures increased (since connection will fail in test) - assert.Equal(t, 1, cab.peers[testPeer].connectFailures) + // Verify connect failures increased + pState, exists := cab.peerCache.Get(testPeer) + assert.True(t, exists) + assert.Equal(t, 1, pState.connectFailures) } // Mock connection for testing @@ -162,3 +162,27 @@ type mockConnection struct { func (mc *mockConnection) RemoteMultiaddr() ma.Multiaddr { return mc.remoteAddr } + +type mockHost struct { + host.Host +} + +func (mh *mockHost) Connect(ctx context.Context, pi peer.AddrInfo) error { + // Simulate connection failure + return fmt.Errorf("mock connection failure") +} + +// Add Network method to mockHost +func (mh *mockHost) Network() network.Network { + return &mockNetwork{} +} + +// Add mockNetwork implementation +type mockNetwork struct { + network.Network +} + +func (mn *mockNetwork) Connectedness(p peer.ID) network.Connectedness { + // Simulate not connected state + return network.NotConnected +} diff --git a/go.mod b/go.mod index 5510198..6f429af 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/coreos/go-systemd/v22 v22.5.0 github.com/dustin/go-humanize v1.0.1 github.com/felixge/httpsnoop v1.0.4 + github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/ipfs/boxo v0.24.4-0.20241119003055-e38f236348d6 github.com/ipfs/go-cid v0.4.1 github.com/ipfs/go-log/v2 v2.5.1 diff --git a/go.sum b/go.sum index 261b798..5f51037 100644 --- a/go.sum +++ b/go.sum @@ -186,6 +186,8 @@ github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/huin/goupnp v1.3.0 h1:UvLUlWDNpoUdYzb2TCn+MuTWtcjXKSza2n6CBdQ0xXc= From d466dc7c9902cc97921c096145cfe2fce295a1bf Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:09:04 +0100 Subject: [PATCH 46/80] chore: remove return count we don't need the return count with the 2q-lru cache and the peerAddrLookups metric --- cached_addr_book.go | 9 --------- cached_addr_book_test.go | 26 -------------------------- 2 files changed, 35 deletions(-) diff --git a/cached_addr_book.go 
b/cached_addr_book.go index e695e36..a04d904 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -73,7 +73,6 @@ type peerState struct { lastConnTime time.Time // last time we successfully connected to this peer lastFailedConnTime time.Time // last time we failed to find or connect to this peer connectFailures int // number of times we've failed to connect to this peer - returnCount int // number of times we've returned this peer from the cache //TODO: remove } type cachedAddrBook struct { @@ -268,14 +267,6 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { return nil } - pState, exists := cab.peerCache.Get(*p) - if !exists { - pState = peerState{} - } - pState.returnCount++ - cab.peerCache.Add(*p, pState) - peerStateSize.Set(float64(cab.peerCache.Len())) - var result []types.Multiaddr // convert to local Multiaddr type 🙃 for _, addr := range cachedAddrs { result = append(result, types.Multiaddr{Multiaddr: addr}) diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index 37ed1d0..2cd8cfd 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -25,32 +25,6 @@ func TestCachedAddrBook(t *testing.T) { require.NotNil(t, cab.addrBook) } -func TestGetCachedAddrs(t *testing.T) { - cab, err := newCachedAddrBook(WithAllowPrivateIPs()) - require.NoError(t, err) - - // Create a test peer with new PeerID - testPeer, err := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") - require.NoError(t, err) - - // Add test addresses - addr1, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") - addr2, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/5678") - cab.addrBook.AddAddrs(testPeer, []ma.Multiaddr{addr1, addr2}, time.Hour) - - // Initialize peer state - cab.peerCache.Add(testPeer, peerState{}) - - // Test getting addresses - addrs := cab.GetCachedAddrs(&testPeer) - assert.Len(t, addrs, 2) - - // Verify return count was updated - pState, exists := cab.peerCache.Get(testPeer) - assert.True(t, exists) - assert.Equal(t, 1, pState.returnCount) -} - func TestBackground(t *testing.T) { t.Skip("skipping until this test is less flaky") ctx, cancel := context.WithCancel(context.Background()) From 8078cb51420e31d4597dbbd19ec739fe7cef438b Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:31:46 +0100 Subject: [PATCH 47/80] test: improve reliability of tests mock the libp2p host and use a real event bus --- cached_addr_book_test.go | 73 ++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index 2cd8cfd..894094d 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -6,11 +6,11 @@ import ( "testing" "time" - "github.com/libp2p/go-libp2p" "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/p2p/host/eventbus" ma "github.com/multiformats/go-multiaddr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -30,40 +30,34 @@ func TestBackground(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Create a test libp2p host - h, err := libp2p.New(libp2p.ListenAddrStrings("/ip4/127.0.0.1/tcp/0")) - require.NoError(t, err) - defer h.Close() + // Create a real event bus + eventBus := eventbus.NewBus() - em, err := h.EventBus().Emitter(&event.EvtPeerIdentificationCompleted{}) + emitter, err := 
eventBus.Emitter(new(event.EvtPeerIdentificationCompleted)) require.NoError(t, err) - defer em.Close() + + // Use a mock host with a real event bus + mockHost := &mockHost{ + eventBus: eventBus, + } cab, err := newCachedAddrBook(WithAllowPrivateIPs()) require.NoError(t, err) - // Create a channel to signal when background processing is ready - ready := make(chan struct{}) - - // Start background process with ready signal - go func() { - // Signal ready before starting background process - close(ready) - cab.background(ctx, h) - }() - - // Wait for background process to start - <-ready + ctx, cancel = context.WithTimeout(ctx, time.Second*5) + defer cancel() + go cab.background(ctx, mockHost) - // Create a test peer with new PeerID + // Create a test peer testPeer, err := peer.Decode("12D3KooWCZ67sU8oCvKd82Y6c9NgpqgoZYuZEUcg4upHCjK3n1aj") require.NoError(t, err) - // Simulate peer identification event - addr, _ := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") + // Create test address + addr, err := ma.NewMultiaddr("/ip4/127.0.0.1/tcp/1234") + require.NoError(t, err) - // Emit the event after setting up the waiter - err = em.Emit(event.EvtPeerIdentificationCompleted{ + // Emit a real peer identification event + err = emitter.Emit(event.EvtPeerIdentificationCompleted{ Peer: testPeer, Conn: &mockConnection{ remoteAddr: addr, @@ -72,27 +66,13 @@ func TestBackground(t *testing.T) { }) require.NoError(t, err) - done := make(chan struct{}) - go func() { - defer close(done) - for { - _, exists := cab.peerCache.Get(testPeer) - if exists { - return - } - time.Sleep(30 * time.Millisecond) - } - }() - - // Wait for processing with timeout - select { - case <-done: - // Success case - continue to verification - case <-time.After(time.Second * 5): - t.Fatal("timeout waiting for peer to be added to peer state") - } + // Wait for the peer to be added to the cache + require.Eventually(t, func() bool { + _, exists := cab.peerCache.Get(testPeer) + return exists + }, time.Second*5, time.Millisecond*100, "peer was not added to cache") - // Verify peer was added + // Verify peer state pState, exists := cab.peerCache.Get(testPeer) assert.True(t, exists) assert.NotNil(t, pState) @@ -139,6 +119,7 @@ func (mc *mockConnection) RemoteMultiaddr() ma.Multiaddr { type mockHost struct { host.Host + eventBus event.Bus } func (mh *mockHost) Connect(ctx context.Context, pi peer.AddrInfo) error { @@ -160,3 +141,7 @@ func (mn *mockNetwork) Connectedness(p peer.ID) network.Connectedness { // Simulate not connected state return network.NotConnected } + +func (mh *mockHost) EventBus() event.Bus { + return mh.eventBus +} From 7decf6c45e15e873400a37dca313d2514eefac29 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:41:23 +0100 Subject: [PATCH 48/80] fix: record failed connections --- cached_addr_book.go | 25 +++++++++++++++---------- server_cached_router.go | 11 +++++++---- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index a04d904..7688c7f 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -245,14 +245,7 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { }) if err != nil { logger.Debugf("failed to connect to peer %s: %v", p, err) - pState, exists := cab.peerCache.Get(p) - if !exists { - logger.Errorf("peer %s not in peer cache but found in cached address book. This should not happen. 
", p) - pState = peerState{} - } - pState.connectFailures++ - pState.lastFailedConnTime = time.Now() - cab.peerCache.Add(p, pState) + cab.RecordFailedConnection(p) } }() } @@ -260,8 +253,8 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { } // Returns the cached addresses for a peer, incrementing the return count -func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { - cachedAddrs := cab.addrBook.Addrs(*p) +func (cab *cachedAddrBook) GetCachedAddrs(p peer.ID) []types.Multiaddr { + cachedAddrs := cab.addrBook.Addrs(p) if len(cachedAddrs) == 0 { return nil @@ -274,6 +267,18 @@ func (cab *cachedAddrBook) GetCachedAddrs(p *peer.ID) []types.Multiaddr { return result } +// Update the peer cache with information about a failed connection +// This should be called when a connection attempt to a peer fails +func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { + pState, exists := cab.peerCache.Get(p) + if !exists { + pState = peerState{} + } + pState.lastFailedConnTime = time.Now() + pState.connectFailures++ + cab.peerCache.Add(p, pState) +} + func hasValidConnectedness(connectedness network.Connectedness) bool { return connectedness == network.Connected || connectedness == network.Limited } diff --git a/server_cached_router.go b/server_cached_router.go index 187a05f..cb388c2 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -71,8 +71,10 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it it, err := r.router.FindPeers(ctx, pid, limit) if err == routing.ErrNotFound { + // ErrNotFound will be returned if either dialing the peer failed or the peer was not found + r.cachedAddrBook.RecordFailedConnection(pid) // If we didn't find the peer, try the cache - cachedAddrs := r.withAddrsFromCache(addrQueryOriginPeers, &pid, nil) + cachedAddrs := r.withAddrsFromCache(addrQueryOriginPeers, pid, nil) if len(cachedAddrs) > 0 { return iter.ToResultIter(iter.FromSlice([]*types.PeerRecord{ { @@ -91,14 +93,14 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it // If the peer was found, there is likely no point in looking up the cache (because kad-dht will connect to it as part of FindPeers), but we'll do it just in case. return iter.Map(it, func(record iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { - record.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, record.Val.ID, record.Val.Addrs) + record.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, *record.Val.ID, record.Val.Addrs) return record }), nil } // withAddrsFromCache returns the best list of addrs for specified [peer.ID]. // It will consult cache ONLY if the addrs slice passed to it is empty. 
-func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid *peer.ID, addrs []types.Multiaddr) []types.Multiaddr { +func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid peer.ID, addrs []types.Multiaddr) []types.Multiaddr { // skip cache if we already have addrs if len(addrs) > 0 { peerAddrLookups.WithLabelValues(addrCacheStateUnused, queryOrigin).Inc() @@ -148,13 +150,14 @@ func (it *cacheFallbackIter) Next() bool { if it.sourceIter.Next() { val := it.sourceIter.Val() handleRecord := func(id *peer.ID, record *types.PeerRecord) bool { - record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, id, record.Addrs) + record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, *id, record.Addrs) if record.Addrs != nil { it.current = iter.Result[types.Record]{Val: record} return true } logger.Infow("no cached addresses found in cacheFallbackIter, dispatching find peers", "peer", id) + // TODO: Before dispatching, implement a backoff strategy based on the failed connection time it.ongoingLookups.Add(1) // important to increment here since Next() may be called again synchronously // If a record has no addrs, we dispatch a lookup to find addresses go it.dispatchFindPeer(*record) From b536e82ff40bf1134cb53407811089e6cb957f91 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 18:08:43 +0100 Subject: [PATCH 49/80] feat: add exponential backoff for probes/peer lookups --- cached_addr_book.go | 44 +++++++++++-------- cached_addr_book_test.go | 91 +++++++++++++++++++++++++++++++++++++++- server_cached_router.go | 9 ++-- 3 files changed, 121 insertions(+), 23 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 7688c7f..94d5ebc 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -51,15 +51,12 @@ const ( // How long to wait since last connection before probing a peer again PeerProbeThreshold = time.Hour - // How often to run the probe peers function (Same as RecentlyConnectedAddrTTL) + // How often to run the probe peers loop ProbeInterval = time.Minute * 15 // How many concurrent probes to run at once MaxConcurrentProbes = 20 - // How many connect failures to tolerate before clearing a peer's addresses - MaxConnectFailures = 3 - // How long to wait for a connect in a probe to complete. // The worst case is a peer behind a relay, so we use the relay connect timeout. ConnectTimeout = relay.ConnectTimeout @@ -72,7 +69,7 @@ const ( type peerState struct { lastConnTime time.Time // last time we successfully connected to this peer lastFailedConnTime time.Time // last time we failed to find or connect to this peer - connectFailures int // number of times we've failed to connect to this peer + connectFailures uint // number of times we've failed to connect to this peer } type cachedAddrBook struct { @@ -202,21 +199,10 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { continue // don't probe connected peers } - pState, exists := cab.peerCache.Get(p) - if !exists { - logger.Errorf("peer %s not in peer cache but found in cached address book. This should not happen. ", p) - continue // TODO: maybe we should still probe them? 
- } - - if time.Since(pState.lastConnTime) < PeerProbeThreshold { - continue // don't probe peers below the probe threshold + if !cab.ShouldProbePeer(p) { + continue } - if pState.connectFailures > MaxConnectFailures { - // TODO: maybe implement a backoff strategy instead of clearing the peer's addresses - cab.addrBook.ClearAddrs(p) // clear the peer's addresses - continue // don't probe this peer - } addrs := cab.addrBook.Addrs(p) if !cab.allowPrivateIPs { @@ -279,6 +265,28 @@ func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { cab.peerCache.Add(p, pState) } +// Returns true if we should probe a peer (either by dialing known addresses or by dispatching a FindPeer) +// based on the last failed connection time and connection failures +func (cab *cachedAddrBook) ShouldProbePeer(p peer.ID) bool { + pState, exists := cab.peerCache.Get(p) + if !exists { + return true // default to probing if the peer is not in the cache + } + + var backoffDuration time.Duration + if pState.connectFailures > 0 { + // Calculate backoff only if we have failures + // this is effectively 2^(connectFailures - 1) * PeerProbeThreshold + // A single failure results in a 1 hour backoff + backoffDuration = PeerProbeThreshold * time.Duration(1<<(pState.connectFailures-1)) + } else { + backoffDuration = PeerProbeThreshold + } + + // Only dispatch if we've waited long enough based on the backoff + return time.Since(pState.lastFailedConnTime) > backoffDuration +} + func hasValidConnectedness(connectedness network.Connectedness) bool { return connectedness == network.Connected || connectedness == network.Limited } diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index 894094d..e30715e 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -104,7 +104,96 @@ func TestProbePeers(t *testing.T) { // Verify connect failures increased pState, exists := cab.peerCache.Get(testPeer) assert.True(t, exists) - assert.Equal(t, 1, pState.connectFailures) + assert.Equal(t, uint(1), pState.connectFailures) +} + +func TestShouldProbePeer(t *testing.T) { + t.Parallel() + + cab, err := newCachedAddrBook() + require.NoError(t, err) + + testPeer := peer.ID("test-peer") + + tests := []struct { + name string + peerState peerState + expectedResult bool + }{ + { + name: "peer not in cache", + peerState: peerState{}, + expectedResult: true, + }, + { + name: "no failures, within threshold", + peerState: peerState{ + lastFailedConnTime: time.Now().Add(-30 * time.Minute), + connectFailures: 0, + }, + expectedResult: false, + }, + { + name: "no failures, beyond threshold", + peerState: peerState{ + lastFailedConnTime: time.Now().Add(-2 * PeerProbeThreshold), + connectFailures: 0, + }, + expectedResult: true, + }, + { + name: "one failure, just beyond backoff", + peerState: peerState{ + lastFailedConnTime: time.Now().Add(-90 * time.Minute), + connectFailures: 1, + }, + expectedResult: true, + }, + { + name: "one failure, beyond backoff", + peerState: peerState{ + lastFailedConnTime: time.Now().Add(-3 * PeerProbeThreshold), + connectFailures: 1, + }, + expectedResult: true, + }, + { + name: "two failures, within backoff", + peerState: peerState{ + lastFailedConnTime: time.Now().Add(-90 * time.Minute), + connectFailures: 2, + }, + expectedResult: false, + }, + { + name: "two failures, beyond backoff", + peerState: peerState{ + lastFailedConnTime: time.Now().Add(-2 * PeerProbeThreshold), + connectFailures: 2, + }, + expectedResult: true, + }, + { + name: "never failed connection", + peerState: peerState{
lastFailedConnTime: time.Time{}, // zero time + connectFailures: 0, + }, + expectedResult: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.peerState != (peerState{}) { + cab.peerCache.Add(testPeer, tt.peerState) + } + result := cab.ShouldProbePeer(testPeer) + assert.Equal(t, tt.expectedResult, result, + "expected ShouldProbePeer to return %v for case: %s", + tt.expectedResult, tt.name) + }) + } } // Mock connection for testing diff --git a/server_cached_router.go b/server_cached_router.go index cb388c2..bcc339f 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -157,10 +157,11 @@ func (it *cacheFallbackIter) Next() bool { } logger.Infow("no cached addresses found in cacheFallbackIter, dispatching find peers", "peer", id) - // TODO: Before dispatching, implement a backoff strategy based on the failed connection time - it.ongoingLookups.Add(1) // important to increment here since Next() may be called again synchronously - // If a record has no addrs, we dispatch a lookup to find addresses - go it.dispatchFindPeer(*record) + if it.router.cachedAddrBook.ShouldProbePeer(*id) { + it.ongoingLookups.Add(1) // important to increment before dispatchFindPeer + // If a record has no addrs, we dispatch a lookup to find addresses + go it.dispatchFindPeer(*record) + } return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } From 7182699f3aafa63fdc348883777a03cbdff4a42d Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Thu, 5 Dec 2024 18:48:07 +0100 Subject: [PATCH 50/80] fix: return peers with no addrs that won't probe --- server_cached_router.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/server_cached_router.go b/server_cached_router.go index bcc339f..cdcb7fa 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -161,9 +161,14 @@ func (it *cacheFallbackIter) Next() bool { it.ongoingLookups.Add(1) // important to increment before dispatchFindPeer // If a record has no addrs, we dispatch a lookup to find addresses go it.dispatchFindPeer(*record) + + return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } - return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result + // If we're not going to probe, return the record with no addrs + // TODO: should we even return these if the peer is likely unreachable?
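Aside: the schedule implemented by ShouldProbePeer above, as a standalone sketch assuming a one-hour PeerProbeThreshold (matching the constants earlier in the series). The wait doubles with each failure, and the strict > comparison means a peer at exactly the boundary is still skipped, which is what the brittle-test fix a little further on accounts for:

package main

import (
	"fmt"
	"time"
)

const peerProbeThreshold = time.Hour

// backoff mirrors the 2^(connectFailures-1) * PeerProbeThreshold formula.
func backoff(connectFailures uint) time.Duration {
	if connectFailures == 0 {
		return peerProbeThreshold
	}
	return peerProbeThreshold * time.Duration(1<<(connectFailures-1))
}

func main() {
	for f := uint(0); f <= 4; f++ {
		fmt.Printf("failures=%d wait=%s\n", f, backoff(f))
	}
	// failures=0 wait=1h0m0s
	// failures=1 wait=1h0m0s
	// failures=2 wait=2h0m0s
	// failures=3 wait=4h0m0s
	// failures=4 wait=8h0m0s
}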
+ it.current = iter.Result[types.Record]{Val: record} + return true } switch val.Val.GetSchema() { From b0b24e05c189b426c7cfd7a6b912d695fff36a20 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:41:10 +0100 Subject: [PATCH 51/80] fix: brittle test --- cached_addr_book_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index e30715e..fc773e2 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -168,7 +168,7 @@ func TestShouldProbePeer(t *testing.T) { { name: "two failures, beyond backoff", peerState: peerState{ - lastFailedConnTime: time.Now().Add(-2 * PeerProbeThreshold), + lastFailedConnTime: time.Now().Add(-3 * PeerProbeThreshold), connectFailures: 2, }, expectedResult: true, From 697457d1d575ac1df8d50062d308872ed845991f Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:00:26 +0100 Subject: [PATCH 52/80] feat: add probed peers counter --- cached_addr_book.go | 46 ++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 94d5ebc..04ee223 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -23,25 +23,8 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" ) -var ( - probeDurationHistogram = promauto.NewHistogram(prometheus.HistogramOpts{ - Name: "probe_duration_seconds", - Namespace: name, - Subsystem: "cached_addr_book", - Help: "Duration of peer probing operations in seconds", - // Buckets probe durations from 1s to 5 minutes - Buckets: []float64{1, 2, 5, 10, 30, 60, 120, 300}, - }) - - peerStateSize = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "peer_state_size", - Subsystem: "cached_addr_book", - Namespace: name, - Help: "Number of peers object currently in the peer state", - }) -) - const ( + Subsystem = "cached_addr_book" // The TTL to keep recently connected peers for. 
Same as [amino.DefaultProvideValidity] in go-libp2p-kad-dht RecentlyConnectedAddrTTL = amino.DefaultProvideValidity @@ -66,6 +49,31 @@ const ( PeerCacheSize = 100_000 ) +var ( + probeDurationHistogram = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "probe_duration_seconds", + Namespace: name, + Subsystem: Subsystem, + Help: "Duration of peer probing operations in seconds", + // Buckets probe durations from 1s to 5 minutes + Buckets: []float64{1, 2, 5, 10, 30, 60, 120, 300}, + }) + + probedPeersCounter = promauto.NewCounter(prometheus.CounterOpts{ + Name: "probed_peers", + Subsystem: Subsystem, + Namespace: name, + Help: "Number of peers probed", + }) + + peerStateSize = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "peer_state_size", + Subsystem: Subsystem, + Namespace: name, + Help: "Number of peers object currently in the peer state", + }) +) + type peerState struct { lastConnTime time.Time // last time we successfully connected to this peer lastFailedConnTime time.Time // last time we failed to find or connect to this peer @@ -220,7 +228,7 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { <-semaphore // Release semaphore wg.Done() }() - + probedPeersCounter.Inc() ctx, cancel := context.WithTimeout(ctx, ConnectTimeout) defer cancel() logger.Debugf("Probe %d: PeerID: %s, Addrs: %v", i+1, p, addrs) From 7fcf45fccca84fed279873765af93eac3008cea0 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:48:27 +0100 Subject: [PATCH 53/80] fix: adjust probe duration metric buckets --- cached_addr_book.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 04ee223..ce50a96 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -55,8 +55,8 @@ var ( Namespace: name, Subsystem: Subsystem, Help: "Duration of peer probing operations in seconds", - // Buckets probe durations from 1s to 5 minutes - Buckets: []float64{1, 2, 5, 10, 30, 60, 120, 300}, + // Buckets probe durations from 5s to 15 minutes + Buckets: []float64{5, 10, 30, 60, 120, 300, 600, 900}, }) probedPeersCounter = promauto.NewCounter(prometheus.CounterOpts{ From 171821532bf5a7e9a15c45608f8ff5a532eb74f0 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:04:08 +0100 Subject: [PATCH 54/80] fix: prevent race conditions --- cached_addr_book.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index ce50a96..d031448 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -181,6 +181,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { continue } logger.Debug("Starting to probe peers") + cab.isProbing.Store(true) go cab.probePeers(ctx, host) } } @@ -188,7 +189,6 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { // Loops over all peers with addresses and probes them if they haven't been probed recently func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { - cab.isProbing.Store(true) defer cab.isProbing.Store(false) start := time.Now() @@ -222,8 +222,8 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { } wg.Add(1) + semaphore <- struct{}{} go func() { - semaphore <- struct{}{} defer func() { <-semaphore // Release semaphore wg.Done() From dc57e9f9eba17d19d54e1d30056f85904af916bc Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Fri, 6 Dec 
2024 16:18:56 +0100 Subject: [PATCH 55/80] feat: increase cache size and add max backoff --- cached_addr_book.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index d031448..54cb495 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -45,8 +45,11 @@ const ( ConnectTimeout = relay.ConnectTimeout // How many peers to cache in the peer state cache - // 100_000 is also the default number of signed peer records cached by the memory address book. - PeerCacheSize = 100_000 + // 1_000_000 is 10x the default number of signed peer records cached by the memory address book. + PeerCacheSize = 1_000_000 + + // Maximum backoff duration for probing a peer + MaxBackoffDuration = time.Hour * 24 ) var ( @@ -285,8 +288,9 @@ func (cab *cachedAddrBook) ShouldProbePeer(p peer.ID) bool { if pState.connectFailures > 0 { // Calculate backoff only if we have failures // this is effectively 2^(connectFailures - 1) * PeerProbeThreshold - // A single failure results in a 1 hour backoff + // A single failure results in a 1 hour backoff and each additional failure doubles the backoff up to 24 hours backoffDuration = PeerProbeThreshold * time.Duration(1<<(pState.connectFailures-1)) + backoffDuration = min(backoffDuration, MaxBackoffDuration) // clamp to max backoff duration } else { backoffDuration = PeerProbeThreshold } From c5abeec8c18055d9c3c871db2f387ac819f38e27 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Mon, 9 Dec 2024 12:33:16 +0100 Subject: [PATCH 56/80] fix: omit providers whose peer cannot be found --- server_cached_router.go | 9 +++++++-- server_cached_router_test.go | 24 ++++++++++-------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index cdcb7fa..5af54f0 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -186,9 +186,14 @@ func (it *cacheFallbackIter) Next() bool { logger.Infow("waiting for ongoing find peers result") select { case result, ok := <-it.findPeersResult: - if ok { + if !ok { + return false // channel closed. 
We're done + } + if result.Addrs != nil { // Only if the lookup returned a result and it has addrs it.current = iter.Result[types.Record]{Val: &result} return true + } else { + return it.Next() // recursively call Next() in case there are more ongoing lookups } case <-it.ctx.Done(): return false @@ -220,7 +225,7 @@ func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { return } if len(peers) > 0 { - // If we found the peer, pass back + // If we found the peer, pass back the result it.findPeersResult <- *peers[0] } else { it.findPeersResult <- record // pass back the record with no addrs } diff --git a/server_cached_router_test.go b/server_cached_router_test.go index 3119e3b..cfe79ff 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -324,15 +324,20 @@ func TestCacheFallbackIter(t *testing.T) { t.Run("handles context cancellation during lookup", func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) pid := peer.ID("test-peer") + publicAddr := mustMultiaddr(t, "/ip4/137.21.14.12/tcp/4001") // Create source iterator with record without addresses sourceIter := newMockResultIter([]iter.Result[types.Record]{ {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: nil}}, }) - // Create mock router with FindPeers that returns ErrNotFound + // Create mock router with FindPeers that returns a peer record with addresses mr := &mockRouter{} + // mr.On("FindPeers", mock.Anything, pid, 1).Return(nil, routing.ErrNotFound) + findPeersIter := newMockResultIter([]iter.Result[*types.PeerRecord]{ + {Val: &types.PeerRecord{Schema: "peer", ID: &pid, Addrs: []types.Multiaddr{publicAddr}}}, + }) + mr.On("FindPeers", mock.Anything, pid, 1).Return(findPeersIter, nil) // Create cached router cab, err := newCachedAddrBook() @@ -342,22 +347,14 @@ // Create fallback iterator fallbackIter := NewCacheFallbackIter(sourceIter, cr, ctx) - // First Next() should trigger lookup - require.True(t, fallbackIter.Next()) - // Cancel context during lookup cancel() - // Next() should return false + // First Next() should trigger lookup require.False(t, fallbackIter.Next()) - - // Val() should return the record with no addrs - result := fallbackIter.Val() - require.Equal(t, pid, *result.Val.(*types.PeerRecord).ID) - require.Len(t, result.Val.(*types.PeerRecord).Addrs, 0) }) - t.Run("handles FindPeers error gracefully", func(t *testing.T) { + t.Run("Fallback FindPeers with no addresses is omitted from result", func(t *testing.T) { ctx := context.Background() pid := peer.ID("test-peer") @@ -381,8 +378,7 @@ // Should still get a result, but with no addresses results, err := iter.ReadAllResults(fallbackIter) require.NoError(t, err) - require.Len(t, results, 1) - require.Empty(t, results[0].(*types.PeerRecord).Addrs) + require.Len(t, results, 0) }) t.Run("handles multiple records with mixed address states", func(t *testing.T) { From 0cc76f904102dc9ecea0ec1b679831f3a26b96c7 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:43:15 +0100 Subject: [PATCH 57/80] chore: remove unused function --- server_cached_router.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 5af54f0..483ddfd 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -11,7 +11,6 @@ import ( "github.com/ipfs/go-cid"
"github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/routing" - ma "github.com/multiformats/go-multiaddr" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) @@ -242,11 +241,3 @@ func (it *cacheFallbackIter) Close() error { }() return it.sourceIter.Close() } - -func ToMultiaddrs(addrs []ma.Multiaddr) []types.Multiaddr { - var result []types.Multiaddr - for _, addr := range addrs { - result = append(result, types.Multiaddr{Multiaddr: addr}) - } - return result -} From f0e0bd414c241dd2adec2ffefc04229cd4ca2706 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:51:03 +0100 Subject: [PATCH 58/80] deps: upgrade go-libp2p --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 6f429af..8180fea 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/ipfs/boxo v0.24.4-0.20241119003055-e38f236348d6 github.com/ipfs/go-cid v0.4.1 github.com/ipfs/go-log/v2 v2.5.1 - github.com/libp2p/go-libp2p v0.37.1 + github.com/libp2p/go-libp2p v0.37.2 github.com/libp2p/go-libp2p-kad-dht v0.28.1 github.com/libp2p/go-libp2p-record v0.2.0 github.com/multiformats/go-multiaddr v0.13.0 @@ -91,7 +91,7 @@ require ( github.com/mr-tron/base58 v1.2.0 // indirect github.com/multiformats/go-base32 v0.1.0 // indirect github.com/multiformats/go-base36 v0.2.0 // indirect - github.com/multiformats/go-multiaddr-dns v0.4.0 // indirect + github.com/multiformats/go-multiaddr-dns v0.4.1 // indirect github.com/multiformats/go-multiaddr-fmt v0.1.0 // indirect github.com/multiformats/go-multicodec v0.9.0 // indirect github.com/multiformats/go-multistream v0.6.0 // indirect @@ -124,7 +124,7 @@ require ( github.com/prometheus/common v0.60.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/quic-go/qpack v0.5.1 // indirect - github.com/quic-go/quic-go v0.48.1 // indirect + github.com/quic-go/quic-go v0.48.2 // indirect github.com/quic-go/webtransport-go v0.8.1-0.20241018022711-4ac2c9250e66 // indirect github.com/raulk/go-watchdog v1.3.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect diff --git a/go.sum b/go.sum index 5f51037..627eddd 100644 --- a/go.sum +++ b/go.sum @@ -275,8 +275,8 @@ github.com/libp2p/go-flow-metrics v0.0.1/go.mod h1:Iv1GH0sG8DtYN3SVJ2eG221wMiNpZ github.com/libp2p/go-flow-metrics v0.0.3/go.mod h1:HeoSNUrOJVK1jEpDqVEiUOIXqhbnS27omG0uWU5slZs= github.com/libp2p/go-flow-metrics v0.2.0 h1:EIZzjmeOE6c8Dav0sNv35vhZxATIXWZg6j/C08XmmDw= github.com/libp2p/go-flow-metrics v0.2.0/go.mod h1:st3qqfu8+pMfh+9Mzqb2GTiwrAGjIPszEjZmtksN8Jc= -github.com/libp2p/go-libp2p v0.37.1 h1:9p6fLUGmegmI1VuD9y7jgKvisMYNl44HQSiEmPUNi4c= -github.com/libp2p/go-libp2p v0.37.1/go.mod h1:K7H2RGSoEYdi6v85xlSzqW2oqGz7t98nq+b2eRdfvW8= +github.com/libp2p/go-libp2p v0.37.2 h1:Irh+n9aDPTLt9wJYwtlHu6AhMUipbC1cGoJtOiBqI9c= +github.com/libp2p/go-libp2p v0.37.2/go.mod h1:M8CRRywYkqC6xKHdZ45hmqVckBj5z4mRLIMLWReypz8= github.com/libp2p/go-libp2p-asn-util v0.4.1 h1:xqL7++IKD9TBFMgnLPZR6/6iYhawHKHl950SO9L6n94= github.com/libp2p/go-libp2p-asn-util v0.4.1/go.mod h1:d/NI6XZ9qxw67b4e+NgpQexCIiFYJjErASrYW4PFDN8= github.com/libp2p/go-libp2p-core v0.2.4/go.mod h1:STh4fdfa5vDYr0/SzYYeqnt+E6KfEV5VxfIrm0bcI0g= @@ -357,8 +357,8 @@ github.com/multiformats/go-multiaddr v0.1.1/go.mod h1:aMKBKNEYmzmDmxfX88/vz+J5IU github.com/multiformats/go-multiaddr v0.2.0/go.mod h1:0nO36NvPpyV4QzvTLi/lafl2y95ncPj0vFwVF6k6wJ4= 
github.com/multiformats/go-multiaddr v0.13.0 h1:BCBzs61E3AGHcYYTv8dqRH43ZfyrqM8RXVPT8t13tLQ= github.com/multiformats/go-multiaddr v0.13.0/go.mod h1:sBXrNzucqkFJhvKOiwwLyqamGa/P5EIXNPLovyhQCII= -github.com/multiformats/go-multiaddr-dns v0.4.0 h1:P76EJ3qzBXpUXZ3twdCDx/kvagMsNo0LMFXpyms/zgU= -github.com/multiformats/go-multiaddr-dns v0.4.0/go.mod h1:7hfthtB4E4pQwirrz+J0CcDUfbWzTqEzVyYKKIKpgkc= +github.com/multiformats/go-multiaddr-dns v0.4.1 h1:whi/uCLbDS3mSEUMb1MsoT4uzUeZB0N32yzufqS0i5M= +github.com/multiformats/go-multiaddr-dns v0.4.1/go.mod h1:7hfthtB4E4pQwirrz+J0CcDUfbWzTqEzVyYKKIKpgkc= github.com/multiformats/go-multiaddr-fmt v0.1.0 h1:WLEFClPycPkp4fnIzoFoV9FVd49/eQsuaL3/CWe167E= github.com/multiformats/go-multiaddr-fmt v0.1.0/go.mod h1:hGtDIW4PU4BqJ50gW2quDuPVjyWNZxToGUh/HwTZYJo= github.com/multiformats/go-multiaddr-net v0.1.1/go.mod h1:5JNbcfBOP4dnhoZOv10JJVkJO0pCCEf8mTnipAo2UZQ= @@ -469,8 +469,8 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/quic-go/qpack v0.5.1 h1:giqksBPnT/HDtZ6VhtFKgoLOWmlyo9Ei6u9PqzIMbhI= github.com/quic-go/qpack v0.5.1/go.mod h1:+PC4XFrEskIVkcLzpEkbLqq1uCoxPhQuvK5rH1ZgaEg= -github.com/quic-go/quic-go v0.48.1 h1:y/8xmfWI9qmGTc+lBr4jKRUWLGSlSigv847ULJ4hYXA= -github.com/quic-go/quic-go v0.48.1/go.mod h1:yBgs3rWBOADpga7F+jJsb6Ybg1LSYiQvwWlLX+/6HMs= +github.com/quic-go/quic-go v0.48.2 h1:wsKXZPeGWpMpCGSWqOcqpW2wZYic/8T3aqiOID0/KWE= +github.com/quic-go/quic-go v0.48.2/go.mod h1:yBgs3rWBOADpga7F+jJsb6Ybg1LSYiQvwWlLX+/6HMs= github.com/quic-go/webtransport-go v0.8.1-0.20241018022711-4ac2c9250e66 h1:4WFk6u3sOT6pLa1kQ50ZVdm8BQFgJNA117cepZxtLIg= github.com/quic-go/webtransport-go v0.8.1-0.20241018022711-4ac2c9250e66/go.mod h1:Vp72IJajgeOL6ddqrAhmp7IM9zbTcgkQxD/YdxrVwMw= github.com/raulk/go-watchdog v1.3.0 h1:oUmdlHxdkXRJlwfG0O9omj8ukerm8MEQavSiDTEtBsk= From 2211aae9db77cb6ae0897bb77469308697e46ecd Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:39:05 +0100 Subject: [PATCH 59/80] fix: avoid using the cache in FindPeers --- server_cached_router.go | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 483ddfd..9b49206 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -71,18 +71,7 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it if err == routing.ErrNotFound { // ErrNotFound will be returned if either dialing the peer failed or the peer was not found - r.cachedAddrBook.RecordFailedConnection(pid) - // If we didn't find the peer, try the cache - cachedAddrs := r.withAddrsFromCache(addrQueryOriginPeers, pid, nil) - if len(cachedAddrs) > 0 { - return iter.ToResultIter(iter.FromSlice([]*types.PeerRecord{ - { - Schema: types.SchemaPeer, - ID: &pid, - Addrs: cachedAddrs, - }, - })), nil - } + r.cachedAddrBook.RecordFailedConnection(pid) // record the failure used for probing/backoff purposes return nil, routing.ErrNotFound } @@ -90,11 +79,9 @@ func (r cachedRouter) FindPeers(ctx context.Context, pid peer.ID, limit int) (it return nil, err } - // If the peer was found, there is likely no point in looking up the cache (because kad-dht will connect to it as part of FindPeers), but we'll do it just in case. 
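Aside: peerAddrLookups, updated in the hunk below, is a labeled Prometheus counter: one time series per (cache state, query origin) pair. A standalone sketch of a counter in that shape (metric, namespace, and label values here are illustrative, not the project's exact ones):

package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var peerAddrLookups = promauto.NewCounterVec(prometheus.CounterOpts{
	Namespace: "someguy",
	Name:      "peer_addr_lookups",
	Help:      "Peer address lookups, by cache state and query origin",
}, []string{"cache", "origin"})

func main() {
	// Each distinct label combination is its own counter series.
	peerAddrLookups.WithLabelValues("unused", "peers").Inc()
	peerAddrLookups.WithLabelValues("hit", "providers").Inc()
}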
- return iter.Map(it, func(record iter.Result[*types.PeerRecord]) iter.Result[*types.PeerRecord] { - record.Val.Addrs = r.withAddrsFromCache(addrQueryOriginPeers, *record.Val.ID, record.Val.Addrs) - return record - }), nil + // update the metrics to indicate that we didn't look up the cache for this lookup + peerAddrLookups.WithLabelValues(addrCacheStateUnused, addrQueryOriginPeers).Inc() + return it, nil } // withAddrsFromCache returns the best list of addrs for specified [peer.ID]. From be5958abc30f1bfd4ffa817fbd387f397ebb2572 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:46:06 +0100 Subject: [PATCH 60/80] fix: do not return cached results for FindPeers --- server_cached_router_test.go | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/server_cached_router_test.go b/server_cached_router_test.go index cfe79ff..e0ec1c6 100644 --- a/server_cached_router_test.go +++ b/server_cached_router_test.go @@ -88,7 +88,7 @@ func TestCachedRouter(t *testing.T) { require.Equal(t, publicAddr.String(), peerRecord.Addrs[0].String()) }) - t.Run("FindPeers with cache hit", func(t *testing.T) { + t.Run("Failed FindPeers with cached addresses does not return cached addresses", func(t *testing.T) { ctx := context.Background() pid := peer.ID("test-peer") @@ -106,17 +106,8 @@ func TestCachedRouter(t *testing.T) { // Create cached router cr := NewCachedRouter(mr, cab) - it, err := cr.FindPeers(ctx, pid, 10) - require.NoError(t, err) - - results, err := iter.ReadAllResults(it) - require.NoError(t, err) - require.Len(t, results, 1) - - // Verify cached addresses were returned - require.Equal(t, pid, *results[0].ID) - require.Len(t, results[0].Addrs, 1) - require.Equal(t, publicAddr.String(), results[0].Addrs[0].String()) + _, err = cr.FindPeers(ctx, pid, 10) + require.ErrorIs(t, err, routing.ErrNotFound) }) t.Run("FindPeers with cache miss", func(t *testing.T) { From af7c3a840de7ca241d0aaa5440c9f8b235b772e5 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:49:44 +0100 Subject: [PATCH 61/80] refactor: small optimisation --- cached_addr_book.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 54cb495..d20fbd5 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -257,7 +257,7 @@ func (cab *cachedAddrBook) GetCachedAddrs(p peer.ID) []types.Multiaddr { return nil } - var result []types.Multiaddr // convert to local Multiaddr type 🙃 + result := make([]types.Multiaddr, 0, len(cachedAddrs)) for _, addr := range cachedAddrs { result = append(result, types.Multiaddr{Multiaddr: addr}) } From 62c0d9fa55d9108a9fc6b859a4c310c41d344535 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:15:02 +0100 Subject: [PATCH 62/80] chore: re-add comment --- cached_addr_book.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index d20fbd5..8fab876 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -257,7 +257,7 @@ func (cab *cachedAddrBook) GetCachedAddrs(p peer.ID) []types.Multiaddr { return nil } - result := make([]types.Multiaddr, 0, len(cachedAddrs)) + result := make([]types.Multiaddr, 0, len(cachedAddrs)) // convert to local Multiaddr type 🙃 for _, addr := range cachedAddrs { result = append(result, types.Multiaddr{Multiaddr: addr}) } From 8b36b0c75a0033e091f00fca65148328c9701f8e Mon Sep 17 00:00:00 2001 From: 
Daniel Norman <1992255+2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:04:33 +0100 Subject: [PATCH 63/80] Apply suggestions from code review Co-authored-by: Marcin Rataj --- cached_addr_book.go | 15 +++++++++++---- server.go | 2 +- server_cached_router.go | 4 ++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 8fab876..e60da7f 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -147,7 +147,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { case ev := <-sub.Out(): switch ev := ev.(type) { case event.EvtPeerIdentificationCompleted: - pState, exists := cab.peerCache.Get(ev.Peer) + pState, exists := cab.peerCache.Peek(ev.Peer) if !exists { pState = peerState{} } @@ -267,11 +267,18 @@ func (cab *cachedAddrBook) GetCachedAddrs(p peer.ID) []types.Multiaddr { // Update the peer cache with information about a failed connection // This should be called when a connection attempt to a peer fails func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { - pState, exists := cab.peerCache.Get(p) + pState, exists := cab.peerCache.Peek(p) if !exists { pState = peerState{} } - pState.lastFailedConnTime = time.Now() + now := time.Now() + // once probing of offline peer reached MaxBackoffDuration and still failed, + // we opportunistically remove the dead peer from cache to save time on probing it further + if exists && pState.connectFailures > 1 && now.Sub(pState.lastFailedConnTime) > MaxBackoffDuration { + cab.peerCache.Remove(p) + return + } + pState.lastFailedConnTime = now pState.connectFailures++ cab.peerCache.Add(p, pState) } @@ -279,7 +286,7 @@ func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { // Returns true if we should probe a peer (either by dialing known addresses or by dispatching a FindPeer) // based on the last failed connection time and connection failures func (cab *cachedAddrBook) ShouldProbePeer(p peer.ID) bool { - pState, exists := cab.peerCache.Get(p) + pState, exists := cab.peerCache.Peek(p) if !exists { return true // default to probing if the peer is not in the cache } diff --git a/server.go b/server.go index ceafe49..bc9281e 100644 --- a/server.go +++ b/server.go @@ -84,7 +84,7 @@ func start(ctx context.Context, cfg *config) error { var cachedAddrBook *cachedAddrBook if cfg.cachedAddrBook { - fmt.Println("Using cached address book to speed up peer discovery") + fmt.Println("Using cached address book to speed up provider discovery") cachedAddrBook, err = newCachedAddrBook() if err != nil { return err diff --git a/server_cached_router.go b/server_cached_router.go index 9b49206..1584550 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -137,7 +137,7 @@ func (it *cacheFallbackIter) Next() bool { val := it.sourceIter.Val() handleRecord := func(id *peer.ID, record *types.PeerRecord) bool { record.Addrs = it.router.withAddrsFromCache(addrQueryOriginProviders, *id, record.Addrs) - if record.Addrs != nil { + if len(record.Addrs) > 0 { it.current = iter.Result[types.Record]{Val: record} return true } @@ -175,7 +175,7 @@ func (it *cacheFallbackIter) Next() bool { if !ok { return false // channel closed. 
We're done } - if result.Addrs != nil { // Only if the lookup returned a result and it has addrs + if len(result.Addrs) > 0 { // Only if the lookup returned a result and it has addrs it.current = iter.Result[types.Record]{Val: &result} return true } else { return it.Next() // recursively call Next() in case there are more ongoing lookups From b58b50d8c757452c426d288096963f4a8ccbf499 Mon Sep 17 00:00:00 2001 From: Daniel Norman <1992255+2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:34:15 +0100 Subject: [PATCH 64/80] Apply suggestions from code review Co-authored-by: Marcin Rataj --- server_cached_router.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 1584550..5d6d22c 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -148,13 +148,8 @@ func (it *cacheFallbackIter) Next() bool { // If a record has no addrs, we dispatch a lookup to find addresses go it.dispatchFindPeer(*record) - } return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } - - // If we're not going to probe, return the record with no addrs - // TODO: should we even return these if the peer is likely unreachable? - it.current = iter.Result[types.Record]{Val: record} - return true + } return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } switch val.Val.GetSchema() { From 41922af5877e594a9138fda329fecfce0f9fe6b9 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:16:34 +0100 Subject: [PATCH 65/80] fix: use separate context for dispatched jobs --- server_cached_router.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 5d6d22c..2b7eae4 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -43,6 +43,8 @@ const ( addrQueryOriginProviders = "providers" addrQueryOriginPeers = "peers" addrQueryOriginUnknown = "unknown" + + DispatchedFindPeersTimeout = time.Minute ) // cachedRouter wraps a router with the cachedAddrBook to retrieve cached addresses for peers without multiaddrs in FindProviders @@ -147,7 +149,6 @@ func (it *cacheFallbackIter) Next() bool { it.ongoingLookups.Add(1) // important to increment before dispatchFindPeer // If a record has no addrs, we dispatch a lookup to find addresses go it.dispatchFindPeer(*record) - } return it.Next() // Recursively call Next() to either read from sourceIter or wait for lookup result } @@ -193,8 +194,14 @@ func (it *cacheFallbackIter) Val() iter.Result[types.Record] { func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { defer it.ongoingLookups.Add(-1) - // FindPeers is weird in that it accepts a limit. But we only want one result, ideally from the libp2p router. - peersIt, err := it.router.FindPeers(it.ctx, *record.ID, 1) + + // Create a new context with a timeout that is independent of the main request context + // This is important because finishing the FindPeer (and determining whether this peer + // is reachable) will benefit other requests and keep the cache up to date.
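Aside: a standalone sketch of the detached-context pattern this patch adopts. Deriving the dispatched lookup from context.Background() with its own timeout means a cancelled request no longer aborts work that would warm the cache for everyone else (timeout and helper are illustrative):

package main

import (
	"context"
	"fmt"
	"time"
)

// lookup stands in for a FindPeers call that honors its context.
func lookup(ctx context.Context) error {
	select {
	case <-time.After(100 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	reqCtx, cancelReq := context.WithCancel(context.Background())
	cancelReq() // simulate the client going away immediately

	// Derived from the request context: fails as soon as the request dies.
	fmt.Println(lookup(reqCtx)) // context canceled

	// Detached: bounded only by its own timeout, so the result still lands.
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	fmt.Println(lookup(ctx)) // <nil>
}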
+ ctx, cancel := context.WithTimeout(context.Background(), DispatchedFindPeersTimeout) + defer cancel() + + peersIt, err := it.router.FindPeers(ctx, *record.ID, 1) if err != nil { it.findPeersResult <- record // pass back the record with no addrs From 06cef21b1ee6689fdcfc10767db9f319a11bb455 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:23:59 +0100 Subject: [PATCH 66/80] fix: ensure proper cleanup of cache fallback iter --- server_cached_router.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/server_cached_router.go b/server_cached_router.go index 2b7eae4..b965f04 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -123,7 +123,7 @@ type cacheFallbackIter struct { // It's a bit complex because it ensures we continue iterating without blocking on the FindPeers call. func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cachedRouter, ctx context.Context) *cacheFallbackIter { ctx, cancel := context.WithCancel(ctx) - return &cacheFallbackIter{ + iter := &cacheFallbackIter{ sourceIter: sourceIter, router: router, ctx: ctx, @@ -131,6 +131,14 @@ func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cache findPeersResult: make(chan types.PeerRecord), ongoingLookups: atomic.Int32{}, } + + // Add a goroutine to handle context cancellation + go func() { + <-ctx.Done() + iter.Close() + }() + + return iter } func (it *cacheFallbackIter) Next() bool { @@ -203,6 +211,11 @@ func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { peersIt, err := it.router.FindPeers(ctx, *record.ID, 1) + // Check if the parent context is done before sending + if it.ctx.Err() != nil { + return // Exit early if the parent context is done + } + if err != nil { it.findPeersResult <- record // pass back the record with no addrs return From 7a2160a9080a9c2668d20cd1786c842c61336624 Mon Sep 17 00:00:00 2001 From: Daniel Norman <1992255+2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:26:04 +0100 Subject: [PATCH 67/80] Update main.go Co-authored-by: Marcin Rataj --- main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.go b/main.go index 3f1a3da..1fd5c7a 100644 --- a/main.go +++ b/main.go @@ -42,7 +42,7 @@ func main() { Name: "cached-addr-book", Value: true, EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK"}, - Usage: "use a cached address book to improve peer routing performance", + Usage: "use a cached address book to improve provider lookup responses", }, &cli.StringSliceFlag{ Name: "provider-endpoints", From 84bc4f75c6f2ca47c26eaff83ab5e2c86de99a31 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:16:42 +0100 Subject: [PATCH 68/80] fix: formatting --- cached_addr_book.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index e60da7f..5074de2 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -275,8 +275,8 @@ func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { // once probing of offline peer reached MaxBackoffDuration and still failed, // we opportunistically remove the dead peer from cache to save time on probing it further if exists && pState.connectFailures > 1 && now.Sub(pState.lastFailedConnTime) > MaxBackoffDuration { - cab.peerCache.Remove(p) - return + cab.peerCache.Remove(p) + return } pState.lastFailedConnTime = now pState.connectFailures++ From 0c28c6b3034cc3d1a6ff77088086e5a3bff63817 Mon Sep 
17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:05:04 +0100 Subject: [PATCH 69/80] fix: let consumer handle cleanup --- server_cached_router.go | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index b965f04..3764b48 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -63,7 +63,15 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) return nil, err } - return NewCacheFallbackIter(it, r, ctx), nil + iter := NewCacheFallbackIter(it, r, ctx) + + go func() { + // make sure we close the iterator when the parent context is done + <-ctx.Done() + iter.Close() + }() + + return iter, nil } // FindPeers uses a simpler approach than FindProviders because we're dealing with a single PeerRecord, and there's @@ -115,29 +123,20 @@ type cacheFallbackIter struct { findPeersResult chan types.PeerRecord router cachedRouter ctx context.Context - cancel context.CancelFunc ongoingLookups atomic.Int32 } // NewCacheFallbackIter is a wrapper around a results iterator that will resolve peers with no addresses from cache and if no cached addresses, will look them up via FindPeers. // It's a bit complex because it ensures we continue iterating without blocking on the FindPeers call. func NewCacheFallbackIter(sourceIter iter.ResultIter[types.Record], router cachedRouter, ctx context.Context) *cacheFallbackIter { - ctx, cancel := context.WithCancel(ctx) iter := &cacheFallbackIter{ sourceIter: sourceIter, router: router, ctx: ctx, - cancel: cancel, findPeersResult: make(chan types.PeerRecord), ongoingLookups: atomic.Int32{}, } - // Add a goroutine to handle context cancellation - go func() { - <-ctx.Done() - iter.Close() - }() - return iter } @@ -234,12 +233,10 @@ func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { } func (it *cacheFallbackIter) Close() error { - it.cancel() - go func() { - for it.ongoingLookups.Load() > 0 { - time.Sleep(time.Millisecond * 100) - } - close(it.findPeersResult) - }() - return it.sourceIter.Close() + for it.ongoingLookups.Load() > 0 { + time.Sleep(time.Millisecond * 100) + } + + close(it.findPeersResult) + return nil } From e0a601fe4deed1419016eff09ef48d4588a8bf5a Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:29:09 +0100 Subject: [PATCH 70/80] fix: remove from address book when removed from peer state --- cached_addr_book.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cached_addr_book.go b/cached_addr_book.go index 5074de2..31a654f 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -276,6 +276,8 @@ func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { // we opportunistically remove the dead peer from cache to save time on probing it further if exists && pState.connectFailures > 1 && now.Sub(pState.lastFailedConnTime) > MaxBackoffDuration { cab.peerCache.Remove(p) + // remove the peer from the addr book. 
Otherwise it will be probed again in the probe loop + cab.addrBook.ClearAddrs(p) return } pState.lastFailedConnTime = now From 7f0ec50ea151754f91af5a5abb15eda03900498e Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:35:17 +0100 Subject: [PATCH 71/80] fix: use normal lru cache instead of 2Q --- cached_addr_book.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 31a654f..79e7bb0 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -84,8 +84,8 @@ type peerState struct { } type cachedAddrBook struct { - addrBook peerstore.AddrBook // memory address book - peerCache *lru.TwoQueueCache[peer.ID, peerState] // LRU cache with additional metadata about peer + addrBook peerstore.AddrBook // memory address book + peerCache *lru.Cache[peer.ID, peerState] // LRU cache with additional metadata about peer isProbing atomic.Bool allowPrivateIPs bool // for testing } @@ -100,7 +100,7 @@ func WithAllowPrivateIPs() AddrBookOption { } func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { - peerCache, err := lru.New2Q[peer.ID, peerState](PeerCacheSize) + peerCache, err := lru.New[peer.ID, peerState](PeerCacheSize) if err != nil { return nil, err } From 2e025ebf4e50bfa2dd34804148a0f6de78e401f4 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:40:42 +0100 Subject: [PATCH 72/80] fix: update the metric when removing from the peer cache --- cached_addr_book.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cached_addr_book.go b/cached_addr_book.go index 79e7bb0..a5f52de 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -276,6 +276,7 @@ func (cab *cachedAddrBook) RecordFailedConnection(p peer.ID) { // we opportunistically remove the dead peer from cache to save time on probing it further if exists && pState.connectFailures > 1 && now.Sub(pState.lastFailedConnTime) > MaxBackoffDuration { cab.peerCache.Remove(p) + peerStateSize.Set(float64(cab.peerCache.Len())) // update metric // remove the peer from the addr book. Otherwise it will be probed again in the probe loop cab.addrBook.ClearAddrs(p) return } pState.lastFailedConnTime = now From 6b4b40dd8d2e7b94c30b77b776f508408ffc3deb Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:57:07 +0100 Subject: [PATCH 73/80] fix: increase max backoff to 48 hours When the max backoff duration is reached and a connection attempt fails, we clear the cached addresses and state. Since this state is useful to prevent unnecessary attempts to dispatch a find peer, we should keep it for as long as a provider record is valid.
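To make the resulting schedule concrete, here is a minimal, runnable sketch of the backoff math used by ShouldProbePeer (assumptions: PeerProbeThreshold is one hour, per the code comment below, and the constant names are local stand-ins rather than the package identifiers):

package main

import (
	"fmt"
	"time"
)

func main() {
	const (
		peerProbeThreshold = time.Hour      // assumed value of PeerProbeThreshold
		maxBackoffDuration = 48 * time.Hour // matches provider record validity
	)
	for failures := 1; failures <= 8; failures++ {
		// effectively 2^(failures-1) * peerProbeThreshold, clamped to the max
		backoff := peerProbeThreshold * time.Duration(1<<(failures-1))
		backoff = min(backoff, maxBackoffDuration)
		fmt.Printf("%d failure(s) -> wait %s before the next probe\n", failures, backoff)
	}
}

With these values the backoff grows 1h, 2h, 4h, and so on, and is clamped at 48h; once a peer keeps failing past MaxBackoffDuration, the next failed attempt evicts it from the cache.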
--- cached_addr_book.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index a5f52de..9fe68b7 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -49,7 +49,7 @@ const ( PeerCacheSize = 1_000_000 // Maximum backoff duration for probing a peer - MaxBackoffDuration = time.Hour * 24 + MaxBackoffDuration = time.Hour * 48 ) var ( @@ -298,7 +298,7 @@ func (cab *cachedAddrBook) ShouldProbePeer(p peer.ID) bool { if pState.connectFailures > 0 { // Calculate backoff only if we have failures // this is effectively 2^(connectFailures - 1) * PeerProbeThreshold - // A single failure results in a 1 hour backoff and each additional failure doubles the backoff up to 24 hours + // A single failure results in a 1 hour backoff and each additional failure doubles the backoff backoffDuration = PeerProbeThreshold * time.Duration(1<<(pState.connectFailures-1)) backoffDuration = min(backoffDuration, MaxBackoffDuration) // clamp to max backoff duration } else { From fe7ad5452c4228cbf37247a2934ff363477182b3 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 11:50:16 +0100 Subject: [PATCH 74/80] feat: add env var for recently connected ttl --- cached_addr_book.go | 40 +++++++++++++++++++++++------------ docs/environment-variables.md | 12 +++++++++++ main.go | 14 +++++++++--- server.go | 15 +++++++++---- 4 files changed, 60 insertions(+), 21 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 9fe68b7..128a852 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -25,8 +25,9 @@ import ( const ( Subsystem = "cached_addr_book" - // The TTL to keep recently connected peers for. Same as [amino.DefaultProvideValidity] in go-libp2p-kad-dht - RecentlyConnectedAddrTTL = amino.DefaultProvideValidity + + // The default TTL to keep recently connected peers' multiaddrs for + DefaultRecentlyConnectedAddrTTL = amino.DefaultProvideValidity // Connected peers don't expire until they disconnect ConnectedAddrTTL = peerstore.ConnectedAddrTTL @@ -48,8 +49,9 @@ const ( // 1_000_000 is 10x the default number of signed peer records cached by the memory address book. PeerCacheSize = 1_000_000 - // Maximum backoff duration for probing a peer - MaxBackoffDuration = time.Hour * 48 + // Maximum backoff duration for probing a peer. After this duration, we will stop + // trying to connect to the peer and remove it from the cache. 
+ MaxBackoffDuration = amino.DefaultProvideValidity ) var ( @@ -84,10 +86,11 @@ type peerState struct { } type cachedAddrBook struct { - addrBook peerstore.AddrBook // memory address book - peerCache *lru.Cache[peer.ID, peerState] // LRU cache with additional metadata about peer - isProbing atomic.Bool - allowPrivateIPs bool // for testing + addrBook peerstore.AddrBook // memory address book + peerCache *lru.Cache[peer.ID, peerState] // LRU cache with additional metadata about peer + isProbing atomic.Bool + allowPrivateIPs bool // for testing + recentlyConnectedTTL time.Duration } type AddrBookOption func(*cachedAddrBook) error @@ -99,6 +102,13 @@ func WithAllowPrivateIPs() AddrBookOption { } } +func WithRecentlyConnectedTTL(ttl time.Duration) AddrBookOption { + return func(cab *cachedAddrBook) error { + cab.recentlyConnectedTTL = ttl + return nil + } +} + func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { peerCache, err := lru.New[peer.ID, peerState](PeerCacheSize) if err != nil { @@ -106,8 +116,9 @@ func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { } cab := &cachedAddrBook{ - peerCache: peerCache, - addrBook: pstoremem.NewAddrBook(), + peerCache: peerCache, + addrBook: pstoremem.NewAddrBook(), + recentlyConnectedTTL: DefaultRecentlyConnectedAddrTTL, // Set default value } for _, opt := range opts { @@ -116,6 +127,7 @@ func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { return nil, err } } + logger.Infof("cachedAddrBook: Using TTL of %s for recently connected peers", cab.recentlyConnectedTTL) return cab, nil } @@ -157,7 +169,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { cab.peerCache.Add(ev.Peer, pState) peerStateSize.Set(float64(cab.peerCache.Len())) // update metric - ttl := getTTL(host.Network().Connectedness(ev.Peer)) + ttl := cab.getTTL(host.Network().Connectedness(ev.Peer)) if ev.SignedPeerRecord != nil { logger.Debug("Caching signed peer record") cab, ok := peerstore.GetCertifiedAddrBook(cab.addrBook) @@ -175,7 +187,7 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { case event.EvtPeerConnectednessChanged: // If the peer is not connected or limited, we update the TTL if !hasValidConnectedness(ev.Connectedness) { - cab.addrBook.UpdateAddrs(ev.Peer, ConnectedAddrTTL, RecentlyConnectedAddrTTL) + cab.addrBook.UpdateAddrs(ev.Peer, ConnectedAddrTTL, cab.recentlyConnectedTTL) } } case <-probeTicker.C: @@ -313,9 +325,9 @@ func hasValidConnectedness(connectedness network.Connectedness) bool { return connectedness == network.Connected || connectedness == network.Limited } -func getTTL(connectedness network.Connectedness) time.Duration { +func (cab *cachedAddrBook) getTTL(connectedness network.Connectedness) time.Duration { if hasValidConnectedness(connectedness) { return ConnectedAddrTTL } - return RecentlyConnectedAddrTTL + return cab.recentlyConnectedTTL } diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 2c10c44..2710dda 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -37,6 +37,18 @@ Whether or not the Accelerated DHT is enabled or not. Default: `true` +### `SOMEGUY_CACHED_ADDR_BOOK` + +Whether or not the Cached Address Book is enabled or not. + +Default: `true` + +### `SOMEGUY_CACHED_ADDR_BOOK_RECENT_TTL` + +The TTL for recently connected peers' multiaddrs in the cached address book. 
+ +Default: `48h` + ### `SOMEGUY_PROVIDER_ENDPOINTS` Comma-separated list of other Delegated Routing V1 endpoints to proxy provider requests to. diff --git a/main.go b/main.go index 1fd5c7a..8d193b9 100644 --- a/main.go +++ b/main.go @@ -44,6 +44,13 @@ func main() { EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK"}, Usage: "use a cached address book to improve provider lookup responses", }, + &cli.DurationFlag{ + Name: "cached-addr-book-recent-ttl", + DefaultText: DefaultRecentlyConnectedAddrTTL.String(), + Value: DefaultRecentlyConnectedAddrTTL, + EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK_RECENT_TTL"}, + Usage: "TTL for recently connected peers' multiaddrs in the cached address book", + }, &cli.StringSliceFlag{ Name: "provider-endpoints", Value: cli.NewStringSlice(cidContactEndpoint), @@ -121,9 +128,10 @@ func main() { }, Action: func(ctx *cli.Context) error { cfg := &config{ - listenAddress: ctx.String("listen-address"), - acceleratedDHTClient: ctx.Bool("accelerated-dht"), - cachedAddrBook: ctx.Bool("cached-addr-book"), + listenAddress: ctx.String("listen-address"), + acceleratedDHTClient: ctx.Bool("accelerated-dht"), + cachedAddrBook: ctx.Bool("cached-addr-book"), + cachedAddrBookRecentTTL: ctx.Duration("cached-addr-book-recent-ttl"), contentEndpoints: ctx.StringSlice("provider-endpoints"), peerEndpoints: ctx.StringSlice("peer-endpoints"), diff --git a/server.go b/server.go index bc9281e..7b4ef9e 100644 --- a/server.go +++ b/server.go @@ -40,9 +40,10 @@ func withRequestLogger(next http.Handler) http.Handler { } type config struct { - listenAddress string - acceleratedDHTClient bool - cachedAddrBook bool + listenAddress string + acceleratedDHTClient bool + cachedAddrBook bool + cachedAddrBookRecentTTL time.Duration contentEndpoints []string peerEndpoints []string @@ -85,7 +86,13 @@ func start(ctx context.Context, cfg *config) error { if cfg.cachedAddrBook { fmt.Println("Using cached address book to speed up provider discovery") - cachedAddrBook, err = newCachedAddrBook() + opts := []AddrBookOption{} + + if cfg.cachedAddrBookRecentTTL > 0 { + opts = append(opts, WithRecentlyConnectedTTL(cfg.cachedAddrBookRecentTTL)) + } + + cachedAddrBook, err = newCachedAddrBook(opts...) 
if err != nil { return err } From 49efe9bcb8c8041a69450a5c9a14e5232dedd7f1 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:00:24 +0100 Subject: [PATCH 75/80] feat: add env var to control active probing --- cached_addr_book.go | 15 ++++++++++++++- docs/environment-variables.md | 6 ++++++ main.go | 15 +++++++++++---- server.go | 13 ++++++++----- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 128a852..6b708b2 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -88,6 +88,7 @@ type peerState struct { type cachedAddrBook struct { addrBook peerstore.AddrBook // memory address book peerCache *lru.Cache[peer.ID, peerState] // LRU cache with additional metadata about peer + probingEnabled bool isProbing atomic.Bool allowPrivateIPs bool // for testing recentlyConnectedTTL time.Duration @@ -109,6 +110,13 @@ func WithRecentlyConnectedTTL(ttl time.Duration) AddrBookOption { } } +func WithActiveProbing(enabled bool) AddrBookOption { + return func(cab *cachedAddrBook) error { + cab.probingEnabled = enabled + return nil + } +} + func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { peerCache, err := lru.New[peer.ID, peerState](PeerCacheSize) if err != nil { @@ -127,7 +135,8 @@ func newCachedAddrBook(opts ...AddrBookOption) (*cachedAddrBook, error) { return nil, err } } - logger.Infof("cachedAddrBook: Using TTL of %s for recently connected peers", cab.recentlyConnectedTTL) + logger.Infof("Using TTL of %s for recently connected peers", cab.recentlyConnectedTTL) + logger.Infof("Probing enabled: %t", cab.probingEnabled) return cab, nil } @@ -191,6 +200,10 @@ func (cab *cachedAddrBook) background(ctx context.Context, host host.Host) { } } case <-probeTicker.C: + if !cab.probingEnabled { + logger.Debug("Probing disabled, skipping") + continue + } if cab.isProbing.Load() { logger.Debug("Skipping peer probe, still running") continue diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 2710dda..bddd91b 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -49,6 +49,12 @@ The TTL for recently connected peers' multiaddrs in the cached address book. Default: `48h` +### `SOMEGUY_CACHED_ADDR_BOOK_ACTIVE_PROBING` + +Whether or not the Cached Address Book should actively probe peers in cache to keep their multiaddrs up to date. + +Default: `true` + ### `SOMEGUY_PROVIDER_ENDPOINTS` Comma-separated list of other Delegated Routing V1 endpoints to proxy provider requests to. 
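As an aside, the options introduced in this commit and the previous one compose as plain functional options. A minimal sketch, assuming the definitions from cached_addr_book.go above (configureCachedAddrBook is a hypothetical helper, not part of this patch; the authoritative wiring is in the server.go hunk below):

func configureCachedAddrBook(cfg *config) (*cachedAddrBook, error) {
	// Both option constructors are defined in cached_addr_book.go.
	return newCachedAddrBook(
		WithRecentlyConnectedTTL(cfg.cachedAddrBookRecentTTL), // e.g. the 48h default
		WithActiveProbing(cfg.cachedAddrBookActiveProbing),    // false disables background probing
	)
}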
diff --git a/main.go b/main.go index 8d193b9..94488bb 100644 --- a/main.go +++ b/main.go @@ -44,6 +44,12 @@ func main() { EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK"}, Usage: "use a cached address book to improve provider lookup responses", }, + &cli.BoolFlag{ + Name: "cached-addr-book-active-probing", + Value: true, + EnvVars: []string{"SOMEGUY_CACHED_ADDR_BOOK_ACTIVE_PROBING"}, + Usage: "actively probe peers in cache to keep their multiaddrs up to date", + }, &cli.DurationFlag{ Name: "cached-addr-book-recent-ttl", DefaultText: DefaultRecentlyConnectedAddrTTL.String(), @@ -128,10 +134,11 @@ func main() { Action: func(ctx *cli.Context) error { cfg := &config{ - listenAddress: ctx.String("listen-address"), - acceleratedDHTClient: ctx.Bool("accelerated-dht"), - cachedAddrBook: ctx.Bool("cached-addr-book"), - cachedAddrBookRecentTTL: ctx.Duration("cached-addr-book-recent-ttl"), + listenAddress: ctx.String("listen-address"), + acceleratedDHTClient: ctx.Bool("accelerated-dht"), + cachedAddrBook: ctx.Bool("cached-addr-book"), + cachedAddrBookActiveProbing: ctx.Bool("cached-addr-book-active-probing"), + cachedAddrBookRecentTTL: ctx.Duration("cached-addr-book-recent-ttl"), contentEndpoints: ctx.StringSlice("provider-endpoints"), peerEndpoints: ctx.StringSlice("peer-endpoints"), diff --git a/server.go b/server.go index 7b4ef9e..4052c81 100644 --- a/server.go +++ b/server.go @@ -40,10 +40,11 @@ func withRequestLogger(next http.Handler) http.Handler { } type config struct { - listenAddress string - acceleratedDHTClient bool - cachedAddrBook bool - cachedAddrBookRecentTTL time.Duration + listenAddress string + acceleratedDHTClient bool + cachedAddrBook bool + cachedAddrBookActiveProbing bool + cachedAddrBookRecentTTL time.Duration contentEndpoints []string peerEndpoints []string @@ -85,13 +86,15 @@ func start(ctx context.Context, cfg *config) error { var cachedAddrBook *cachedAddrBook if cfg.cachedAddrBook { - fmt.Println("Using cached address book to speed up provider discovery") + fmt.Printf("Using cached address book to speed up provider discovery (active probing enabled: %t)\n", cfg.cachedAddrBookActiveProbing) opts := []AddrBookOption{} if cfg.cachedAddrBookRecentTTL > 0 { opts = append(opts, WithRecentlyConnectedTTL(cfg.cachedAddrBookRecentTTL)) } + opts = append(opts, WithActiveProbing(cfg.cachedAddrBookActiveProbing)) + cachedAddrBook, err = newCachedAddrBook(opts...) if err != nil { return err } From 8ca4d194f0637b0ed25b34a26b7b900134336a91 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:32:18 +0100 Subject: [PATCH 76/80] fix: bug from closing the iterator twice No need to close the channel,
just the source iterator --- server_cached_router.go | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 3764b48..2f638f3 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -64,13 +64,6 @@ func (r cachedRouter) FindProviders(ctx context.Context, key cid.Cid, limit int) } iter := NewCacheFallbackIter(it, r, ctx) - - go func() { - // make sure we close the iterator when the parent context is done - <-ctx.Done() - iter.Close() - }() - return iter, nil } @@ -199,6 +192,10 @@ func (it *cacheFallbackIter) Val() iter.Result[types.Record] { return iter.Result[types.Record]{Err: errNoValueAvailable} } +func (it *cacheFallbackIter) Close() error { + return it.sourceIter.Close() +} + func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { defer it.ongoingLookups.Add(-1) @@ -231,12 +228,3 @@ func (it *cacheFallbackIter) dispatchFindPeer(record types.PeerRecord) { it.findPeersResult <- record // pass back the record with no addrs } } - -func (it *cacheFallbackIter) Close() error { - for it.ongoingLookups.Load() > 0 { - time.Sleep(time.Millisecond * 100) - } - - close(it.findPeersResult) - return nil -} From 317ccb7802e3bed434f28d9c14e561c2d9bc4f08 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:36:08 +0100 Subject: [PATCH 77/80] docs: update comment --- server_cached_router.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/server_cached_router.go b/server_cached_router.go index 2f638f3..8ad4149 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -102,8 +102,7 @@ func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid peer.ID, addrs peerAddrLookups.WithLabelValues(addrCacheStateHit, queryOrigin).Inc() return cachedAddrs } else { - // Cache miss. Queue peer for lookup. - peerAddrLookups.WithLabelValues(addrCacheStateMiss, queryOrigin).Inc() + peerAddrLookups.WithLabelValues(addrCacheStateMiss, queryOrigin).Inc() // Cache miss return nil } } From 327f9cbcf907d6aa17aa6be5f23cffbd4f9c23ff Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 13:23:56 +0100 Subject: [PATCH 78/80] docs: improve changelog --- CHANGELOG.md | 10 +++++++--- docs/environment-variables.md | 6 +++--- server_cached_router.go | 5 +++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c09a26..dbb763a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,9 +15,13 @@ The following emojis are used to highlight certain changes: ### Added -- Default caching of peer addresses for 48h to match [provider record expiration on Amino DHT](https://github.com/libp2p/go-libp2p-kad-dht/blob/v0.28.1/amino/defaults.go#L40-L43). Someguy will return cached addresses for peers without multiaddrs in `FindProviders` if there are no addresses for a provider. This can be enabled via `SOMEGUY_CACHED_ADDR_BOOK=true|false` (enabled by default) - - Added a new `cachedAddrBook` implementation that caches peer addresses by subscribing to Identify events and probes those peers in the background. - - Added a new `cachedRouter` that uses `cachedAddrBook` to retrieve cached addresses for peers without multiaddrs. If a Peer is encountered with no cached addresses, `FindPeer` is dispatched in the background. 
+- Peer addresses are cached for 48h to match [provider record expiration on Amino DHT](https://github.com/libp2p/go-libp2p-kad-dht/blob/v0.28.1/amino/defaults.go#L40-L43). +- In the background, someguy probes cached peers at most once per hour (`PeerProbeThreshold`) by attempting to dial them to keep their multiaddrs up to date. If a peer is not reachable, an exponential backoff is applied to reduce the frequency of probing. If a cached peer is unreachable for more than 48h (`MaxBackoffDuration`), it is removed from the cache. +- Someguy now augments providers missing addresses in `FindProviders` with cached addresses. If a peer is encountered with no cached addresses, `FindPeer` is dispatched in the background and the result is streamed in the response. Providers for which no addresses can be found are omitted from the response. + - This can be enabled via `SOMEGUY_CACHED_ADDR_BOOK=true|false` (enabled by default) + - Two additional configuration options for the `cachedAddrBook` implementation: + - `SOMEGUY_CACHED_ADDR_BOOK_ACTIVE_PROBING` to control whether to actively probe cached peers in the background to keep their multiaddrs up to date. + - `SOMEGUY_CACHED_ADDR_BOOK_RECENT_TTL` to adjust the TTL for cached addresses of recently connected peers. ### Changed diff --git a/docs/environment-variables.md b/docs/environment-variables.md index bddd91b..f09ac7d 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -39,19 +39,19 @@ Default: `true` ### `SOMEGUY_CACHED_ADDR_BOOK` -Whether or not the Cached Address Book is enabled or not. +Whether the Cached Address Book is enabled. If disabled, someguy will not return cached addresses for peers without multiaddrs in `FindProviders`. Default: `true` ### `SOMEGUY_CACHED_ADDR_BOOK_RECENT_TTL` -The TTL for recently connected peers' multiaddrs in the cached address book. +The TTL for recently connected peers' multiaddrs in the cached address book. Only applies if `SOMEGUY_CACHED_ADDR_BOOK` is enabled. Default: `48h` ### `SOMEGUY_CACHED_ADDR_BOOK_ACTIVE_PROBING` -Whether or not the Cached Address Book should actively probe peers in cache to keep their multiaddrs up to date. +Whether the Cached Address Book should actively probe peers in the cache to keep their multiaddrs up to date. Only applies if `SOMEGUY_CACHED_ADDR_BOOK` is enabled.
Default: `true` diff --git a/server_cached_router.go b/server_cached_router.go index 8ad4149..c66c6aa 100644 --- a/server_cached_router.go +++ b/server_cached_router.go @@ -96,10 +96,11 @@ func (r cachedRouter) withAddrsFromCache(queryOrigin string, pid peer.ID, addrs return addrs } - cachedAddrs := r.cachedAddrBook.GetCachedAddrs(pid) + cachedAddrs := r.cachedAddrBook.GetCachedAddrs(pid) // Get cached addresses + if len(cachedAddrs) > 0 { logger.Debugw("found cached addresses", "peer", pid, "cachedAddrs", cachedAddrs) - peerAddrLookups.WithLabelValues(addrCacheStateHit, queryOrigin).Inc() + peerAddrLookups.WithLabelValues(addrCacheStateHit, queryOrigin).Inc() // Cache hit return cachedAddrs } else { peerAddrLookups.WithLabelValues(addrCacheStateMiss, queryOrigin).Inc() // Cache miss From 48e1943abb4f9b95d552ff057d3c05e156541e59 Mon Sep 17 00:00:00 2001 From: Daniel N <2color@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:26:39 +0100 Subject: [PATCH 79/80] test: fix background test --- cached_addr_book_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cached_addr_book_test.go b/cached_addr_book_test.go index fc773e2..a20dae1 100644 --- a/cached_addr_book_test.go +++ b/cached_addr_book_test.go @@ -26,14 +26,13 @@ func TestCachedAddrBook(t *testing.T) { } func TestBackground(t *testing.T) { - t.Skip("skipping until this test is less flaky") ctx, cancel := context.WithCancel(context.Background()) defer cancel() // Create a real event bus eventBus := eventbus.NewBus() - emitter, err := eventBus.Emitter(new(event.EvtPeerIdentificationCompleted)) + emitter, err := eventBus.Emitter(new(event.EvtPeerIdentificationCompleted), eventbus.Stateful) require.NoError(t, err) // Use a mock host with a real event bus @@ -70,7 +69,7 @@ func TestBackground(t *testing.T) { require.Eventually(t, func() bool { _, exists := cab.peerCache.Get(testPeer) return exists - }, time.Second*5, time.Millisecond*100, "peer was not added to cache") + }, time.Second*3, time.Millisecond*100, "peer was not added to cache") // Verify peer state pState, exists := cab.peerCache.Get(testPeer) From c1ac41ba13a924a7d7c1b2676efe4c6d26bb8b70 Mon Sep 17 00:00:00 2001 From: Marcin Rataj Date: Wed, 18 Dec 2024 01:32:39 +0100 Subject: [PATCH 80/80] feat(metrics): track online vs offline probe ratio --- cached_addr_book.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cached_addr_book.go b/cached_addr_book.go index 6b708b2..75171a2 100644 --- a/cached_addr_book.go +++ b/cached_addr_book.go @@ -52,6 +52,10 @@ const ( // Maximum backoff duration for probing a peer. After this duration, we will stop // trying to connect to the peer and remove it from the cache. 
MaxBackoffDuration = amino.DefaultProvideValidity + + probeResult = "result" + probeResultOnline = "online" + probeResultOffline = "offline" ) var ( @@ -64,12 +68,14 @@ var ( Buckets: []float64{5, 10, 30, 60, 120, 300, 600, 900}, }) - probedPeersCounter = promauto.NewCounter(prometheus.CounterOpts{ + probedPeersCounter = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "probed_peers", Subsystem: Subsystem, Namespace: name, Help: "Number of peers probed", - }) + }, + []string{probeResult}, + ) peerStateSize = promauto.NewGauge(prometheus.GaugeOpts{ Name: "peer_state_size", @@ -256,7 +262,6 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { <-semaphore // Release semaphore wg.Done() }() - probedPeersCounter.Inc() ctx, cancel := context.WithTimeout(ctx, ConnectTimeout) defer cancel() logger.Debugf("Probe %d: PeerID: %s, Addrs: %v", i+1, p, addrs) @@ -268,6 +273,9 @@ func (cab *cachedAddrBook) probePeers(ctx context.Context, host host.Host) { if err != nil { logger.Debugf("failed to connect to peer %s: %v", p, err) cab.RecordFailedConnection(p) + probedPeersCounter.WithLabelValues(probeResultOffline).Inc() + } else { + probedPeersCounter.WithLabelValues(probeResultOnline).Inc() } }() }
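To sanity-check the new labelled counter, a minimal test sketch (not part of the patch; it assumes the stock github.com/prometheus/client_golang/prometheus/testutil helpers and the testify require package that the existing tests already use):

func TestProbedPeersCounterLabels(t *testing.T) {
	// Simulate one reachable and one unreachable probe result. A fresh process
	// is assumed, since counters accumulate for the lifetime of the registry.
	probedPeersCounter.WithLabelValues(probeResultOnline).Inc()
	probedPeersCounter.WithLabelValues(probeResultOffline).Inc()

	online := testutil.ToFloat64(probedPeersCounter.WithLabelValues(probeResultOnline))
	offline := testutil.ToFloat64(probedPeersCounter.WithLabelValues(probeResultOffline))
	require.Equal(t, float64(1), online)
	require.Equal(t, float64(1), offline)
}

The online/offline ratio can then be derived from the two label values of the exported counter, e.g. online / (online + offline) over the scraped series.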