Skip to content

Commit

Permalink
[Traffic Control] Improved error metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
williampsmith committed Nov 1, 2024
1 parent ef0d78c commit 7d55405
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 4 deletions.
3 changes: 2 additions & 1 deletion crates/sui-core/src/authority_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1296,9 +1296,10 @@ impl ValidatorService {
traffic_controller.tally(TrafficTally {
direct: client,
through_fullnode: None,
error_weight: error.map(normalize).unwrap_or(Weight::zero()),
error_weight: error.clone().map(normalize).unwrap_or(Weight::zero()),
spam_weight,
timestamp: SystemTime::now(),
error_type: error.map(|e| e.to_string()),
})
}
unwrapped_response
Expand Down
12 changes: 10 additions & 2 deletions crates/sui-core/src/traffic_controller/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
// SPDX-License-Identifier: Apache-2.0

use prometheus::{
register_int_counter_with_registry, register_int_gauge_with_registry, IntCounter, IntGauge,
Registry,
register_int_counter_vec_with_registry, register_int_counter_with_registry,
register_int_gauge_with_registry, IntCounter, IntCounterVec, IntGauge, Registry,
};

#[derive(Clone)]
Expand All @@ -18,6 +18,7 @@ pub struct TrafficControllerMetrics {
pub num_dry_run_blocked_requests: IntCounter,
pub tally_handled: IntCounter,
pub error_tally_handled: IntCounter,
pub tally_errors: IntCounterVec,
pub deadmans_switch_enabled: IntGauge,
pub highest_direct_spam_rate: IntGauge,
pub highest_proxied_spam_rate: IntGauge,
Expand Down Expand Up @@ -90,6 +91,13 @@ impl TrafficControllerMetrics {
registry
)
.unwrap(),
tally_errors: register_int_counter_vec_with_registry!(
"traffic_control_tally_errors",
"Number of tally errors, grouped by error type",
&["error_type"],
registry
)
.unwrap(),
deadmans_switch_enabled: register_int_gauge_with_registry!(
"deadmans_switch_enabled",
"If 1, the deadman's switch is enabled and all traffic control
Expand Down
7 changes: 7 additions & 0 deletions crates/sui-core/src/traffic_controller/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,10 @@ async fn handle_error_tally(
}
let resp = policy.handle_tally(tally.clone());
metrics.error_tally_handled.inc();
metrics
.tally_errors
.with_label_values(&[tally.error_type.as_deref().unwrap_or("unknown")])
.inc();
if let Some(fw_config) = fw_config {
if fw_config.delegate_error_blocking && !mem_drainfile_present {
let client = nodefw_client
Expand Down Expand Up @@ -509,6 +513,7 @@ async fn handle_policy_response(
{
// Only increment the metric if the client was not already blocked
debug!("Blocking client: {:?}", client);
metrics.requests_blocked_at_protocol.inc();
metrics.connection_ip_blocklist_len.inc();
}
}
Expand All @@ -523,6 +528,7 @@ async fn handle_policy_response(
{
// Only increment the metric if the client was not already blocked
debug!("Blocking proxied client: {:?}", client);
metrics.requests_blocked_at_protocol.inc();
metrics.proxy_ip_blocklist_len.inc();
}
}
Expand Down Expand Up @@ -744,6 +750,7 @@ impl TrafficSim {
client,
// TODO add proxy IP for testing
None,
None,
// TODO add weight adjustments
Weight::one(),
Weight::one(),
Expand Down
6 changes: 6 additions & 0 deletions crates/sui-core/src/traffic_controller/policies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ pub struct TrafficTally {
pub direct: Option<IpAddr>,
pub through_fullnode: Option<IpAddr>,
pub error_weight: Weight,
pub error_type: Option<String>,
pub spam_weight: Weight,
pub timestamp: SystemTime,
}
Expand All @@ -231,13 +232,15 @@ impl TrafficTally {
pub fn new(
direct: Option<IpAddr>,
through_fullnode: Option<IpAddr>,
error_type: Option<String>,
error_weight: Weight,
spam_weight: Weight,
) -> Self {
Self {
direct,
through_fullnode,
error_weight,
error_type,
spam_weight,
timestamp: SystemTime::now(),
}
Expand Down Expand Up @@ -516,20 +519,23 @@ mod tests {
direct: Some(IpAddr::V4(Ipv4Addr::new(8, 7, 6, 5))),
through_fullnode: Some(IpAddr::V4(Ipv4Addr::new(1, 2, 3, 4))),
error_weight: Weight::zero(),
error_type: None,
spam_weight: Weight::one(),
timestamp: SystemTime::now(),
};
let bob = TrafficTally {
direct: Some(IpAddr::V4(Ipv4Addr::new(8, 7, 6, 5))),
through_fullnode: Some(IpAddr::V4(Ipv4Addr::new(4, 3, 2, 1))),
error_weight: Weight::zero(),
error_type: None,
spam_weight: Weight::one(),
timestamp: SystemTime::now(),
};
let charlie = TrafficTally {
direct: Some(IpAddr::V4(Ipv4Addr::new(8, 7, 6, 5))),
through_fullnode: Some(IpAddr::V4(Ipv4Addr::new(5, 6, 7, 8))),
error_weight: Weight::zero(),
error_type: None,
spam_weight: Weight::one(),
timestamp: SystemTime::now(),
};
Expand Down
3 changes: 2 additions & 1 deletion crates/sui-json-rpc/src/axum_router.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ fn handle_traffic_resp(
traffic_controller.tally(TrafficTally {
direct: client,
through_fullnode: None,
error_weight: error.map(normalize).unwrap_or(Weight::zero()),
error_weight: error.clone().map(normalize).unwrap_or(Weight::zero()),
// For now, count everything as spam with equal weight
// on the rpc node side, including gas-charging endpoints
// such as `sui_executeTransactionBlock`, as this can enable
Expand All @@ -262,6 +262,7 @@ fn handle_traffic_resp(
// to provide a weight distribution based on the method being called.
spam_weight: Weight::one(),
timestamp: SystemTime::now(),
error_type: error.map(|e| e.to_string()),
});
}

Expand Down

0 comments on commit 7d55405

Please sign in to comment.