Skip to content

Commit

Permalink
Enable OpenTelemetry tracing for usage service
Browse files Browse the repository at this point in the history
added docs, changeset and updated README
  • Loading branch information
dotansimha committed Jan 16, 2025
1 parent cc86bc5 commit aa195c2
Show file tree
Hide file tree
Showing 23 changed files with 668 additions and 513 deletions.
7 changes: 7 additions & 0 deletions .changeset/honest-scissors-live.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
'hive': minor
---

Added a new environment variable `OPENTELEMETRY_TRACE_USAGE_REQUESTS` for `rate-limit` and `tokens` services.

Self-hosters who wish to report telemetry information for the `usage` service can opt in by setting `OPENTELEMETRY_TRACE_USAGE_REQUESTS=1` on these services. This will skip sampling and will always trace requests originating from the `usage` service.
9 changes: 9 additions & 0 deletions .changeset/tender-maps-shout.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
'hive': minor
---

Added OpenTelemetry traces to the Usage service using a new `OPENTELEMETRY_COLLECTOR_ENDPOINT` env var.

This option is disabled by default for self-hosting; you can opt in by setting `OPENTELEMETRY_COLLECTOR_ENDPOINT`.


1 change: 1 addition & 0 deletions deployment/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ const usage = deployUsage({
dbMigrations,
rateLimit,
sentry,
observability,
});

const usageIngestor = deployUsageIngestor({
Expand Down
4 changes: 4 additions & 0 deletions deployment/services/observability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ import { serviceLocalHost } from '../utils/local-endpoint';
import { Observability as ObservabilityInstance } from '../utils/observability';
import { deployGrafana } from './grafana';

// Change this to control OTEL tracing for usage service
const enableTracingForUsageService = true;

export function deployObservability(config: {
envName: string;
/**
Expand Down Expand Up @@ -57,6 +60,7 @@ export function deployObservability(config: {
observability: observabilityInstance,
grafana: useLocal ? undefined : deployGrafana(config.envName, config.tableSuffix),
enabled: true,
enabledForUsageService: enableTracingForUsageService,
};
}

Expand Down
1 change: 1 addition & 0 deletions deployment/services/rate-limit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ export function deployRateLimit({
USAGE_ESTIMATOR_ENDPOINT: serviceLocalEndpoint(usageEstimator.service),
EMAILS_ENDPOINT: serviceLocalEndpoint(emails.service),
WEB_APP_URL: `https://${environment.appDns}/`,
OPENTELEMETRY_TRACE_USAGE_REQUESTS: observability.enabledForUsageService ? '1' : '',
OPENTELEMETRY_COLLECTOR_ENDPOINT:
observability.enabled && observability.tracingEndpoint
? observability.tracingEndpoint
Expand Down
1 change: 1 addition & 0 deletions deployment/services/tokens.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export function deployTokens({
...environment.envVars,
SENTRY: sentry.enabled ? '1' : '0',
HEARTBEAT_ENDPOINT: heartbeat ?? '',
OPENTELEMETRY_TRACE_USAGE_REQUESTS: observability.enabledForUsageService ? '1' : '',
OPENTELEMETRY_COLLECTOR_ENDPOINT:
observability.enabled && observability.tracingEndpoint
? observability.tracingEndpoint
Expand Down
9 changes: 9 additions & 0 deletions deployment/services/usage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { DbMigrations } from './db-migrations';
import { Docker } from './docker';
import { Environment } from './environment';
import { Kafka } from './kafka';
import { Observability } from './observability';
import { RateLimitService } from './rate-limit';
import { Sentry } from './sentry';
import { Tokens } from './tokens';
Expand All @@ -19,8 +20,10 @@ export function deployUsage({
rateLimit,
image,
docker,
observability,
sentry,
}: {
observability: Observability;
image: string;
environment: Environment;
tokens: Tokens;
Expand Down Expand Up @@ -58,6 +61,12 @@ export function deployUsage({
KAFKA_TOPIC: kafka.config.topic,
TOKENS_ENDPOINT: serviceLocalEndpoint(tokens.service),
RATE_LIMIT_ENDPOINT: serviceLocalEndpoint(rateLimit.service),
OPENTELEMETRY_COLLECTOR_ENDPOINT:
observability.enabled &&
observability.enabledForUsageService &&
observability.tracingEndpoint
? observability.tracingEndpoint
: '',
},
exposesMetrics: true,
port: 4000,
Expand Down
6 changes: 3 additions & 3 deletions deployment/utils/observability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,12 @@ export class Observability {
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_health"',
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and IsMatch(attributes["http.url"], ".*/_health") == true',
// Ignore Contour/Envoy traces for /usage requests
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and attributes["http.url"] == "/usage"',
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_usage-service-.*") == true',
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
// 'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and attributes["http.url"] == "/usage"',
// 'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_usage-service-.*") == true',
// Ignore metrics scraping
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/metrics"',
// Ignore webapp HTTP calls
'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
],
},
Expand Down
3 changes: 2 additions & 1 deletion packages/services/rate-limit/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ USAGE_ESTIMATOR_ENDPOINT=http://localhost:4011
EMAILS_ENDPOINT=http://localhost:6260
WEB_APP_URL=http://localhost:3000
OPENTELEMETRY_COLLECTOR_ENDPOINT="<sync>"
LIMIT_CACHE_UPDATE_INTERVAL_MS=2000
LIMIT_CACHE_UPDATE_INTERVAL_MS=2000
OPENTELEMETRY_TRACE_USAGE_REQUESTS=1
41 changes: 21 additions & 20 deletions packages/services/rate-limit/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,24 @@ you don't need this service.

## Configuration

| Name | Required | Description | Example Value |
| ----------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- |
| `PORT` | **Yes** | The HTTP port of the service. | `4012` |
| `LIMIT_CACHE_UPDATE_INTERVAL_MS` | No | The cache update interval limit in milliseconds. | `60_000` |
| `POSTGRES_HOST` | **Yes** | Host of the postgres database | `127.0.0.1` |
| `POSTGRES_PORT` | **Yes** | Port of the postgres database | `5432` |
| `POSTGRES_DB` | **Yes** | Name of the postgres database. | `registry` |
| `POSTGRES_USER` | **Yes** | User name for accessing the postgres database. | `postgres` |
| `POSTGRES_PASSWORD` | **Yes** | Password for accessing the postgres database. | `postgres` |
| `USAGE_ESTIMATOR_ENDPOINT` | **Yes** | The endpoint of the usage estimator service. | `http://127.0.0.1:4011` |
| `EMAILS_ENDPOINT` | No (if not provided no limit emails will be sent.) | The endpoint of the GraphQL Hive Email service. | `http://127.0.0.1:6260` |
| `ENVIRONMENT` | No | The environment of your Hive app. (**Note:** This will be used for Sentry reporting.) | `staging` |
| `SENTRY` | No | Whether Sentry error reporting should be enabled. | `1` (enabled) or `0` (disabled) |
| `SENTRY_DSN` | No | The DSN for reporting errors to Sentry. | `https://dooobars@o557896.ingest.sentry.io/12121212` |
| `PROMETHEUS_METRICS` | No | Whether Prometheus metrics should be enabled | `1` (enabled) or `0` (disabled) |
| `PROMETHEUS_METRICS_LABEL_INSTANCE` | No | The instance label added for the prometheus metrics. | `rate-limit` |
| `WEB_APP_URL` | No | The base url of the web app | `https://your-instance.com` |
| `REQUEST_LOGGING` | No | Log http requests | `1` (enabled) or `0` (disabled) |
| `LOG_LEVEL` | No | The verbosity of the service logs. One of `trace`, `debug`, `info`, `warn` ,`error`, `fatal` or `silent` | `info` (default) |
| `OPENTELEMETRY_COLLECTOR_ENDPOINT` | No | OpenTelemetry Collector endpoint. The expected traces transport is HTTP (port `4318`). | `http://localhost:4318/v1/traces` |
| Name | Required | Description | Example Value |
| ------------------------------------ | -------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- |
| `PORT` | **Yes** | The HTTP port of the service. | `4012` |
| `LIMIT_CACHE_UPDATE_INTERVAL_MS` | No | The cache update interval limit in milliseconds. | `60_000` |
| `POSTGRES_HOST` | **Yes** | Host of the postgres database | `127.0.0.1` |
| `POSTGRES_PORT` | **Yes** | Port of the postgres database | `5432` |
| `POSTGRES_DB` | **Yes** | Name of the postgres database. | `registry` |
| `POSTGRES_USER` | **Yes** | User name for accessing the postgres database. | `postgres` |
| `POSTGRES_PASSWORD` | **Yes** | Password for accessing the postgres database. | `postgres` |
| `USAGE_ESTIMATOR_ENDPOINT` | **Yes** | The endpoint of the usage estimator service. | `http://127.0.0.1:4011` |
| `EMAILS_ENDPOINT` | No (if not provided no limit emails will be sent.) | The endpoint of the GraphQL Hive Email service. | `http://127.0.0.1:6260` |
| `ENVIRONMENT` | No | The environment of your Hive app. (**Note:** This will be used for Sentry reporting.) | `staging` |
| `SENTRY` | No | Whether Sentry error reporting should be enabled. | `1` (enabled) or `0` (disabled) |
| `SENTRY_DSN` | No | The DSN for reporting errors to Sentry. | `https://dooobars@o557896.ingest.sentry.io/12121212` |
| `PROMETHEUS_METRICS` | No | Whether Prometheus metrics should be enabled | `1` (enabled) or `0` (disabled) |
| `PROMETHEUS_METRICS_LABEL_INSTANCE` | No | The instance label added for the prometheus metrics. | `rate-limit` |
| `WEB_APP_URL` | No | The base url of the web app | `https://your-instance.com` |
| `REQUEST_LOGGING` | No | Log http requests | `1` (enabled) or `0` (disabled) |
| `LOG_LEVEL` | No | The verbosity of the service logs. One of `trace`, `debug`, `info`, `warn` ,`error`, `fatal` or `silent` | `info` (default) |
| `OPENTELEMETRY_COLLECTOR_ENDPOINT` | No | OpenTelemetry Collector endpoint. The expected traces transport is HTTP (port `4318`). | `http://localhost:4318/v1/traces` |
| `OPENTELEMETRY_TRACE_USAGE_REQUESTS` | No | If enabled, requests sent to this service from the `usage` service will be traced with OpenTelemetry. | `1` (enabled) or empty/unset (disabled) |
10 changes: 8 additions & 2 deletions packages/services/rate-limit/src/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,13 @@ const configs = {
prometheus: PrometheusModel.safeParse(process.env),
// eslint-disable-next-line no-process-env
log: LogModel.safeParse(process.env),
// eslint-disable-next-line no-process-env
tracing: OpenTelemetryConfigurationModel.safeParse(process.env),
tracing: zod
.object({
...OpenTelemetryConfigurationModel.shape,
OPENTELEMETRY_TRACE_USAGE_REQUESTS: emptyString(zod.literal('1').optional()),
})
// eslint-disable-next-line no-process-env
.safeParse(process.env),
};

const environmentErrors: Array<string> = [];
Expand Down Expand Up @@ -129,6 +134,7 @@ export const env = {
tracing: {
enabled: !!tracing.OPENTELEMETRY_COLLECTOR_ENDPOINT,
collectorEndpoint: tracing.OPENTELEMETRY_COLLECTOR_ENDPOINT,
traceRequestsFromUsageService: tracing.OPENTELEMETRY_TRACE_USAGE_REQUESTS === '1',
},
postgres: {
host: postgres.POSTGRES_HOST,
Expand Down
7 changes: 5 additions & 2 deletions packages/services/rate-limit/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ async function main() {
tracing = configureTracing({
collectorEndpoint: env.tracing.collectorEndpoint,
serviceName: 'rate-limit',
sampler(ctx, traceId, spanName, spanKind, attributes) {
if (attributes['requesting.service'] === 'usage') {
sampler: (ctx, traceId, spanName, spanKind, attributes) => {
if (
attributes['requesting.service'] === 'usage' &&
!env.tracing.traceRequestsFromUsageService
) {
return {
decision: SamplingDecision.NOT_RECORD,
};
Expand Down
3 changes: 2 additions & 1 deletion packages/services/tokens/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ REDIS_PORT="6379"
REDIS_PASSWORD=""
PORT=6001
OPENTELEMETRY_COLLECTOR_ENDPOINT="<sync>"
LOG_LEVEL="debug"
LOG_LEVEL="debug"
OPENTELEMETRY_TRACE_USAGE_REQUESTS=1
Loading

0 comments on commit aa195c2

Please sign in to comment.