diff --git a/cmd/bee/cmd/cmd.go b/cmd/bee/cmd/cmd.go index 0fe00baa922..212fcc41483 100644 --- a/cmd/bee/cmd/cmd.go +++ b/cmd/bee/cmd/cmd.go @@ -40,10 +40,10 @@ const ( optionWelcomeMessage = "welcome-message" optionCORSAllowedOrigins = "cors-allowed-origins" optionNameTracingEnabled = "tracing-enable" - optionNameTracingOTLPEndpoint = "tracing-otlp-endpoint" - optionNameTracingOTLPInsecure = "tracing-otlp-insecure" - optionNameTracingOTLPCAFile = "tracing-otlp-ca-file" - optionNameTracingOTLPProtocol = "tracing-otlp-protocol" + optionNameTracingEndpoint = "tracing-endpoint" + optionNameTracingInsecure = "tracing-insecure" + optionNameTracingCAFile = "tracing-ca-file" + optionNameTracingProtocol = "tracing-protocol" optionNameTracingSamplingRatio = "tracing-sampling-ratio" optionNameTracingServiceName = "tracing-service-name" optionNameVerbosity = "verbosity" @@ -108,10 +108,10 @@ const ( // tracing configKeyTracingEnabled = "tracing.enable" - configKeyTracingOTLPEndpoint = "tracing.otlp-endpoint" - configKeyTracingOTLPInsecure = "tracing.otlp-insecure" - configKeyTracingOTLPCAFile = "tracing.otlp-ca-file" - configKeyTracingOTLPProtocol = "tracing.otlp-protocol" + configKeyTracingEndpoint = "tracing.endpoint" + configKeyTracingInsecure = "tracing.insecure" + configKeyTracingCAFile = "tracing.ca-file" + configKeyTracingProtocol = "tracing.protocol" configKeyTracingSamplingRatio = "tracing.sampling-ratio" configKeyTracingServiceName = "tracing.service-name" ) @@ -126,10 +126,10 @@ var blockchainRpcConfigPairs = []struct{ flat, dotted string }{ var tracingConfigPairs = []struct{ flat, dotted string }{ {optionNameTracingEnabled, configKeyTracingEnabled}, - {optionNameTracingOTLPEndpoint, configKeyTracingOTLPEndpoint}, - {optionNameTracingOTLPInsecure, configKeyTracingOTLPInsecure}, - {optionNameTracingOTLPCAFile, configKeyTracingOTLPCAFile}, - {optionNameTracingOTLPProtocol, configKeyTracingOTLPProtocol}, + {optionNameTracingEndpoint, configKeyTracingEndpoint}, + {optionNameTracingInsecure, configKeyTracingInsecure}, + {optionNameTracingCAFile, configKeyTracingCAFile}, + {optionNameTracingProtocol, configKeyTracingProtocol}, {optionNameTracingSamplingRatio, configKeyTracingSamplingRatio}, {optionNameTracingServiceName, configKeyTracingServiceName}, } @@ -137,17 +137,19 @@ var tracingConfigPairs = []struct{ flat, dotted string }{ // Deprecated tracing options, removed in the OpenTelemetry migration. They are // kept only as hidden no-op flags so that existing configs do not break node // startup; their values are ignored (see deprecatedTracingKeys). +// +// Note: tracing-endpoint is intentionally NOT here — it is reused as the active +// OTLP endpoint flag (see optionNameTracingEndpoint). Its meaning changed from a +// Jaeger agent address to an OTLP collector endpoint. const ( - optionNameTracingEndpoint = "tracing-endpoint" - optionNameTracingHost = "tracing-host" - optionNameTracingPort = "tracing-port" + optionNameTracingHost = "tracing-host" + optionNameTracingPort = "tracing-port" ) // deprecatedTracingKeys are the pre-OpenTelemetry tracing options. They are // registered as hidden no-op flags so stale configs still start the node, but -// their values are ignored; operators should migrate to the tracing-otlp-* keys. +// their values are ignored; operators should migrate to the tracing-* keys. var deprecatedTracingKeys = []string{ - optionNameTracingEndpoint, optionNameTracingHost, optionNameTracingPort, } @@ -326,17 +328,17 @@ func (c *command) setAllFlags(cmd *cobra.Command) { cmd.Flags().Uint64(optionNameNetworkID, chaincfg.Mainnet.NetworkID, "ID of the Swarm network") cmd.Flags().StringSlice(optionCORSAllowedOrigins, []string{}, "origins with CORS headers enabled") cmd.Flags().Bool(optionNameTracingEnabled, false, "enable tracing") - cmd.Flags().String(optionNameTracingOTLPEndpoint, "127.0.0.1:4318", "OTLP endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc") - cmd.Flags().Bool(optionNameTracingOTLPInsecure, false, "disable TLS for the OTLP exporter (useful for a local collector); when false, set --tracing-otlp-ca-file to verify the collector certificate against a private CA") - cmd.Flags().String(optionNameTracingOTLPCAFile, "", "path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when --tracing-otlp-insecure=true") - cmd.Flags().String(optionNameTracingOTLPProtocol, "http", "OTLP exporter transport: http or grpc") + cmd.Flags().String(optionNameTracingEndpoint, "127.0.0.1:4318", "OTLP collector endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc") + cmd.Flags().Bool(optionNameTracingInsecure, false, "disable TLS for the OTLP exporter (useful for a local collector); when false, set --tracing-ca-file to verify the collector certificate against a private CA") + cmd.Flags().String(optionNameTracingCAFile, "", "path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when --tracing-insecure=true") + cmd.Flags().String(optionNameTracingProtocol, "http", "OTLP exporter transport: http or grpc") cmd.Flags().Float64(optionNameTracingSamplingRatio, 1.0, "head-based sampling ratio in [0,1]; 0 samples nothing, 1 samples everything") cmd.Flags().String(optionNameTracingServiceName, "bee", "service name identifier for tracing") // Deprecated, no-op tracing flags kept for backward compatibility so that // existing configs do not break node startup. Their values are ignored. for _, name := range deprecatedTracingKeys { - cmd.Flags().String(name, "", "deprecated and ignored; use the tracing-otlp-* options") - _ = cmd.Flags().MarkDeprecated(name, "no longer used after the OpenTelemetry migration; use the tracing-otlp-* options") + cmd.Flags().String(name, "", "deprecated and ignored; use the tracing-* options") + _ = cmd.Flags().MarkDeprecated(name, "no longer used after the OpenTelemetry migration; use the tracing-* options") } cmd.Flags().String(optionNameVerbosity, "info", "log verbosity level 0=silent, 1=error, 2=warn, 3=info, 4=debug, 5=trace") cmd.Flags().String(optionWelcomeMessage, "", "send a welcome message string during handshakes") @@ -423,8 +425,8 @@ func (c *command) bindBlockchainRpcConfig(cmd *cobra.Command) { c.bindNestedConfig(cmd, blockchainRpcConfigPairs) } -// bindTracingConfig supports both flat (tracing-otlp-endpoint) and nested -// (tracing.otlp-endpoint) YAML forms, with nested taking precedence. +// bindTracingConfig supports both flat (tracing-endpoint) and nested +// (tracing.endpoint) YAML forms, with nested taking precedence. func (c *command) bindTracingConfig(cmd *cobra.Command) { c.bindNestedConfig(cmd, tracingConfigPairs) } @@ -436,7 +438,7 @@ func (c *command) bindTracingConfig(cmd *cobra.Command) { func (c *command) warnDeprecatedTracingKeys() { for _, key := range deprecatedTracingKeys { if c.config.InConfig(key) { - c.logger.Warning("deprecated tracing config key is ignored; use the tracing-otlp-* options", "key", key) + c.logger.Warning("deprecated tracing config key is ignored; use the tracing-* options", "key", key) } } } @@ -445,9 +447,9 @@ func (c *command) warnDeprecatedTracingKeys() { // CA bundle, in which case the OTLP exporter falls back to the system root CAs. func (c *command) warnTracingTLSWithoutCA() { if c.config.GetBool(configKeyTracingEnabled) && - !c.config.GetBool(configKeyTracingOTLPInsecure) && - c.config.GetString(configKeyTracingOTLPCAFile) == "" { - c.logger.Warning("tracing: TLS is enabled but no CA bundle is configured; the OTLP exporter will rely on the system root CAs. Provide --tracing-otlp-ca-file, set --tracing-otlp-insecure=true for a plaintext local collector, or disable tracing.") + !c.config.GetBool(configKeyTracingInsecure) && + c.config.GetString(configKeyTracingCAFile) == "" { + c.logger.Warning("tracing: TLS is enabled but no CA bundle is configured; the OTLP exporter will rely on the system root CAs. Provide --tracing-ca-file, set --tracing-insecure=true for a plaintext local collector, or disable tracing.") } } diff --git a/cmd/bee/cmd/start.go b/cmd/bee/cmd/start.go index 4abcd178341..69ef9120835 100644 --- a/cmd/bee/cmd/start.go +++ b/cmd/bee/cmd/start.go @@ -343,10 +343,10 @@ func buildBeeNode(ctx context.Context, c *command, cmd *cobra.Command, logger lo SwapInitialDeposit: c.config.GetString(optionNameSwapInitialDeposit), TargetNeighborhood: c.config.GetString(optionNameTargetNeighborhood), TracingEnabled: c.config.GetBool(configKeyTracingEnabled), - TracingEndpoint: c.config.GetString(configKeyTracingOTLPEndpoint), - TracingInsecure: c.config.GetBool(configKeyTracingOTLPInsecure), - TracingCAFile: c.config.GetString(configKeyTracingOTLPCAFile), - TracingProtocol: c.config.GetString(configKeyTracingOTLPProtocol), + TracingEndpoint: c.config.GetString(configKeyTracingEndpoint), + TracingInsecure: c.config.GetBool(configKeyTracingInsecure), + TracingCAFile: c.config.GetString(configKeyTracingCAFile), + TracingProtocol: c.config.GetString(configKeyTracingProtocol), TracingSamplingRatio: c.config.GetFloat64(configKeyTracingSamplingRatio), TracingServiceName: c.config.GetString(configKeyTracingServiceName), TrxDebugMode: c.config.GetBool(optionNameTransactionDebugMode), diff --git a/packaging/bee.yaml b/packaging/bee.yaml index 27464575cad..69e70e37371 100644 --- a/packaging/bee.yaml +++ b/packaging/bee.yaml @@ -111,14 +111,14 @@ password-file: "/var/lib/bee/password" # tracing: # ## enable tracing # enable: false -# ## OTLP endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc -# otlp-endpoint: 127.0.0.1:4318 +# ## OTLP collector endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc +# endpoint: 127.0.0.1:4318 # ## disable TLS for the OTLP exporter (useful for a local collector) -# otlp-insecure: false -# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when otlp-insecure is true -# otlp-ca-file: "" +# insecure: false +# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when insecure is true +# ca-file: "" # ## OTLP exporter transport: http or grpc -# otlp-protocol: http +# protocol: http # ## head-based sampling ratio in [0,1]; 0 samples nothing, 1 samples everything # sampling-ratio: 1.0 # ## service name identifier for tracing diff --git a/packaging/homebrew-amd64/bee.yaml b/packaging/homebrew-amd64/bee.yaml index b08681d8af4..9230233ec99 100644 --- a/packaging/homebrew-amd64/bee.yaml +++ b/packaging/homebrew-amd64/bee.yaml @@ -111,14 +111,14 @@ password-file: "/usr/local/var/lib/swarm-bee/password" # tracing: # ## enable tracing # enable: false -# ## OTLP endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc -# otlp-endpoint: 127.0.0.1:4318 +# ## OTLP collector endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc +# endpoint: 127.0.0.1:4318 # ## disable TLS for the OTLP exporter (useful for a local collector) -# otlp-insecure: false -# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when otlp-insecure is true -# otlp-ca-file: "" +# insecure: false +# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when insecure is true +# ca-file: "" # ## OTLP exporter transport: http or grpc -# otlp-protocol: http +# protocol: http # ## head-based sampling ratio in [0,1]; 0 samples nothing, 1 samples everything # sampling-ratio: 1.0 # ## service name identifier for tracing diff --git a/packaging/homebrew-arm64/bee.yaml b/packaging/homebrew-arm64/bee.yaml index 8d796846ec9..721ffc0ab3a 100644 --- a/packaging/homebrew-arm64/bee.yaml +++ b/packaging/homebrew-arm64/bee.yaml @@ -111,14 +111,14 @@ password-file: "/opt/homebrew/var/lib/swarm-bee/password" # tracing: # ## enable tracing # enable: false -# ## OTLP endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc -# otlp-endpoint: 127.0.0.1:4318 +# ## OTLP collector endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc +# endpoint: 127.0.0.1:4318 # ## disable TLS for the OTLP exporter (useful for a local collector) -# otlp-insecure: false -# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when otlp-insecure is true -# otlp-ca-file: "" +# insecure: false +# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when insecure is true +# ca-file: "" # ## OTLP exporter transport: http or grpc -# otlp-protocol: http +# protocol: http # ## head-based sampling ratio in [0,1]; 0 samples nothing, 1 samples everything # sampling-ratio: 1.0 # ## service name identifier for tracing diff --git a/packaging/scoop/bee.yaml b/packaging/scoop/bee.yaml index 4029de2ad0d..88f8f0e4bf0 100644 --- a/packaging/scoop/bee.yaml +++ b/packaging/scoop/bee.yaml @@ -111,14 +111,14 @@ password-file: "./password" # tracing: # ## enable tracing # enable: false -# ## OTLP endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc -# otlp-endpoint: 127.0.0.1:4318 +# ## OTLP collector endpoint to send tracing data (host:port); default port is 4318 for http, 4317 for grpc +# endpoint: 127.0.0.1:4318 # ## disable TLS for the OTLP exporter (useful for a local collector) -# otlp-insecure: false -# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when otlp-insecure is true -# otlp-ca-file: "" +# insecure: false +# ## path to a PEM-encoded CA bundle used to verify the OTLP collector certificate; ignored when insecure is true +# ca-file: "" # ## OTLP exporter transport: http or grpc -# otlp-protocol: http +# protocol: http # ## head-based sampling ratio in [0,1]; 0 samples nothing, 1 samples everything # sampling-ratio: 1.0 # ## service name identifier for tracing diff --git a/pkg/api/api.go b/pkg/api/api.go index 83be07ecaeb..28586141e75 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -63,6 +63,9 @@ import ( "github.com/gorilla/mux" "github.com/hashicorp/go-multierror" "github.com/prometheus/client_golang/prometheus" + "go.opentelemetry.io/otel/codes" + semconv "go.opentelemetry.io/otel/semconv/v1.25.0" + "go.opentelemetry.io/otel/trace" "golang.org/x/sync/semaphore" ) @@ -462,7 +465,7 @@ func (s *Service) newTracingHandler(spanName string) func(h http.Handler) http.H // ignore } - span, _, ctx := s.tracer.StartSpanFromContext(ctx, spanName, s.logger) + span, _, ctx := s.tracer.StartSpanFromContext(ctx, spanName, s.logger, trace.WithSpanKind(trace.SpanKindServer)) defer span.End() err = s.tracer.AddContextHTTPHeader(ctx, r.Header) @@ -472,6 +475,23 @@ func (s *Service) newTracingHandler(spanName string) func(h http.Handler) http.H } h.ServeHTTP(w, r.WithContext(ctx)) + + // The response status code is captured upstream by responseCodeMetricsHandler's + // *responseWriter, which exposes Status(). Read it to annotate the span with the + // standard HTTP server attributes so the request is queryable in Tempo/Grafana. + if sw, ok := w.(interface{ Status() int }); ok { + code := sw.Status() + span.SetAttributes( + semconv.HTTPRequestMethodKey.String(r.Method), + semconv.HTTPRoute(spanName), + semconv.HTTPResponseStatusCode(code), + ) + // OTel server-span convention: 5xx marks the span as Error; 4xx is a client + // error and stays Unset, still queryable via http.response.status_code. + if code >= 500 { + span.SetStatus(codes.Error, http.StatusText(code)) + } + } }) } } diff --git a/pkg/api/api_test.go b/pkg/api/api_test.go index babd816dd06..23782bcbc6b 100644 --- a/pkg/api/api_test.go +++ b/pkg/api/api_test.go @@ -90,6 +90,7 @@ func init() { type testServerOptions struct { Storer api.Storer + Tracer *tracing.Tracer StateStorer storage.StateStorer Resolver resolver.Interface Pss pss.Interface @@ -231,12 +232,16 @@ func newTestServer(t *testing.T, o testServerOptions) (*http.Client, *websocket. s.SetSwarmAddress(&o.Overlay) s.SetProbe(o.Probe) - noOpTracer, tracerCloser, _ := tracing.NewTracer(&tracing.Options{ - Enabled: false, - }) - testutil.CleanupCloser(t, tracerCloser) + tracer := o.Tracer + if tracer == nil { + noOpTracer, tracerCloser, _ := tracing.NewTracer(&tracing.Options{ + Enabled: false, + }) + testutil.CleanupCloser(t, tracerCloser) + tracer = noOpTracer + } - s.Configure(signer, noOpTracer, api.Options{ + s.Configure(signer, tracer, api.Options{ CORSAllowedOrigins: o.CORSAllowedOrigins, WsPingPeriod: o.WsPingPeriod, }, extraOpts, 1, erc20APIService) diff --git a/pkg/api/bytes.go b/pkg/api/bytes.go index 0855c3fba54..c8e0b318037 100644 --- a/pkg/api/bytes.go +++ b/pkg/api/bytes.go @@ -30,7 +30,7 @@ type bytesPostResponse struct { // bytesUploadHandler handles upload of raw binary data of arbitrary length. func (s *Service) bytesUploadHandler(w http.ResponseWriter, r *http.Request) { - span, logger, ctx := s.tracer.StartSpanFromContext(r.Context(), "post_bytes", s.logger.WithName("post_bytes").Build()) + span, logger, ctx := s.tracer.StartSpanFromContext(r.Context(), "bytes-post", s.logger.WithName("post_bytes").Build()) defer span.End() headers := struct { @@ -73,7 +73,7 @@ func (s *Service) bytesUploadHandler(w http.ResponseWriter, r *http.Request) { tracing.RecordError(span, err, attribute.String("action", "tag.create")) return } - span.SetAttributes(attribute.Int64("tagID", int64(tag))) + span.SetAttributes(attribute.Int64("tag_id", int64(tag))) } defer s.observeUploadSpeed(w, r, time.Now(), "bytes", deferred) diff --git a/pkg/api/bzz.go b/pkg/api/bzz.go index 9f62e6f4895..141b3b55d24 100644 --- a/pkg/api/bzz.go +++ b/pkg/api/bzz.go @@ -65,7 +65,7 @@ func lookaheadBufferSize(size int64) int { } func (s *Service) bzzUploadHandler(w http.ResponseWriter, r *http.Request) { - span, logger, ctx := s.tracer.StartSpanFromContext(r.Context(), "post_bzz", s.logger.WithName("post_bzz").Build()) + span, logger, ctx := s.tracer.StartSpanFromContext(r.Context(), "bzz-post", s.logger.WithName("post_bzz").Build()) defer span.End() headers := struct { @@ -107,7 +107,7 @@ func (s *Service) bzzUploadHandler(w http.ResponseWriter, r *http.Request) { tracing.RecordError(span, err, attribute.String("action", "tag.create")) return } - span.SetAttributes(attribute.Int64("tagID", int64(tag))) + span.SetAttributes(attribute.Int64("tag_id", int64(tag))) } putter, err := s.newStamperPutter(ctx, putterOptions{ @@ -340,7 +340,7 @@ func (s *Service) fileUploadHandler( if tagID != 0 { w.Header().Set(SwarmTagHeader, fmt.Sprint(tagID)) - span.SetAttributes(attribute.Int64("tagID", int64(tagID))) + span.SetAttributes(attribute.Int64("tag_id", int64(tagID))) } w.Header().Set(ETagHeader, fmt.Sprintf("%q", reference.String())) w.Header().Set(AccessControlExposeHeaders, SwarmTagHeader) diff --git a/pkg/api/tracing_test.go b/pkg/api/tracing_test.go new file mode 100644 index 00000000000..f07fb5565c7 --- /dev/null +++ b/pkg/api/tracing_test.go @@ -0,0 +1,153 @@ +// Copyright 2026 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package api_test + +import ( + "bytes" + "context" + "net/http" + "testing" + "time" + + "github.com/ethersphere/bee/v2/pkg/api" + "github.com/ethersphere/bee/v2/pkg/jsonhttp" + "github.com/ethersphere/bee/v2/pkg/jsonhttp/jsonhttptest" + "github.com/ethersphere/bee/v2/pkg/log" + mockpost "github.com/ethersphere/bee/v2/pkg/postage/mock" + "github.com/ethersphere/bee/v2/pkg/spinlock" + mockstorer "github.com/ethersphere/bee/v2/pkg/storer/mock" + "github.com/ethersphere/bee/v2/pkg/swarm" + "github.com/ethersphere/bee/v2/pkg/tracing" + "gitlab.com/nolash/go-mockbytes" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace/tracetest" + semconv "go.opentelemetry.io/otel/semconv/v1.25.0" + "go.opentelemetry.io/otel/trace" +) + +// TestTracingHTTPSpan verifies that the HTTP tracing middleware annotates the +// span with the standard server attributes (method, route, status code), marks +// it as a server span, and maps a 5xx response to an Error status while a 2xx +// response leaves the status Unset. These are the fields used to query spans in +// Tempo/Grafana. +func TestTracingHTTPSpan(t *testing.T) { + t.Parallel() + + const resource = "/bytes" + + sr := tracetest.NewSpanRecorder() + tp := sdktrace.NewTracerProvider(sdktrace.WithSpanProcessor(sr)) + t.Cleanup(func() { _ = tp.Shutdown(context.Background()) }) + + storerMock := mockstorer.New() + client, _, _, _ := newTestServer(t, testServerOptions{ + Storer: storerMock, + Tracer: tracing.NewTracerFromProvider(tp), + Logger: log.Noop, + Post: mockpost.New(mockpost.WithAcceptAll()), + }) + + // Upload some content so it can be downloaded with a 200 OK. + g := mockbytes.New(0, mockbytes.MockTypeStandard).WithModulus(255) + content, err := g.SequentialBytes(swarm.ChunkSize * 2) + if err != nil { + t.Fatal(err) + } + + var res api.BytesPostResponse + jsonhttptest.Request(t, client, http.MethodPost, resource, http.StatusCreated, + jsonhttptest.WithRequestHeader(api.SwarmDeferredUploadHeader, "true"), + jsonhttptest.WithRequestHeader(api.SwarmPostageBatchIdHeader, batchOkStr), + jsonhttptest.WithRequestBody(bytes.NewReader(content)), + jsonhttptest.WithUnmarshalJSONResponse(&res), + ) + + // Successful download -> 200 OK. + jsonhttptest.Request(t, client, http.MethodGet, resource+"/"+res.Reference.String(), http.StatusOK, + jsonhttptest.WithExpectedResponse(content), + ) + + // Invalid address triggers an internal error -> 500. + jsonhttptest.Request(t, client, http.MethodGet, resource+"/abcd", http.StatusInternalServerError, + jsonhttptest.WithExpectedJSONResponse(jsonhttp.StatusResponse{ + Message: "joiner failed", + Code: http.StatusInternalServerError, + }), + ) + + // span.End() runs in the middleware's deferred call, which can fire after the + // client has already received the response, so wait for both download spans. + var spans []sdktrace.ReadOnlySpan + if err := spinlock.Wait(time.Second, func() bool { + spans = endedSpansByName(sr, "bytes-download") + return len(spans) == 2 + }); err != nil { + t.Fatalf("expected 2 bytes-download spans, got %d", len(spans)) + } + + var ok200, err500 sdktrace.ReadOnlySpan + for _, s := range spans { + switch spanStatusCode(s) { + case http.StatusOK: + ok200 = s + case http.StatusInternalServerError: + err500 = s + } + } + if ok200 == nil || err500 == nil { + t.Fatalf("missing spans by status code: have ok=%t err=%t", ok200 != nil, err500 != nil) + } + + // Both spans carry the standard HTTP server attributes and are server-kind. + for _, s := range []sdktrace.ReadOnlySpan{ok200, err500} { + if s.SpanKind() != trace.SpanKindServer { + t.Errorf("span kind = %v, want server", s.SpanKind()) + } + if got := spanAttrString(s, semconv.HTTPRequestMethodKey); got != http.MethodGet { + t.Errorf("http.request.method = %q, want %q", got, http.MethodGet) + } + if got := spanAttrString(s, semconv.HTTPRouteKey); got != "bytes-download" { + t.Errorf("http.route = %q, want %q", got, "bytes-download") + } + } + + // A 5xx response marks the span as Error; a 2xx response stays Unset. + if got := err500.Status().Code; got != codes.Error { + t.Errorf("500 span status = %v, want %v", got, codes.Error) + } + if got := ok200.Status().Code; got != codes.Unset { + t.Errorf("200 span status = %v, want %v", got, codes.Unset) + } +} + +func endedSpansByName(sr *tracetest.SpanRecorder, name string) []sdktrace.ReadOnlySpan { + var out []sdktrace.ReadOnlySpan + for _, s := range sr.Ended() { + if s.Name() == name { + out = append(out, s) + } + } + return out +} + +func spanStatusCode(s sdktrace.ReadOnlySpan) int { + for _, a := range s.Attributes() { + if a.Key == semconv.HTTPResponseStatusCodeKey { + return int(a.Value.AsInt64()) + } + } + return 0 +} + +func spanAttrString(s sdktrace.ReadOnlySpan, key attribute.Key) string { + for _, a := range s.Attributes() { + if a.Key == key { + return a.Value.AsString() + } + } + return "" +} diff --git a/pkg/node/node.go b/pkg/node/node.go index afcf55e3cf3..6b14a2c4112 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -26,6 +26,7 @@ import ( "time" "github.com/ethereum/go-ethereum/common" + bee "github.com/ethersphere/bee/v2" "github.com/ethersphere/bee/v2/pkg/accesscontrol" "github.com/ethersphere/bee/v2/pkg/accounting" "github.com/ethersphere/bee/v2/pkg/addressbook" @@ -219,6 +220,19 @@ const ( maxAllowedDoubling = 1 ) +// tracingEnvironment maps a network id to the deployment.environment trace +// attribute. Unknown ids are reported as "private". +func tracingEnvironment(networkID uint64) string { + switch networkID { + case config.Mainnet.NetworkID: + return "mainnet" + case config.Testnet.NetworkID: + return "testnet" + default: + return "private" + } +} + func NewBee( ctx context.Context, addr string, @@ -237,19 +251,6 @@ func NewBee( nodeMetrics := newMetrics() - tracer, tracerCloser, err := tracing.NewTracer(&tracing.Options{ - Enabled: o.TracingEnabled, - Endpoint: o.TracingEndpoint, - Insecure: o.TracingInsecure, - CAFile: o.TracingCAFile, - Protocol: o.TracingProtocol, - SamplingRatio: o.TracingSamplingRatio, - ServiceName: o.TracingServiceName, - }) - if err != nil { - return nil, fmt.Errorf("tracer: %w", err) - } - if err := validatePublicAddress(o.NATAddr); err != nil { return nil, fmt.Errorf("invalid NAT address %s: %w", o.NATAddr, err) } @@ -283,7 +284,6 @@ func NewBee( logger: logger, ctxCancel: ctxCancel, errorLogWriter: sink, - tracerCloser: tracerCloser, syncingStopped: syncutil.NewSignaler(), } @@ -394,6 +394,24 @@ func NewBee( return nil, fmt.Errorf("check overlay address: %w", err) } + tracer, tracerCloser, err := tracing.NewTracer(&tracing.Options{ + Enabled: o.TracingEnabled, + Endpoint: o.TracingEndpoint, + Insecure: o.TracingInsecure, + CAFile: o.TracingCAFile, + Protocol: o.TracingProtocol, + SamplingRatio: o.TracingSamplingRatio, + ServiceName: o.TracingServiceName, + ServiceVersion: bee.Version, + Environment: tracingEnvironment(networkID), + InstanceID: swarmAddress.String(), + Logger: logger, + }) + if err != nil { + return nil, fmt.Errorf("tracer: %w", err) + } + b.tracerCloser = tracerCloser + var ( chequebookService chequebook.Service = new(noOpChequebookService) chequeStore chequebook.ChequeStore diff --git a/pkg/pingpong/pingpong.go b/pkg/pingpong/pingpong.go index 7c90fe04dac..858ba4d4f39 100644 --- a/pkg/pingpong/pingpong.go +++ b/pkg/pingpong/pingpong.go @@ -19,6 +19,7 @@ import ( "github.com/ethersphere/bee/v2/pkg/pingpong/pb" "github.com/ethersphere/bee/v2/pkg/swarm" "github.com/ethersphere/bee/v2/pkg/tracing" + "go.opentelemetry.io/otel/attribute" ) // loggerName is the tree path name of the logger for this package. @@ -65,6 +66,7 @@ func (s *Service) Protocol() p2p.ProtocolSpec { func (s *Service) Ping(ctx context.Context, address swarm.Address, msgs ...string) (rtt time.Duration, err error) { span, _, ctx := s.tracer.StartSpanFromContext(ctx, "pingpong-p2p-ping", s.logger) + span.SetAttributes(attribute.String("peer_address", address.String())) defer span.End() start := time.Now() @@ -104,6 +106,7 @@ func (s *Service) handler(ctx context.Context, p p2p.Peer, stream p2p.Stream) er defer stream.FullClose() span, _, ctx := s.tracer.StartSpanFromContext(ctx, "pingpong-p2p-handler", s.logger) + span.SetAttributes(attribute.String("peer_address", p.Address.String())) defer span.End() var ping pb.Ping diff --git a/pkg/pushsync/pushsync.go b/pkg/pushsync/pushsync.go index 59a5e842afe..f11dfd55f9c 100644 --- a/pkg/pushsync/pushsync.go +++ b/pkg/pushsync/pushsync.go @@ -209,7 +209,7 @@ func (ps *PushSync) handler(ctx context.Context, p p2p.Peer, stream p2p.Stream) span, _, ctx := ps.tracer.StartSpanFromContext(ctx, "pushsync-handler", ps.logger, trace.WithAttributes( attribute.String("address", chunkAddress.String()), - attribute.Int64("tagID", int64(chunk.TagID())), + attribute.Int64("tag_id", int64(chunk.TagID())), attribute.String("sender_address", p.Address.String()), )) diff --git a/pkg/storer/netstore.go b/pkg/storer/netstore.go index 1ec9e199fcf..f9e43ae59c4 100644 --- a/pkg/storer/netstore.go +++ b/pkg/storer/netstore.go @@ -32,6 +32,7 @@ func (db *DB) DirectUpload() PutterSession { defer func() { <-db.directUploadLimiter }() span, logger, ctx := db.tracer.FollowSpanFromContext(ctx, "put-direct-upload", db.logger) + span.SetAttributes(attribute.String("address", ch.Address().String())) defer func() { if err != nil { tracing.RecordError(span, err) @@ -83,6 +84,7 @@ func (db *DB) Download(cache bool) storage.Getter { return getterWithMetrics{ storage.GetterFunc(func(ctx context.Context, address swarm.Address) (ch swarm.Chunk, err error) { span, logger, ctx := db.tracer.StartSpanFromContext(ctx, "get-chunk", db.logger) + span.SetAttributes(attribute.String("address", address.String())) defer func() { if err != nil { tracing.RecordError(span, err) diff --git a/pkg/tracing/export_test.go b/pkg/tracing/export_test.go index 6a1c05f3546..d39d46ca0fe 100644 --- a/pkg/tracing/export_test.go +++ b/pkg/tracing/export_test.go @@ -8,4 +8,7 @@ const ( LogField = logField ) -var LoadCAFile = loadCAFile +var ( + LoadCAFile = loadCAFile + NewResource = newResource +) diff --git a/pkg/tracing/resource_test.go b/pkg/tracing/resource_test.go new file mode 100644 index 00000000000..84fe283b2c5 --- /dev/null +++ b/pkg/tracing/resource_test.go @@ -0,0 +1,112 @@ +// Copyright 2026 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tracing_test + +import ( + "testing" + + "github.com/ethersphere/bee/v2/pkg/tracing" + "go.opentelemetry.io/otel/attribute" + semconv "go.opentelemetry.io/otel/semconv/v1.25.0" +) + +// attrValue returns the string value of key in res, or ("", false) when absent. +func attrValue(t *testing.T, res interface{ Set() *attribute.Set }, key attribute.Key) (string, bool) { + t.Helper() + + v, ok := res.Set().Value(key) + if !ok { + return "", false + } + return v.AsString(), true +} + +func TestNewResource(t *testing.T) { + // Not parallel: a subtest uses t.Setenv, which forbids a parallel parent. + // The parallel subtests below still pause until this function returns, by + // which point the env-mutating subtest has run inline and restored the env. + + t.Run("service name and version", func(t *testing.T) { + t.Parallel() + + res, err := tracing.NewResource(&tracing.Options{ + ServiceName: "bee", + ServiceVersion: "2.8.0-abcdef", + Environment: "mainnet", + InstanceID: "overlay-abc123", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got, ok := attrValue(t, res, semconv.ServiceNameKey); !ok || got != "bee" { + t.Errorf("service.name = %q (present=%v), want %q", got, ok, "bee") + } + if got, ok := attrValue(t, res, semconv.ServiceVersionKey); !ok || got != "2.8.0-abcdef" { + t.Errorf("service.version = %q (present=%v), want %q", got, ok, "2.8.0-abcdef") + } + if got, ok := attrValue(t, res, semconv.DeploymentEnvironmentKey); !ok || got != "mainnet" { + t.Errorf("deployment.environment = %q (present=%v), want %q", got, ok, "mainnet") + } + if got, ok := attrValue(t, res, semconv.ServiceInstanceIDKey); !ok || got != "overlay-abc123" { + t.Errorf("service.instance.id = %q (present=%v), want %q", got, ok, "overlay-abc123") + } + // WithTelemetrySDK must contribute the SDK identity attributes. + if _, ok := attrValue(t, res, semconv.TelemetrySDKNameKey); !ok { + t.Error("telemetry.sdk.name is absent, WithTelemetrySDK not applied") + } + // WithHost must contribute host.name so spans are attributable per node. + if _, ok := attrValue(t, res, semconv.HostNameKey); !ok { + t.Error("host.name is absent, WithHost not applied") + } + }) + + t.Run("empty environment omits attribute", func(t *testing.T) { + t.Parallel() + + res, err := tracing.NewResource(&tracing.Options{ServiceName: "bee"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got, ok := attrValue(t, res, semconv.DeploymentEnvironmentKey); ok { + t.Errorf("deployment.environment = %q, want it to be absent", got) + } + }) + + t.Run("empty version omits attribute", func(t *testing.T) { + t.Parallel() + + res, err := tracing.NewResource(&tracing.Options{ServiceName: "bee"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got, ok := attrValue(t, res, semconv.ServiceVersionKey); ok { + t.Errorf("service.version = %q, want it to be absent", got) + } + }) + + // Configured service name/version are applied after WithFromEnv, so they + // take precedence over a colliding OTEL_SERVICE_NAME while unrelated + // environment attributes still enrich the resource. This subtest mutates the + // process environment, so it must not run in parallel. + t.Run("configured values win over env, env enriches", func(t *testing.T) { + t.Setenv("OTEL_SERVICE_NAME", "from-env") + t.Setenv("OTEL_RESOURCE_ATTRIBUTES", "deployment.environment=testnet") + + res, err := tracing.NewResource(&tracing.Options{ServiceName: "configured"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got, ok := attrValue(t, res, semconv.ServiceNameKey); !ok || got != "configured" { + t.Errorf("service.name = %q (present=%v), want configured value to win", got, ok) + } + if got, ok := attrValue(t, res, semconv.DeploymentEnvironmentKey); !ok || got != "testnet" { + t.Errorf("deployment.environment = %q (present=%v), want %q from env", got, ok, "testnet") + } + }) +} diff --git a/pkg/tracing/tracing.go b/pkg/tracing/tracing.go index 51f48d2ba28..c1ef0fc614a 100644 --- a/pkg/tracing/tracing.go +++ b/pkg/tracing/tracing.go @@ -20,6 +20,7 @@ import ( "github.com/ethersphere/bee/v2/pkg/log" "github.com/ethersphere/bee/v2/pkg/p2p" + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/exporters/otlp/otlptrace" @@ -96,6 +97,15 @@ type Options struct { Endpoint string // ServiceName is reported as the OTel service.name resource attribute. ServiceName string + // ServiceVersion is reported as the OTel service.version resource + // attribute. When empty the attribute is omitted. + ServiceVersion string + // Environment is reported as the OTel deployment.environment resource + // attribute (e.g. "mainnet", "testnet"). When empty the attribute is omitted. + Environment string + // InstanceID is reported as the OTel service.instance.id resource attribute + // (the node's overlay address). When empty the attribute is omitted. + InstanceID string // Insecure disables TLS for the OTLP exporter (useful for a local collector). Insecure bool // CAFile is an optional path to a PEM-encoded CA bundle used to verify @@ -109,6 +119,10 @@ type Options struct { // Protocol selects the OTLP exporter transport: "http" or "grpc". Empty // defaults to "http". Protocol string + // Logger, when set, receives a confirmation line once tracing is wired up + // and OTLP exporter errors (e.g. an unreachable collector) via the OTel + // global error handler. Optional. + Logger log.Logger } // NewTracer creates a new Tracer and returns a closer that flushes pending @@ -123,12 +137,10 @@ func NewTracer(o *Options) (*Tracer, io.Closer, error) { } if o.Endpoint == "" { - return nil, nil, errors.New("tracing-otlp-endpoint is required when tracing is enabled") + return nil, nil, errors.New("tracing-endpoint is required when tracing is enabled") } - res, err := resource.New(context.Background(), - resource.WithAttributes(semconv.ServiceName(o.ServiceName)), - ) + res, err := newResource(o) if err != nil { return nil, nil, fmt.Errorf("otel resource: %w", err) } @@ -152,18 +164,62 @@ func NewTracer(o *Options) (*Tracer, io.Closer, error) { tp := sdktrace.NewTracerProvider( sdktrace.WithResource(res), sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(ratio))), + // Batch processor keeps SDK defaults; tunable via OTEL_BSP_* env vars. sdktrace.WithBatcher(exporter), ) + if o.Logger != nil { + // Route async OTLP export failures (e.g. an unreachable collector) to the + // node logger so they are visible instead of silently dropped. + otel.SetErrorHandler(otel.ErrorHandlerFunc(func(err error) { + o.Logger.Warning("tracing exporter error", "error", err) + })) + o.Logger.Info("tracing enabled", "endpoint", o.Endpoint, "protocol", o.Protocol, "sampling_ratio", ratio) + } + return &Tracer{tracer: tp.Tracer(instrumentationName)}, providerCloser{tp: tp}, nil } +// NewTracerFromProvider wraps an existing OTel TracerProvider in a Tracer. It is +// primarily useful for tests that need a recording tracer (e.g. one backed by an +// in-memory span recorder) rather than the OTLP exporter pipeline NewTracer builds. +func NewTracerFromProvider(tp trace.TracerProvider) *Tracer { + return &Tracer{tracer: tp.Tracer(instrumentationName)} +} + +// newResource builds the OTel resource describing this node. The env options +// come before WithAttributes so the configured values win over colliding +// OTEL_SERVICE_NAME/OTEL_RESOURCE_ATTRIBUTES, while the environment can still +// add attributes (e.g. cluster, region). +func newResource(o *Options) (*resource.Resource, error) { + attrs := []attribute.KeyValue{semconv.ServiceName(o.ServiceName)} + if o.ServiceVersion != "" { + attrs = append(attrs, semconv.ServiceVersion(o.ServiceVersion)) + } + if o.Environment != "" { + attrs = append(attrs, semconv.DeploymentEnvironment(o.Environment)) + } + if o.InstanceID != "" { + attrs = append(attrs, semconv.ServiceInstanceID(o.InstanceID)) + } + + return resource.New(context.Background(), + resource.WithFromEnv(), + resource.WithTelemetrySDK(), + resource.WithHost(), + resource.WithAttributes(attrs...), + ) +} + // newOTLPClient builds the OTLP client for the configured transport. An empty // Protocol defaults to HTTP for backward compatibility with the initial OTLP // rollout. func newOTLPClient(o *Options) (otlptrace.Client, error) { switch o.Protocol { case "", protocolHTTP: - opts := []otlptracehttp.Option{otlptracehttp.WithEndpoint(o.Endpoint)} + opts := []otlptracehttp.Option{ + otlptracehttp.WithEndpoint(o.Endpoint), + otlptracehttp.WithCompression(otlptracehttp.GzipCompression), + } if o.Insecure { opts = append(opts, otlptracehttp.WithInsecure()) } else if o.CAFile != "" { @@ -175,7 +231,10 @@ func newOTLPClient(o *Options) (otlptrace.Client, error) { } return otlptracehttp.NewClient(opts...), nil case protocolGRPC: - opts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(o.Endpoint)} + opts := []otlptracegrpc.Option{ + otlptracegrpc.WithEndpoint(o.Endpoint), + otlptracegrpc.WithCompressor("gzip"), + } if o.Insecure { opts = append(opts, otlptracegrpc.WithInsecure()) } else if o.CAFile != "" {