diff --git a/cli_flags.go b/cli_flags.go index ac1ca806..182cd768 100644 --- a/cli_flags.go +++ b/cli_flags.go @@ -12,6 +12,7 @@ import ( "github.com/peterbourgon/ff/v3" "go.opentelemetry.io/ebpf-profiler/internal/controller" + "go.opentelemetry.io/ebpf-profiler/support" "go.opentelemetry.io/ebpf-profiler/tracer" ) @@ -24,6 +25,7 @@ const ( defaultProbabilisticThreshold = tracer.ProbabilisticThresholdMax defaultProbabilisticInterval = 1 * time.Minute defaultArgSendErrorFrames = false + defaultOffCPUThreshold = support.OffCPUThresholdMax // This is the X in 2^(n + x) where n is the default hardcoded map size value defaultArgMapScaleFactor = 0 @@ -61,6 +63,11 @@ var ( "If zero, monotonic-realtime clock sync will be performed once, " + "on agent startup, but not periodically." sendErrorFramesHelp = "Send error frames (devfiler only, breaks Kibana)" + offCPUThresholdHelp = fmt.Sprintf("If set to a value between 1 and %d will enable "+ + "off-cpu profiling: Every time an off-cpu entry point is hit, a random number between "+ + "0 and %d is chosen. If the given threshold is greater than this random number, the "+ + "off-cpu trace is collected and reported.", + support.OffCPUThresholdMax-1, support.OffCPUThresholdMax-1) ) // Package-scope variable, so that conditionally compiled other components can refer @@ -114,6 +121,9 @@ func parseArgs() (*controller.Config, error) { fs.BoolVar(&args.VerboseMode, "verbose", false, verboseModeHelp) fs.BoolVar(&args.Version, "version", false, versionHelp) + fs.UintVar(&args.OffCPUThreshold, "off-cpu-threshold", + defaultOffCPUThreshold, offCPUThresholdHelp) + fs.Usage = func() { fs.PrintDefaults() } diff --git a/host/host.go b/host/host.go index e4bf5629..d04c8c38 100644 --- a/host/host.go +++ b/host/host.go @@ -55,6 +55,8 @@ type Trace struct { KTime times.KTime PID libpf.PID TID libpf.PID + Origin libpf.Origin + OffTime int64 // Time a task was off-cpu in nanoseconds. 
APMTraceID libpf.APMTraceID APMTransactionID libpf.APMTransactionID CPU int diff --git a/internal/controller/config.go b/internal/controller/config.go index dfd96b91..9a6ece49 100644 --- a/internal/controller/config.go +++ b/internal/controller/config.go @@ -33,7 +33,8 @@ type Config struct { // HostName is the name of the host. HostName string // IPAddress is the IP address of the host that sends data to CollAgentAddr. - IPAddress string + IPAddress string + OffCPUThreshold uint Reporter reporter.Reporter diff --git a/internal/controller/controller.go b/internal/controller/controller.go index ada953b0..d56c604a 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -11,6 +11,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/host" "go.opentelemetry.io/ebpf-profiler/metrics" "go.opentelemetry.io/ebpf-profiler/reporter" + "go.opentelemetry.io/ebpf-profiler/support" "go.opentelemetry.io/ebpf-profiler/times" "go.opentelemetry.io/ebpf-profiler/tracehandler" "go.opentelemetry.io/ebpf-profiler/tracer" @@ -86,6 +87,7 @@ func (c *Controller) Start(ctx context.Context) error { BPFVerifierLogLevel: uint32(c.config.BpfVerifierLogLevel), ProbabilisticInterval: c.config.ProbabilisticInterval, ProbabilisticThreshold: c.config.ProbabilisticThreshold, + OffCPUThreshold: uint32(c.config.OffCPUThreshold), }) if err != nil { return fmt.Errorf("failed to load eBPF tracer: %w", err) @@ -106,6 +108,13 @@ func (c *Controller) Start(ctx context.Context) error { } log.Info("Attached tracer program") + if c.config.OffCPUThreshold < support.OffCPUThresholdMax { + if err := trc.StartOffCPUProfiling(); err != nil { + return fmt.Errorf("failed to start off-cpu profiling: %v", err) + } + log.Printf("Enabled off-cpu profiling") + } + if c.config.ProbabilisticThreshold < tracer.ProbabilisticThresholdMax { trc.StartProbabilisticProfiling(ctx) log.Printf("Enabled probabilistic profiling") diff --git a/libpf/libpf.go b/libpf/libpf.go index ac6a9f3f..b9b6884d 100644 
--- a/libpf/libpf.go +++ b/libpf/libpf.go @@ -60,3 +60,6 @@ type Void struct{} // source line numbers associated with offsets in native code, or for source line numbers in // interpreted code. type SourceLineno uint64 + +// Origin determines the source of a trace. +type Origin int diff --git a/libpf/symbol.go b/libpf/symbol.go index cf5c6ec3..81fd1ce5 100644 --- a/libpf/symbol.go +++ b/libpf/symbol.go @@ -6,6 +6,7 @@ package libpf // import "go.opentelemetry.io/ebpf-profiler/libpf" import ( "fmt" "sort" + "strings" ) // SymbolValue represents the value associated with a symbol, e.g. either an @@ -81,6 +82,17 @@ func (symmap *SymbolMap) LookupSymbol(symbolName SymbolName) (*Symbol, error) { return nil, fmt.Errorf("symbol %v not present in map", symbolName) } +// LookupSymbolByPrefix loops over all known symbols and returns a symbol whose name +// starts with the given prefix; if several match, which one is returned is unspecified. +func (symmap *SymbolMap) LookupSymbolByPrefix(prefix string) (*Symbol, error) { + for name, sym := range symmap.nameToSymbol { + if strings.HasPrefix(string(name), prefix) { + return sym, nil + } + } + return nil, fmt.Errorf("no symbol present that starts with '%s'", prefix) +} + // LookupSymbolAddress returns the address of a symbol. // Returns SymbolValueInvalid and error if not found. func (symmap *SymbolMap) LookupSymbolAddress(symbolName SymbolName) (SymbolValue, error) { diff --git a/reporter/base_reporter.go b/reporter/base_reporter.go index c024257d..2b65d162 100644 --- a/reporter/base_reporter.go +++ b/reporter/base_reporter.go @@ -13,6 +13,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/libpf/xsync" "go.opentelemetry.io/ebpf-profiler/reporter/internal/pdata" "go.opentelemetry.io/ebpf-profiler/reporter/internal/samples" + "go.opentelemetry.io/ebpf-profiler/support" ) // baseReporter encapsulates shared behavior between all the available reporters. 
@@ -35,7 +36,7 @@ type baseReporter struct { cgroupv2ID *lru.SyncedLRU[libpf.PID, string] // traceEvents stores reported trace events (trace metadata with frames and counts) - traceEvents xsync.RWMutex[map[samples.TraceAndMetaKey]*samples.TraceEvents] + traceEvents xsync.RWMutex[map[libpf.Origin]samples.KeyToEventMapping] // hostmetadata stores metadata that is sent out with every request. hostmetadata *lru.SyncedLRU[string, string] @@ -97,8 +98,11 @@ func (*baseReporter) ReportMetrics(_ uint32, _ []uint32, _ []int64) {} func (*baseReporter) SupportsReportTraceEvent() bool { return true } func (b *baseReporter) ReportTraceEvent(trace *libpf.Trace, meta *TraceEventMeta) { - traceEventsMap := b.traceEvents.WLock() - defer b.traceEvents.WUnlock(&traceEventsMap) + if meta.Origin != support.TraceOriginSampling && meta.Origin != support.TraceOriginOffCPU { + // At the moment only on-CPU and off-CPU traces are reported. + log.Errorf("Skip reporting trace for unexpected %d origin", meta.Origin) + return + } var extraMeta any if b.cfg.ExtraSampleAttrProd != nil { @@ -122,13 +126,17 @@ func (b *baseReporter) ReportTraceEvent(trace *libpf.Trace, meta *TraceEventMeta ExtraMeta: extraMeta, } - if events, exists := (*traceEventsMap)[key]; exists { + traceEventsMap := b.traceEvents.WLock() + defer b.traceEvents.WUnlock(&traceEventsMap) + + if events, exists := (*traceEventsMap)[meta.Origin][key]; exists { events.Timestamps = append(events.Timestamps, uint64(meta.Timestamp)) - (*traceEventsMap)[key] = events + events.OffTimes = append(events.OffTimes, meta.OffTime) + (*traceEventsMap)[meta.Origin][key] = events return } - (*traceEventsMap)[key] = &samples.TraceEvents{ + (*traceEventsMap)[meta.Origin][key] = &samples.TraceEvents{ Files: trace.Files, Linenos: trace.Linenos, FrameTypes: trace.FrameTypes, @@ -136,6 +144,7 @@ func (b *baseReporter) ReportTraceEvent(trace *libpf.Trace, meta *TraceEventMeta MappingEnds: trace.MappingEnd, MappingFileOffsets: trace.MappingFileOffsets, 
Timestamps: []uint64{uint64(meta.Timestamp)}, + OffTimes: []int64{meta.OffTime}, } } diff --git a/reporter/collector_reporter.go b/reporter/collector_reporter.go index 253f50a7..8bedb5bf 100644 --- a/reporter/collector_reporter.go +++ b/reporter/collector_reporter.go @@ -16,6 +16,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/libpf/xsync" "go.opentelemetry.io/ebpf-profiler/reporter/internal/pdata" "go.opentelemetry.io/ebpf-profiler/reporter/internal/samples" + "go.opentelemetry.io/ebpf-profiler/support" ) // Assert that we implement the full Reporter interface. @@ -56,16 +57,20 @@ func NewCollector(cfg *Config, nextConsumer xconsumer.Profiles) (*CollectorRepor return nil, err } + originsMap := make(map[libpf.Origin]samples.KeyToEventMapping, 2) + for _, origin := range []libpf.Origin{support.TraceOriginSampling, + support.TraceOriginOffCPU} { + originsMap[origin] = make(samples.KeyToEventMapping) + } + return &CollectorReporter{ baseReporter: &baseReporter{ - cfg: cfg, - name: cfg.Name, - version: cfg.Version, - pdata: data, - cgroupv2ID: cgroupv2ID, - traceEvents: xsync.NewRWMutex( - map[samples.TraceAndMetaKey]*samples.TraceEvents{}, - ), + cfg: cfg, + name: cfg.Name, + version: cfg.Version, + pdata: data, + cgroupv2ID: cgroupv2ID, + traceEvents: xsync.NewRWMutex(originsMap), hostmetadata: hostmetadata, runLoop: &runLoop{ stopSignal: make(chan libpf.Void), diff --git a/reporter/internal/pdata/generate.go b/reporter/internal/pdata/generate.go index 15e47c4b..6f57f024 100644 --- a/reporter/internal/pdata/generate.go +++ b/reporter/internal/pdata/generate.go @@ -15,6 +15,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/libpf" "go.opentelemetry.io/ebpf-profiler/reporter/internal/samples" + "go.opentelemetry.io/ebpf-profiler/support" ) const ( @@ -24,14 +25,16 @@ const ( // Generate generates a pdata request out of internal profiles data, to be // exported. 
-func (p Pdata) Generate(events map[samples.TraceAndMetaKey]*samples.TraceEvents) pprofile.Profiles { +func (p Pdata) Generate(events map[libpf.Origin]samples.KeyToEventMapping) pprofile.Profiles { profiles := pprofile.NewProfiles() rp := profiles.ResourceProfiles().AppendEmpty() sp := rp.ScopeProfiles().AppendEmpty() - prof := sp.Profiles().AppendEmpty() - prof.SetProfileID(pprofile.ProfileID(mkProfileID())) - p.setProfile(events, prof) - + for _, origin := range []libpf.Origin{support.TraceOriginSampling, + support.TraceOriginOffCPU} { + prof := sp.Profiles().AppendEmpty() + prof.SetProfileID(pprofile.ProfileID(mkProfileID())) + p.setProfile(origin, events[origin], prof) + } return profiles } @@ -48,6 +51,7 @@ func mkProfileID() []byte { // setProfile sets the data an OTLP profile with all collected samples up to // this moment. func (p *Pdata) setProfile( + origin libpf.Origin, events map[samples.TraceAndMetaKey]*samples.TraceEvents, profile pprofile.Profile, ) { @@ -62,13 +66,23 @@ func (p *Pdata) setProfile( funcMap[samples.FuncInfo{Name: "", FileName: ""}] = 0 st := profile.SampleType().AppendEmpty() - st.SetTypeStrindex(getStringMapIndex(stringMap, "samples")) - st.SetUnitStrindex(getStringMapIndex(stringMap, "count")) - - pt := profile.PeriodType() - pt.SetTypeStrindex(getStringMapIndex(stringMap, "cpu")) - pt.SetUnitStrindex(getStringMapIndex(stringMap, "nanoseconds")) - profile.SetPeriod(1e9 / int64(p.samplesPerSecond)) + switch origin { + case support.TraceOriginSampling: + st.SetTypeStrindex(getStringMapIndex(stringMap, "samples")) + st.SetUnitStrindex(getStringMapIndex(stringMap, "count")) + + pt := profile.PeriodType() + pt.SetTypeStrindex(getStringMapIndex(stringMap, "cpu")) + pt.SetUnitStrindex(getStringMapIndex(stringMap, "nanoseconds")) + + profile.SetPeriod(1e9 / int64(p.samplesPerSecond)) + case support.TraceOriginOffCPU: + st.SetTypeStrindex(getStringMapIndex(stringMap, "events")) + st.SetUnitStrindex(getStringMapIndex(stringMap, 
"nanoseconds")) + default: + log.Errorf("Generating profile for unsupported origin %d", origin) + return + } // Temporary lookup to reference existing Mappings. fileIDtoMapping := make(map[libpf.FileID]int32) @@ -85,7 +99,13 @@ func (p *Pdata) setProfile( endTS = pcommon.Timestamp(traceInfo.Timestamps[len(traceInfo.Timestamps)-1]) sample.TimestampsUnixNano().FromRaw(traceInfo.Timestamps) - sample.Value().Append(1) + + switch origin { + case support.TraceOriginSampling: + sample.Value().Append(1) + case support.TraceOriginOffCPU: + sample.Value().Append(traceInfo.OffTimes...) + } // Walk every frame of the trace. for i := range traceInfo.FrameTypes { diff --git a/reporter/internal/pdata/generate_test.go b/reporter/internal/pdata/generate_test.go index 049ca4d6..ee543099 100644 --- a/reporter/internal/pdata/generate_test.go +++ b/reporter/internal/pdata/generate_test.go @@ -11,6 +11,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/libpf" "go.opentelemetry.io/ebpf-profiler/libpf/xsync" "go.opentelemetry.io/ebpf-profiler/reporter/internal/samples" + "go.opentelemetry.io/ebpf-profiler/support" ) func TestGetStringMapIndex(t *testing.T) { @@ -168,7 +169,7 @@ func TestFunctionTableOrder(t *testing.T) { name string executables map[libpf.FileID]samples.ExecInfo frames map[libpf.FileID]map[libpf.AddressOrLineno]samples.SourceInfo - events map[samples.TraceAndMetaKey]*samples.TraceEvents + events map[libpf.Origin]samples.KeyToEventMapping wantFunctionTable []string }{ @@ -176,7 +177,7 @@ func TestFunctionTableOrder(t *testing.T) { name: "with no executables", executables: map[libpf.FileID]samples.ExecInfo{}, frames: map[libpf.FileID]map[libpf.AddressOrLineno]samples.SourceInfo{}, - events: map[samples.TraceAndMetaKey]*samples.TraceEvents{}, + events: map[libpf.Origin]samples.KeyToEventMapping{}, wantFunctionTable: []string{""}, }, { name: "single executable", @@ -192,27 +193,29 @@ func TestFunctionTableOrder(t *testing.T) { libpf.AddressOrLineno(0x4ef): {FunctionName: 
"func5"}, }, }, - events: map[samples.TraceAndMetaKey]*samples.TraceEvents{ - {}: { - Files: []libpf.FileID{ - libpf.NewFileID(2, 3), - libpf.NewFileID(2, 3), - libpf.NewFileID(2, 3), - libpf.NewFileID(2, 3), - libpf.NewFileID(2, 3), + events: map[libpf.Origin]samples.KeyToEventMapping{ + support.TraceOriginSampling: map[samples.TraceAndMetaKey]*samples.TraceEvents{ + {}: { + Files: []libpf.FileID{ + libpf.NewFileID(2, 3), + libpf.NewFileID(2, 3), + libpf.NewFileID(2, 3), + libpf.NewFileID(2, 3), + libpf.NewFileID(2, 3), + }, + Linenos: []libpf.AddressOrLineno{ + libpf.AddressOrLineno(0xef), + libpf.AddressOrLineno(0x1ef), + libpf.AddressOrLineno(0x2ef), + libpf.AddressOrLineno(0x3ef), + libpf.AddressOrLineno(0x4ef), + }, + FrameTypes: slices.Repeat([]libpf.FrameType{libpf.KernelFrame}, 5), + MappingStarts: slices.Repeat([]libpf.Address{libpf.Address(0)}, 5), + MappingEnds: slices.Repeat([]libpf.Address{libpf.Address(0)}, 5), + MappingFileOffsets: slices.Repeat([]uint64{0}, 5), + Timestamps: []uint64{1, 2, 3, 4, 5}, }, - Linenos: []libpf.AddressOrLineno{ - libpf.AddressOrLineno(0xef), - libpf.AddressOrLineno(0x1ef), - libpf.AddressOrLineno(0x2ef), - libpf.AddressOrLineno(0x3ef), - libpf.AddressOrLineno(0x4ef), - }, - FrameTypes: slices.Repeat([]libpf.FrameType{libpf.KernelFrame}, 5), - MappingStarts: slices.Repeat([]libpf.Address{libpf.Address(0)}, 5), - MappingEnds: slices.Repeat([]libpf.Address{libpf.Address(0)}, 5), - MappingFileOffsets: slices.Repeat([]uint64{0}, 5), - Timestamps: []uint64{1, 2, 3, 4, 5}, }, }, wantFunctionTable: []string{ @@ -233,7 +236,7 @@ func TestFunctionTableOrder(t *testing.T) { res := d.Generate(tt.events) require.Equal(t, 1, res.ResourceProfiles().Len()) require.Equal(t, 1, res.ResourceProfiles().At(0).ScopeProfiles().Len()) - require.Equal(t, 1, res.ResourceProfiles().At(0).ScopeProfiles().At(0).Profiles().Len()) + require.Equal(t, 2, res.ResourceProfiles().At(0).ScopeProfiles().At(0).Profiles().Len()) p := 
res.ResourceProfiles().At(0).ScopeProfiles().At(0).Profiles().At(0) require.Equal(t, len(tt.wantFunctionTable), p.FunctionTable().Len()) for i := 0; i < p.FunctionTable().Len(); i++ { diff --git a/reporter/internal/samples/samples.go b/reporter/internal/samples/samples.go index d1777620..72b18613 100644 --- a/reporter/internal/samples/samples.go +++ b/reporter/internal/samples/samples.go @@ -13,6 +13,8 @@ type TraceEventMeta struct { APMServiceName string PID, TID libpf.PID CPU int + Origin libpf.Origin + OffTime int64 } // TraceEvents holds known information about a trace. @@ -24,6 +26,7 @@ type TraceEvents struct { MappingEnds []libpf.Address MappingFileOffsets []uint64 Timestamps []uint64 // in nanoseconds + OffTimes []int64 // in nanoseconds } // TraceAndMetaKey is the deduplication key for samples. This **must always** @@ -46,6 +49,9 @@ type TraceAndMetaKey struct { ExtraMeta any } +// KeyToEventMapping maps the deduplication key of a trace to the information known about it. +type KeyToEventMapping map[TraceAndMetaKey]*TraceEvents + // AttrKeyValue is a helper to populate Profile.attribute_table. type AttrKeyValue[T string | int64] struct { Key string diff --git a/reporter/otlp_reporter.go b/reporter/otlp_reporter.go index 4525f642..e4cc5828 100644 --- a/reporter/otlp_reporter.go +++ b/reporter/otlp_reporter.go @@ -23,6 +23,7 @@ import ( "go.opentelemetry.io/ebpf-profiler/libpf/xsync" "go.opentelemetry.io/ebpf-profiler/reporter/internal/pdata" "go.opentelemetry.io/ebpf-profiler/reporter/internal/samples" + "go.opentelemetry.io/ebpf-profiler/support" ) // Assert that we implement the full Reporter interface. 
@@ -86,16 +87,20 @@ func NewOTLP(cfg *Config) (*OTLPReporter, error) { return nil, err } + originsMap := make(map[libpf.Origin]samples.KeyToEventMapping, 2) + for _, origin := range []libpf.Origin{support.TraceOriginSampling, + support.TraceOriginOffCPU} { + originsMap[origin] = make(samples.KeyToEventMapping) + } + return &OTLPReporter{ baseReporter: &baseReporter{ - cfg: cfg, - name: cfg.Name, - version: cfg.Version, - pdata: data, - cgroupv2ID: cgroupv2ID, - traceEvents: xsync.NewRWMutex( - map[samples.TraceAndMetaKey]*samples.TraceEvents{}, - ), + cfg: cfg, + name: cfg.Name, + version: cfg.Version, + pdata: data, + cgroupv2ID: cgroupv2ID, + traceEvents: xsync.NewRWMutex(originsMap), hostmetadata: hostmetadata, runLoop: &runLoop{ stopSignal: make(chan libpf.Void), @@ -165,7 +170,13 @@ func (r *OTLPReporter) Start(ctx context.Context) error { func (r *OTLPReporter) reportOTLPProfile(ctx context.Context) error { traceEvents := r.traceEvents.WLock() events := maps.Clone(*traceEvents) + originsMap := make(map[libpf.Origin]samples.KeyToEventMapping, 2) clear(*traceEvents) + for _, origin := range []libpf.Origin{support.TraceOriginSampling, + support.TraceOriginOffCPU} { + originsMap[origin] = make(samples.KeyToEventMapping) + } + *traceEvents = originsMap r.traceEvents.WUnlock(&traceEvents) profiles := r.pdata.Generate(events) diff --git a/support/ebpf/bpfdefs.h b/support/ebpf/bpfdefs.h index 7171b3c2..1271f845 100644 --- a/support/ebpf/bpfdefs.h +++ b/support/ebpf/bpfdefs.h @@ -83,6 +83,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, unsigned long long fla (void *)BPF_FUNC_perf_event_output; static int (*bpf_get_stackid)(void *ctx, void *map, u64 flags) = (void *)BPF_FUNC_get_stackid; +static unsigned long long (*bpf_get_prandom_u32)(void) = + (void *) BPF_FUNC_get_prandom_u32; __attribute__ ((format (printf, 1, 3))) static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= diff --git a/support/ebpf/dotnet_tracer.ebpf.c b/support/ebpf/dotnet_tracer.ebpf.c index 7ebac864..9b6ca040 100644 --- a/support/ebpf/dotnet_tracer.ebpf.c +++ b/support/ebpf/dotnet_tracer.ebpf.c @@ -244,7 +244,7 @@ ErrorCode unwind_one_dotnet_frame(PerCPURecord *record, DotnetProcInfo *vi, bool // unwind_dotnet is the entry point for tracing when invoked from the native tracer // or interpreter dispatcher. It does not reset the trace object and will append the // dotnet stack frames to the trace object for the current CPU. -SEC("perf_event/unwind_dotnet") +static inline __attribute__((__always_inline__)) int unwind_dotnet(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) { @@ -289,3 +289,4 @@ int unwind_dotnet(struct pt_regs *ctx) { DEBUG_PRINT("dotnet: tail call for next frame unwinder (%d) failed", unwinder); return -1; } +MULTI_USE_FUNC(unwind_dotnet) diff --git a/support/ebpf/extmaps.h b/support/ebpf/extmaps.h index 5922d9ed..56c58a67 100644 --- a/support/ebpf/extmaps.h +++ b/support/ebpf/extmaps.h @@ -6,8 +6,9 @@ #include "bpf_map.h" // References to map definitions in *.ebpf.c. 
-extern bpf_map_def progs; +extern bpf_map_def perf_progs; extern bpf_map_def per_cpu_records; +extern bpf_map_def kernel_stackmap; extern bpf_map_def pid_page_to_mapping_info; extern bpf_map_def metrics; extern bpf_map_def report_events; @@ -41,7 +42,6 @@ extern bpf_map_def exe_id_to_21_stack_deltas; extern bpf_map_def exe_id_to_22_stack_deltas; extern bpf_map_def exe_id_to_23_stack_deltas; extern bpf_map_def hotspot_procs; -extern bpf_map_def kernel_stackmap; extern bpf_map_def dotnet_procs; extern bpf_map_def perl_procs; extern bpf_map_def php_procs; diff --git a/support/ebpf/hotspot_tracer.ebpf.c b/support/ebpf/hotspot_tracer.ebpf.c index 9ae1fdc7..54290e11 100644 --- a/support/ebpf/hotspot_tracer.ebpf.c +++ b/support/ebpf/hotspot_tracer.ebpf.c @@ -890,7 +890,7 @@ static ErrorCode hotspot_unwind_one_frame(PerCPURecord *record, HotspotProcInfo // unwind_hotspot is the entry point for tracing when invoked from the native tracer // and it recursive unwinds all HotSpot frames and then jumps back to unwind further // native frames that follow. 
-SEC("perf_event/unwind_hotspot") +static inline __attribute__((__always_inline__)) int unwind_hotspot(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) @@ -927,3 +927,4 @@ int unwind_hotspot(struct pt_regs *ctx) { DEBUG_PRINT("jvm: tail call for next frame unwinder (%d) failed", unwinder); return -1; } +MULTI_USE_FUNC(unwind_hotspot) diff --git a/support/ebpf/integration_test.ebpf.c b/support/ebpf/integration_test.ebpf.c index 510e72c6..8927c3f0 100644 --- a/support/ebpf/integration_test.ebpf.c +++ b/support/ebpf/integration_test.ebpf.c @@ -24,6 +24,8 @@ void send_sample_traces(void *ctx, u64 pid, s32 kstack) { trace->comm[1] = 0xBB; trace->comm[2] = 0xCC; + trace->origin = TRACE_SAMPLING; + trace->comm[3] = 1; trace->pid = pid; trace->tid = pid; @@ -80,10 +82,10 @@ void send_sample_traces(void *ctx, u64 pid, s32 kstack) { send_trace(ctx, trace); } -// tracepoint__sched_switch fetches the current kernel stack ID from kernel_stackmap and -// communicates it to userspace via kernel_stack_id map. -SEC("tracepoint/sched/sched_switch") -int tracepoint__sched_switch(void *ctx) { +// tracepoint_integration__sched_switch fetches the current kernel stack ID from +// kernel_stackmap and communicates it to userspace via kernel_stack_id map. 
+SEC("tracepoint/integration/sched_switch") +int tracepoint_integration__sched_switch(void *ctx) { u64 id = bpf_get_current_pid_tgid(); u64 pid = id >> 32; diff --git a/support/ebpf/interpreter_dispatcher.ebpf.c b/support/ebpf/interpreter_dispatcher.ebpf.c index fbc5c598..fcb4b329 100644 --- a/support/ebpf/interpreter_dispatcher.ebpf.c +++ b/support/ebpf/interpreter_dispatcher.ebpf.c @@ -25,8 +25,8 @@ bpf_map_def SEC("maps") metrics = { .max_entries = metricID_Max, }; -// progs maps from a program ID to an eBPF program -bpf_map_def SEC("maps") progs = { +// perf_progs maps from a program ID to a perf eBPF program +bpf_map_def SEC("maps") perf_progs = { .type = BPF_MAP_TYPE_PROG_ARRAY, .key_size = sizeof(u32), .value_size = sizeof(u32), @@ -172,7 +172,8 @@ void maybe_add_apm_info(Trace *trace) { trace->apm_transaction_id.as_int, corr_buf.trace_flags); } -SEC("perf_event/unwind_stop") +// unwind_stop is the tail call destination for PROG_UNWIND_STOP. +static inline __attribute__((__always_inline__)) int unwind_stop(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) @@ -238,6 +239,7 @@ int unwind_stop(struct pt_regs *ctx) { return 0; } +MULTI_USE_FUNC(unwind_stop) char _license[] SEC("license") = "GPL"; // this number will be interpreted by the elf loader diff --git a/support/ebpf/native_stack_trace.ebpf.c b/support/ebpf/native_stack_trace.ebpf.c index 959099cb..c205440d 100644 --- a/support/ebpf/native_stack_trace.ebpf.c +++ b/support/ebpf/native_stack_trace.ebpf.c @@ -4,14 +4,6 @@ #include "tracemgmt.h" #include "stackdeltatypes.h" -#ifndef __USER32_CS - // defined in arch/x86/include/asm/segment.h - #define GDT_ENTRY_DEFAULT_USER32_CS 4 - #define GDT_ENTRY_DEFAULT_USER_DS 5 - #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) - #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) -#endif - // Macro to create a map named exe_id_to_X_stack_deltas that is a nested maps with a fileID for the // outer map and an array as inner map that 
holds up to 2^X stack delta entries for the given fileID. #define STACK_DELTA_BUCKET(X) \ @@ -94,30 +86,6 @@ ErrorCode push_native(Trace *trace, u64 file, u64 line, bool return_address) { return _push_with_return_address(trace, file, line, FRAME_MARKER_NATIVE, return_address); } -#ifdef __aarch64__ -// Strips the PAC tag from a pointer. -// -// While all pointers can contain PAC tags, we only apply this function to code pointers, because -// that's where normalization is required to make the stack delta lookups work. Note that if that -// should ever change, we'd need a different mask for the data pointers, because it might diverge -// from the mask for code pointers. -static inline u64 normalize_pac_ptr(u64 ptr) { - // Retrieve PAC mask from the system config. - u32 key = 0; - SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); - if (!syscfg) { - // Unreachable: array maps are always fully initialized. - return ptr; - } - - // Mask off PAC bits. Since we're always applying this to usermode pointers that should have all - // the high bits set to 0, we don't need to consider the case of having to fill up the resulting - // hole with 1s (like we'd have to for kernel ptrs). - ptr &= syscfg->inverse_pac_mask; - return ptr; -} -#endif - // A single step for the bsearch into the big_stack_deltas array. This is really a textbook bsearch // step, built in a way to update the value of *lo and *hi. This function will be called repeatedly // (since we cannot do loops). The return value signals whether the bsearch came to an end / found @@ -607,157 +575,8 @@ static ErrorCode unwind_one_frame(u64 pid, u32 frame_idx, struct UnwindState *st #error unsupported architecture #endif -// Initialize state from pt_regs -static inline ErrorCode copy_state_regs(UnwindState *state, - struct pt_regs *regs, - bool interrupted_kernelmode) -{ -#if defined(__x86_64__) - // Check if the process is running in 32-bit mode on the x86_64 system. 
- // This check follows the Linux kernel implementation of user_64bit_mode() in - // arch/x86/include/asm/ptrace.h. - if (regs->cs == __USER32_CS) { - return ERR_NATIVE_X64_32BIT_COMPAT_MODE; - } - state->pc = regs->ip; - state->sp = regs->sp; - state->fp = regs->bp; - state->rax = regs->ax; - state->r9 = regs->r9; - state->r11 = regs->r11; - state->r13 = regs->r13; - state->r15 = regs->r15; - - // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39 - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847 - state->return_address = interrupted_kernelmode && regs->orig_ax != -1; -#elif defined(__aarch64__) - // For backwards compatibility aarch64 can run 32-bit code. - // Check if the process is running in this 32-bit compat mod. - if (regs->pstate & PSR_MODE32_BIT) { - return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE; - } - state->pc = normalize_pac_ptr(regs->pc); - state->sp = regs->sp; - state->fp = regs->regs[29]; - state->lr = normalize_pac_ptr(regs->regs[30]); - state->r22 = regs->regs[22]; - - // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118 - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209 - // - // Note: We do not use `unwinder_mark_nonleaf_frame` here, - // because the frame is a leaf frame from the perspective of the user stack, - // regardless of whether we are in a syscall. - state->return_address = interrupted_kernelmode && regs->syscallno != -1; - state->lr_invalid = false; -#endif - - return ERR_OK; -} - -#ifndef TESTING_COREDUMP - -// Read the task's entry stack pt_regs. This has identical functionality -// to bpf_task_pt_regs which is emulated to support older kernels. 
-// Once kernel requirement is increased to 5.15 this can be replaced with -// the bpf_task_pt_regs() helper. -static inline -long get_task_pt_regs(struct task_struct *task, SystemConfig* syscfg) { - u64 stack_ptr = (u64)task + syscfg->task_stack_offset; - long stack_base; - if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void*) stack_ptr)) { - return 0; - } - return stack_base + syscfg->stack_ptregs_offset; -} - -// Determine whether the given pt_regs are from user-mode register context. -// This needs to detect also invalid pt_regs in case we its kernel thread stack -// without valid user mode pt_regs so is_kernel_address(pc) is not enough. -static inline -bool ptregs_is_usermode(struct pt_regs *regs) { -#if defined(__x86_64__) - // On x86_64 the user mode SS should always be __USER_DS. - if (regs->ss != __USER_DS) { - return false; - } - return true; -#elif defined(__aarch64__) - // Check if the processor state is in the EL0t what linux uses for usermode. - if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) { - return false; - } - return true; -#else -#error add support for new architecture -#endif -} - -// Extract the usermode pt_regs for current task. Use context given pt_regs -// if it is usermode regs, or resolve it via struct task_struct. -// -// State registers are not touched (get_pristine_per_cpu_record already reset it) -// if something fails. has_usermode_regs is set to true if a user-mode register -// context was found: not every thread that we interrupt will actually have -// a user-mode context (e.g. kernel worker threads won't). -static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, - UnwindState *state, - bool *has_usermode_regs) { - ErrorCode error; - - if (!ptregs_is_usermode(ctx)) { - u32 key = 0; - SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); - if (!syscfg) { - // Unreachable: array maps are always fully initialized. 
- return ERR_UNREACHABLE; - } - - // Use the current task's entry pt_regs - struct task_struct *task = (struct task_struct *) bpf_get_current_task(); - long ptregs_addr = get_task_pt_regs(task, syscfg); - - struct pt_regs regs; - if (!ptregs_addr || bpf_probe_read_kernel(®s, sizeof(regs), (void*) ptregs_addr)) { - increment_metric(metricID_UnwindNativeErrReadKernelModeRegs); - return ERR_NATIVE_READ_KERNELMODE_REGS; - } - - if (!ptregs_is_usermode(®s)) { - // No usermode registers context found. - return ERR_OK; - } - error = copy_state_regs(state, ®s, true); - } else { - // User mode code interrupted, registers are available via the ebpf context. - error = copy_state_regs(state, ctx, false); - } - if (error == ERR_OK) { - DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp); - *has_usermode_regs = true; - } - return error; -} - -#else // TESTING_COREDUMP - -static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, - UnwindState *state, - bool *has_usermode_regs) { - // Coredumps provide always usermode pt_regs directly. - ErrorCode error = copy_state_regs(state, ctx, false); - if (error == ERR_OK) { - *has_usermode_regs = true; - } - return error; -} - -#endif - -SEC("perf_event/unwind_native") +// unwind_native is the tail call destination for PROG_UNWIND_NATIVE. +static inline __attribute__((__always_inline__)) int unwind_native(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) @@ -809,8 +628,8 @@ int unwind_native(struct pt_regs *ctx) { return -1; } -static inline -int collect_trace(struct pt_regs *ctx) { +SEC("perf_event/native_tracer_entry") +int native_tracer_entry(struct bpf_perf_event_data *ctx) { // Get the PID and TGID register. 
u64 id = bpf_get_current_pid_tgid(); u32 pid = id >> 32; @@ -820,54 +639,7 @@ int collect_trace(struct pt_regs *ctx) { return 0; } - u64 ktime = bpf_ktime_get_ns(); - - DEBUG_PRINT("==== do_perf_event ===="); - - // The trace is reused on each call to this function so we have to reset the - // variables used to maintain state. - DEBUG_PRINT("Resetting CPU record"); - PerCPURecord *record = get_pristine_per_cpu_record(); - if (!record) { - return -1; - } - - Trace *trace = &record->trace; - trace->pid = pid; - trace->tid = tid; - trace->ktime = ktime; - if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) { - increment_metric(metricID_ErrBPFCurrentComm); - } - - // Get the kernel mode stack trace first - trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID); - DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id); - - // Recursive unwind frames - int unwinder = PROG_UNWIND_STOP; - bool has_usermode_regs = false; - ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs); - if (error || !has_usermode_regs) { - goto exit; - } - - if (!pid_information_exists(ctx, pid)) { - if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) { - increment_metric(metricID_NumProcNew); - } - return 0; - } - error = get_next_unwinder_after_native_frame(record, &unwinder); - -exit: - record->state.unwind_error = error; - tail_call(ctx, unwinder); - DEBUG_PRINT("bpf_tail call failed for %d in native_tracer_entry", unwinder); - return -1; -} - -SEC("perf_event/native_tracer_entry") -int native_tracer_entry(struct bpf_perf_event_data *ctx) { - return collect_trace((struct pt_regs*) &ctx->regs); + u64 ts = bpf_ktime_get_ns(); + return collect_trace((struct pt_regs*) &ctx->regs, TRACE_SAMPLING, pid, tid, ts, 0); } +MULTI_USE_FUNC(unwind_native) diff --git a/support/ebpf/off_cpu.ebpf.c b/support/ebpf/off_cpu.ebpf.c new file mode 100644 index 00000000..ec8e86da --- /dev/null +++ b/support/ebpf/off_cpu.ebpf.c @@ -0,0 +1,87 @@ 
+#include "bpfdefs.h" +#include "tracemgmt.h" +#include "types.h" + +// kprobe_progs maps from a program ID to a kprobe eBPF program +bpf_map_def SEC("maps") kprobe_progs = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = NUM_TRACER_PROGS, +}; + +// sched_times keeps track of sched_switch call times. +bpf_map_def SEC("maps") sched_times = { + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, + .key_size = sizeof(u64), // pid_tgid + .value_size = sizeof(u64), // time in ns + .max_entries = 256, // value is adjusted at load time in loadAllMaps. +}; + +// tracepoint__sched_switch is the off-cpu entry point; samples iff off_cpu_threshold > random. +SEC("tracepoint/sched/sched_switch") +int tracepoint__sched_switch(void *ctx) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = pid_tgid & 0xFFFFFFFF; + + if (pid == 0 || tid == 0) { + return 0; + } + + u32 key = 0; + SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); + if (!syscfg) { + // Unreachable: array maps are always fully initialized. + return ERR_UNREACHABLE; + } + + if (bpf_get_prandom_u32() % OFF_CPU_THRESHOLD_MAX >= + syscfg->off_cpu_threshold) { + return 0; + } + + u64 ts = bpf_ktime_get_ns(); + + if (bpf_map_update_elem(&sched_times, &pid_tgid, &ts, BPF_ANY) < 0) { + DEBUG_PRINT("Failed to record sched_switch event entry"); + return 0; + } + + return 0; +} + +// dummy is never loaded or called. It just makes sure kprobe_progs is +// referenced and makes the compiler and linker happy. +SEC("kprobe/dummy") +int dummy(struct pt_regs *ctx) { + bpf_tail_call(ctx, &kprobe_progs, 0); + return 0; +} + +// finish_task_switch is triggered right after the scheduler updated +// the CPU registers. +SEC("kprobe/finish_task_switch") +int finish_task_switch(struct pt_regs *ctx) { + // Get the PID and TGID register. 
+ u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = pid_tgid & 0xFFFFFFFF; + + if (pid == 0 || tid == 0) { + return 0; + } + + u64 ts = bpf_ktime_get_ns(); + + u64 *start_ts = bpf_map_lookup_elem(&sched_times, &pid_tgid); + if (!start_ts || *start_ts == 0) { + // There is no information from the sched/sched_switch entry hook. + return 0; + } + + u64 diff = ts - *start_ts; + DEBUG_PRINT("==== finish_task_switch ===="); + + return collect_trace(ctx, TRACE_OFF_CPU, pid, tid, ts, diff); +} diff --git a/support/ebpf/perl_tracer.ebpf.c b/support/ebpf/perl_tracer.ebpf.c index c498341b..1e143468 100644 --- a/support/ebpf/perl_tracer.ebpf.c +++ b/support/ebpf/perl_tracer.ebpf.c @@ -356,7 +356,7 @@ int walk_perl_stack(PerCPURecord *record, const PerlProcInfo *perlinfo) { // unwind_perl is the entry point for tracing when invoked from the native tracer // or interpreter dispatcher. It does not reset the trace object and will append the // Perl stack frames to the trace object for the current CPU. -SEC("perf_event/unwind_perl") +static inline __attribute__((__always_inline__)) int unwind_perl(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) { @@ -426,3 +426,4 @@ int unwind_perl(struct pt_regs *ctx) { tail_call(ctx, unwinder); return -1; } +MULTI_USE_FUNC(unwind_perl) diff --git a/support/ebpf/php_tracer.ebpf.c b/support/ebpf/php_tracer.ebpf.c index 677cd185..506f63e3 100644 --- a/support/ebpf/php_tracer.ebpf.c +++ b/support/ebpf/php_tracer.ebpf.c @@ -182,7 +182,8 @@ int walk_php_stack(PerCPURecord *record, PHPProcInfo *phpinfo, bool is_jitted) { return unwinder; } -SEC("perf_event/unwind_php") +// unwind_php is the tail call destination for PROG_UNWIND_PHP. 
+static inline __attribute__((__always_inline__)) int unwind_php(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) @@ -239,3 +240,4 @@ int unwind_php(struct pt_regs *ctx) { tail_call(ctx, unwinder); return -1; } +MULTI_USE_FUNC(unwind_php) diff --git a/support/ebpf/python_tracer.ebpf.c b/support/ebpf/python_tracer.ebpf.c index d99147be..9d03375f 100644 --- a/support/ebpf/python_tracer.ebpf.c +++ b/support/ebpf/python_tracer.ebpf.c @@ -276,7 +276,7 @@ ErrorCode get_PyFrame(const PyProcInfo *pyinfo, void **frame) { // unwind_python is the entry point for tracing when invoked from the native tracer // or interpreter dispatcher. It does not reset the trace object and will append the // Python stack frames to the trace object for the current CPU. -SEC("perf_event/unwind_python") +static inline __attribute__((__always_inline__)) int unwind_python(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) @@ -318,3 +318,4 @@ int unwind_python(struct pt_regs *ctx) { tail_call(ctx, unwinder); return -1; } +MULTI_USE_FUNC(unwind_python) diff --git a/support/ebpf/ruby_tracer.ebpf.c b/support/ebpf/ruby_tracer.ebpf.c index 41ecacaa..57fa1101 100644 --- a/support/ebpf/ruby_tracer.ebpf.c +++ b/support/ebpf/ruby_tracer.ebpf.c @@ -216,7 +216,8 @@ ErrorCode walk_ruby_stack(PerCPURecord *record, const RubyProcInfo *rubyinfo, return ERR_OK; } -SEC("perf_event/unwind_ruby") +// unwind_ruby is the tail call destination for PROG_UNWIND_RUBY. 
+static inline __attribute__((__always_inline__)) int unwind_ruby(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) @@ -273,3 +274,4 @@ int unwind_ruby(struct pt_regs *ctx) { tail_call(ctx, unwinder); return -1; } +MULTI_USE_FUNC(unwind_ruby) diff --git a/support/ebpf/tracemgmt.h b/support/ebpf/tracemgmt.h index c0e08a45..ab347190 100644 --- a/support/ebpf/tracemgmt.h +++ b/support/ebpf/tracemgmt.h @@ -10,6 +10,19 @@ #include "types.h" #include "errors.h" +// MULTI_USE_FUNC generates perf event and kprobe eBPF programs +// for a given function. +#define MULTI_USE_FUNC(func_name) \ + SEC("perf_event/"#func_name) \ + int perf_##func_name(struct pt_regs *ctx) { \ + return func_name(ctx); \ + } \ + \ + SEC("kprobe/"#func_name) \ + int kprobe_##func_name(struct pt_regs *ctx) { \ + return func_name(ctx); \ + } + // increment_metric increments the value of the given metricID by 1 static inline __attribute__((__always_inline__)) void increment_metric(u32 metricID) { @@ -443,10 +456,10 @@ int get_next_unwinder_after_interpreter(const PerCPURecord *record) { // tail_call is a wrapper around bpf_tail_call() and ensures that the number of tail calls is not // reached while unwinding the stack. static inline __attribute__((__always_inline__)) -void tail_call(void *ctx, int next) { +void tail_call(void *ctx, int next) { PerCPURecord *record = get_per_cpu_record(); if (!record) { - bpf_tail_call(ctx, &progs, PROG_UNWIND_STOP); + bpf_tail_call(ctx, &perf_progs, PROG_UNWIND_STOP); // In theory bpf_tail_call() should never return. But due to instruction reordering by the // compiler we have to place return here to bribe the verifier to accept this. 
return; @@ -464,7 +477,237 @@ void tail_call(void *ctx, int next) { } record->tailCalls += 1 ; - bpf_tail_call(ctx, &progs, next); + bpf_tail_call(ctx, &perf_progs, next); +} + +#ifndef __USER32_CS + // defined in arch/x86/include/asm/segment.h + #define GDT_ENTRY_DEFAULT_USER32_CS 4 + #define GDT_ENTRY_DEFAULT_USER_DS 5 + #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) + #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#endif + +#ifdef __aarch64__ +// Strips the PAC tag from a pointer. +// +// While all pointers can contain PAC tags, we only apply this function to code pointers, because +// that's where normalization is required to make the stack delta lookups work. Note that if that +// should ever change, we'd need a different mask for the data pointers, because it might diverge +// from the mask for code pointers. +static inline u64 normalize_pac_ptr(u64 ptr) { + // Retrieve PAC mask from the system config. + u32 key = 0; + SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); + if (!syscfg) { + // Unreachable: array maps are always fully initialized. + return ptr; + } + + // Mask off PAC bits. Since we're always applying this to usermode pointers that should have all + // the high bits set to 0, we don't need to consider the case of having to fill up the resulting + // hole with 1s (like we'd have to for kernel ptrs). + ptr &= syscfg->inverse_pac_mask; + return ptr; +} +#endif + +// Initialize state from pt_regs +static inline ErrorCode copy_state_regs(UnwindState *state, + struct pt_regs *regs, + bool interrupted_kernelmode) +{ +#if defined(__x86_64__) + // Check if the process is running in 32-bit mode on the x86_64 system. + // This check follows the Linux kernel implementation of user_64bit_mode() in + // arch/x86/include/asm/ptrace.h. 
+ if (regs->cs == __USER32_CS) { + return ERR_NATIVE_X64_32BIT_COMPAT_MODE; + } + state->pc = regs->ip; + state->sp = regs->sp; + state->fp = regs->bp; + state->rax = regs->ax; + state->r9 = regs->r9; + state->r11 = regs->r11; + state->r13 = regs->r13; + state->r15 = regs->r15; + + // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39 + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847 + state->return_address = interrupted_kernelmode && regs->orig_ax != -1; +#elif defined(__aarch64__) + // For backwards compatibility aarch64 can run 32-bit code. + // Check if the process is running in this 32-bit compat mod. + if (regs->pstate & PSR_MODE32_BIT) { + return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE; + } + state->pc = normalize_pac_ptr(regs->pc); + state->sp = regs->sp; + state->fp = regs->regs[29]; + state->lr = normalize_pac_ptr(regs->regs[30]); + state->r22 = regs->regs[22]; + + // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118 + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209 + // + // Note: We do not use `unwinder_mark_nonleaf_frame` here, + // because the frame is a leaf frame from the perspective of the user stack, + // regardless of whether we are in a syscall. + state->return_address = interrupted_kernelmode && regs->syscallno != -1; + state->lr_invalid = false; +#endif + + return ERR_OK; +} + +#ifndef TESTING_COREDUMP + +// Read the task's entry stack pt_regs. This has identical functionality +// to bpf_task_pt_regs which is emulated to support older kernels. +// Once kernel requirement is increased to 5.15 this can be replaced with +// the bpf_task_pt_regs() helper. 
+static inline +long get_task_pt_regs(struct task_struct *task, SystemConfig* syscfg) { + u64 stack_ptr = (u64)task + syscfg->task_stack_offset; + long stack_base; + if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void*) stack_ptr)) { + return 0; + } + return stack_base + syscfg->stack_ptregs_offset; +} + +// Determine whether the given pt_regs are from user-mode register context. +// This needs to detect also invalid pt_regs in case we its kernel thread stack +// without valid user mode pt_regs so is_kernel_address(pc) is not enough. +static inline +bool ptregs_is_usermode(struct pt_regs *regs) { +#if defined(__x86_64__) + // On x86_64 the user mode SS should always be __USER_DS. + if (regs->ss != __USER_DS) { + return false; + } + return true; +#elif defined(__aarch64__) + // Check if the processor state is in the EL0t what linux uses for usermode. + if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) { + return false; + } + return true; +#else +#error add support for new architecture +#endif +} + +// Extract the usermode pt_regs for current task. Use context given pt_regs +// if it is usermode regs, or resolve it via struct task_struct. +// +// State registers are not touched (get_pristine_per_cpu_record already reset it) +// if something fails. has_usermode_regs is set to true if a user-mode register +// context was found: not every thread that we interrupt will actually have +// a user-mode context (e.g. kernel worker threads won't). +static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, + UnwindState *state, + bool *has_usermode_regs) { + ErrorCode error; + + if (!ptregs_is_usermode(ctx)) { + u32 key = 0; + SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); + if (!syscfg) { + // Unreachable: array maps are always fully initialized. 
+ return ERR_UNREACHABLE; + } + + // Use the current task's entry pt_regs + struct task_struct *task = (struct task_struct *) bpf_get_current_task(); + long ptregs_addr = get_task_pt_regs(task, syscfg); + + struct pt_regs regs; + if (!ptregs_addr || bpf_probe_read_kernel(&regs, sizeof(regs), (void*) ptregs_addr)) { + increment_metric(metricID_UnwindNativeErrReadKernelModeRegs); + return ERR_NATIVE_READ_KERNELMODE_REGS; + } + + if (!ptregs_is_usermode(&regs)) { + // No usermode registers context found. + return ERR_OK; + } + error = copy_state_regs(state, &regs, true); + } else { + // User mode code interrupted, registers are available via the ebpf context. + error = copy_state_regs(state, ctx, false); + } + if (error == ERR_OK) { + DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp); + *has_usermode_regs = true; + } + return error; +} + +#else // TESTING_COREDUMP + +static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, + UnwindState *state, + bool *has_usermode_regs) { + // Coredumps provide always usermode pt_regs directly. + ErrorCode error = copy_state_regs(state, ctx, false); + if (error == ERR_OK) { + *has_usermode_regs = true; + } + return error; +} + +#endif // TESTING_COREDUMP + +static inline +int collect_trace(struct pt_regs *ctx, TraceOrigin origin, u32 pid, u32 tid, + u64 trace_timestamp, u64 off_cpu_time) { + // The trace is reused on each call to this function so we have to reset the + // variables used to maintain state. 
+ DEBUG_PRINT("Resetting CPU record"); + PerCPURecord *record = get_pristine_per_cpu_record(); + if (!record) { + return -1; + } + + Trace *trace = &record->trace; + trace->origin = origin; + trace->pid = pid; + trace->tid = tid; + trace->ktime = trace_timestamp; + trace->offtime = off_cpu_time; + if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) { + increment_metric(metricID_ErrBPFCurrentComm); + } + + // Get the kernel mode stack trace first + trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID); + DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id); + + // Recursive unwind frames + int unwinder = PROG_UNWIND_STOP; + bool has_usermode_regs = false; + ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs); + if (error || !has_usermode_regs) { + goto exit; + } + + if (!pid_information_exists(ctx, pid)) { + if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) { + increment_metric(metricID_NumProcNew); + } + return 0; + } + error = get_next_unwinder_after_native_frame(record, &unwinder); + +exit: + record->state.unwind_error = error; + tail_call(ctx, unwinder); + DEBUG_PRINT("bpf_tail call failed for %d in native_tracer_entry", unwinder); + return -1; } #endif diff --git a/support/ebpf/tracer.ebpf.release.amd64 b/support/ebpf/tracer.ebpf.release.amd64 index d0f9056f..93097d5f 100644 Binary files a/support/ebpf/tracer.ebpf.release.amd64 and b/support/ebpf/tracer.ebpf.release.amd64 differ diff --git a/support/ebpf/tracer.ebpf.release.arm64 b/support/ebpf/tracer.ebpf.release.arm64 index fbf531cb..5621f4cd 100644 Binary files a/support/ebpf/tracer.ebpf.release.arm64 and b/support/ebpf/tracer.ebpf.release.arm64 differ diff --git a/support/ebpf/types.h b/support/ebpf/types.h index e5592ff8..da3a9efd 100644 --- a/support/ebpf/types.h +++ b/support/ebpf/types.h @@ -331,6 +331,17 @@ typedef enum TracePrograms { NUM_TRACER_PROGS, } TracePrograms; +// TraceOrigin describes the source of the trace. 
This enables +// origin specific handling of traces in user space. +typedef enum TraceOrigin { + TRACE_UNKNOWN, + TRACE_SAMPLING, + TRACE_OFF_CPU, +} TraceOrigin; + +// OFF_CPU_THRESHOLD_MAX defines the maximum threshold. +#define OFF_CPU_THRESHOLD_MAX 1000 + // MAX_FRAME_UNWINDS defines the maximum number of frames per // Trace we can unwind and respect the limit of eBPF instructions, // limit of tail calls and limit of stack size per eBPF program. @@ -532,6 +543,13 @@ typedef struct Trace { s32 kernel_stack_id; // The number of frames in the stack. u32 stack_len; + + // origin indicates the source of the trace. + TraceOrigin origin; + + // offtime stores the nanoseconds that the trace was off-cpu for. + u64 offtime; + // The frames of the stack trace. Frame frames[MAX_FRAME_UNWINDS]; @@ -851,6 +869,9 @@ typedef struct SystemConfig { // The offset of struct pt_regs within the kernel entry stack. u32 stack_ptregs_offset; + // User defined threshold for off-cpu profiling. + u32 off_cpu_threshold; + // Enables the temporary hack that drops pure errors frames in unwind_stop. bool drop_error_only_traces; } SystemConfig; diff --git a/support/ebpf/v8_tracer.ebpf.c b/support/ebpf/v8_tracer.ebpf.c index d6da2a71..1faa1834 100644 --- a/support/ebpf/v8_tracer.ebpf.c +++ b/support/ebpf/v8_tracer.ebpf.c @@ -284,7 +284,7 @@ ErrorCode unwind_one_v8_frame(PerCPURecord *record, V8ProcInfo *vi, bool top) { // unwind_v8 is the entry point for tracing when invoked from the native tracer // or interpreter dispatcher. It does not reset the trace object and will append the // V8 stack frames to the trace object for the current CPU. 
-SEC("perf_event/unwind_v8") +static inline __attribute__((__always_inline__)) int unwind_v8(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); if (!record) { @@ -328,3 +328,4 @@ int unwind_v8(struct pt_regs *ctx) { DEBUG_PRINT("v8: tail call for next frame unwinder (%d) failed", unwinder); return -1; } +MULTI_USE_FUNC(unwind_v8) diff --git a/support/types.go b/support/types.go index 6387e4d0..a37f38aa 100644 --- a/support/types.go +++ b/support/types.go @@ -105,3 +105,11 @@ const ( // PerfMaxStackDepth is the bpf map data array length for BPF_MAP_TYPE_STACK_TRACE traces PerfMaxStackDepth = C.PERF_MAX_STACK_DEPTH ) + +const ( + TraceOriginUnknown = C.TRACE_UNKNOWN + TraceOriginSampling = C.TRACE_SAMPLING + TraceOriginOffCPU = C.TRACE_OFF_CPU +) + +const OffCPUThresholdMax = C.OFF_CPU_THRESHOLD_MAX diff --git a/tracehandler/tracehandler.go b/tracehandler/tracehandler.go index be7f755b..a0f70a1d 100644 --- a/tracehandler/tracehandler.go +++ b/tracehandler/tracehandler.go @@ -117,6 +117,9 @@ func newTraceHandler(rep reporter.TraceReporter, traceProcessor TraceProcessor, } func (m *traceHandler) HandleTrace(bpfTrace *host.Trace) { + if bpfTrace == nil { + return + } defer m.traceProcessor.SymbolizationComplete(bpfTrace.KTime) timestamp := libpf.UnixTime64(bpfTrace.KTime.UnixNano()) @@ -129,6 +132,8 @@ func (m *traceHandler) HandleTrace(bpfTrace *host.Trace) { CPU: bpfTrace.CPU, ProcessName: bpfTrace.ProcessName, ExecutablePath: bpfTrace.ExecutablePath, + Origin: bpfTrace.Origin, + OffTime: bpfTrace.OffTime, } if !m.reporter.SupportsReportTraceEvent() { diff --git a/tracer/ebpf_integration_test.go b/tracer/ebpf_integration_test.go index 19e3c648..124c1679 100644 --- a/tracer/ebpf_integration_test.go +++ b/tracer/ebpf_integration_test.go @@ -56,7 +56,7 @@ func runKernelFrameProbe(t *testing.T, tracer *Tracer) { require.NoError(t, err) defer restoreRlimit() - prog, err := cebpf.NewProgram(coll.Programs["tracepoint__sched_switch"]) + prog, err := 
cebpf.NewProgram(coll.Programs["tracepoint_integration__sched_switch"]) require.NoError(t, err) defer prog.Close() @@ -136,6 +136,7 @@ func TestTraceTransmissionAndParsing(t *testing.T) { BPFVerifierLogLevel: 0, ProbabilisticInterval: 100, ProbabilisticThreshold: 100, + OffCPUThreshold: support.OffCPUThresholdMax, }) require.NoError(t, err) @@ -255,7 +256,14 @@ func TestAllTracers(t *testing.T) { kernelSymbols, err := proc.GetKallsyms("/proc/kallsyms") require.NoError(t, err) - _, _, err = initializeMapsAndPrograms(tracertypes.AllTracers(), kernelSymbols, - false, 1, false, false, 0) + _, _, err = initializeMapsAndPrograms(kernelSymbols, &Config{ + IncludeTracers: tracertypes.AllTracers(), + MapScaleFactor: 1, + FilterErrorFrames: false, + KernelVersionCheck: false, + DebugTracer: false, + BPFVerifierLogLevel: 0, + OffCPUThreshold: 10, + }) require.NoError(t, err) } diff --git a/tracer/systemconfig.go b/tracer/systemconfig.go index 0e3a3daf..b8dc3ad4 100644 --- a/tracer/systemconfig.go +++ b/tracer/systemconfig.go @@ -227,7 +227,7 @@ func determineStackLayout(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map func loadSystemConfig(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map, kernelSymbols *libpf.SymbolMap, includeTracers types.IncludedTracers, - filterErrorFrames bool) error { + offCPUThreshold uint32, filterErrorFrames bool) error { pacMask := pacmask.GetPACMask() if pacMask != 0 { log.Infof("Determined PAC mask to be 0x%016X", pacMask) @@ -237,6 +237,7 @@ func loadSystemConfig(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map, syscfg := C.SystemConfig{ inverse_pac_mask: ^C.u64(pacMask), drop_error_only_traces: C.bool(filterErrorFrames), + off_cpu_threshold: C.u32(offCPUThreshold), } if err := parseBTF(&syscfg); err != nil { diff --git a/tracer/tracer.go b/tracer/tracer.go index 4cb624b2..9237fbba 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -19,6 +19,7 @@ import ( "unsafe" cebpf "github.com/cilium/ebpf" + 
"github.com/cilium/ebpf/asm" "github.com/cilium/ebpf/link" "github.com/elastic/go-perf" log "github.com/sirupsen/logrus" @@ -153,6 +154,8 @@ type Config struct { ProbabilisticInterval time.Duration // ProbabilisticThreshold is the threshold for probabilistic profiling. ProbabilisticThreshold uint + // OffCPUThreshold is the user defined threshold for off-cpu profiling. + OffCPUThreshold uint32 } // hookPoint specifies the group and name of the hooked point in the kernel. @@ -160,6 +163,18 @@ type hookPoint struct { group, name string } +// progLoaderHelper supports the loading process of eBPF programs. +type progLoaderHelper struct { + // enable tells whether a prog shall be loaded. + enable bool + // name of the eBPF program + name string + // progID defines the ID for the eBPF program that is used as key in the tailcallMap. + progID uint32 + // noTailCallTarget indicates if this eBPF program should be added to the tailcallMap. + noTailCallTarget bool +} + // processKernelModulesMetadata computes the FileID of kernel files and reports executable metadata // for all kernel modules and the vmlinux image. func processKernelModulesMetadata(rep reporter.SymbolReporter, kernelModules *libpf.SymbolMap, @@ -267,9 +282,7 @@ func NewTracer(ctx context.Context, cfg *Config) (*Tracer, error) { } // Based on includeTracers we decide later which are loaded into the kernel. - ebpfMaps, ebpfProgs, err := initializeMapsAndPrograms(cfg.IncludeTracers, kernelSymbols, - cfg.FilterErrorFrames, cfg.MapScaleFactor, cfg.KernelVersionCheck, cfg.DebugTracer, - cfg.BPFVerifierLogLevel) + ebpfMaps, ebpfProgs, err := initializeMapsAndPrograms(kernelSymbols, cfg) if err != nil { return nil, fmt.Errorf("failed to load eBPF code: %v", err) } @@ -369,9 +382,7 @@ func buildStackDeltaTemplates(coll *cebpf.CollectionSpec) error { // initializeMapsAndPrograms loads the definitions for the eBPF maps and programs provided // by the embedded elf file and loads these into the kernel. 
-func initializeMapsAndPrograms(includeTracers types.IncludedTracers, - kernelSymbols *libpf.SymbolMap, filterErrorFrames bool, mapScaleFactor int, - kernelVersionCheck bool, debugTracer bool, bpfVerifierLogLevel uint32) ( +func initializeMapsAndPrograms(kernelSymbols *libpf.SymbolMap, cfg *Config) ( ebpfMaps map[string]*cebpf.Map, ebpfProgs map[string]*cebpf.Program, err error) { // Loading specifications about eBPF programs and maps from the embedded elf file // does not load them into the kernel. @@ -379,7 +390,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, // References to eBPF maps in the eBPF programs are just placeholders that need to be // replaced by the actual loaded maps later on with RewriteMaps before loading the // programs into the kernel. - coll, err := support.LoadCollectionSpec(debugTracer) + coll, err := support.LoadCollectionSpec(cfg.DebugTracer) if err != nil { return nil, nil, fmt.Errorf("failed to load specification for tracers: %v", err) } @@ -395,7 +406,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, // Load all maps into the kernel that are used later on in eBPF programs. So we can rewrite // in the next step the placesholders in the eBPF programs with the file descriptors of the // loaded maps in the kernel. 
- if err = loadAllMaps(coll, ebpfMaps, mapScaleFactor); err != nil { + if err = loadAllMaps(coll, cfg, ebpfMaps); err != nil { return nil, nil, fmt.Errorf("failed to load eBPF maps: %v", err) } @@ -406,7 +417,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, return nil, nil, fmt.Errorf("failed to rewrite maps: %v", err) } - if kernelVersionCheck { + if cfg.KernelVersionCheck { var major, minor, patch uint32 major, minor, patch, err = GetCurrentKernelVersion() if err != nil { @@ -426,13 +437,68 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, } } - if err = loadUnwinders(coll, ebpfProgs, ebpfMaps["progs"], includeTracers, - bpfVerifierLogLevel); err != nil { - return nil, nil, fmt.Errorf("failed to load eBPF programs: %v", err) + tailCallProgs := []progLoaderHelper{ + { + progID: uint32(support.ProgUnwindStop), + name: "unwind_stop", + enable: true, + }, + { + progID: uint32(support.ProgUnwindNative), + name: "unwind_native", + enable: true, + }, + { + progID: uint32(support.ProgUnwindHotspot), + name: "unwind_hotspot", + enable: cfg.IncludeTracers.Has(types.HotspotTracer), + }, + { + progID: uint32(support.ProgUnwindPerl), + name: "unwind_perl", + enable: cfg.IncludeTracers.Has(types.PerlTracer), + }, + { + progID: uint32(support.ProgUnwindPHP), + name: "unwind_php", + enable: cfg.IncludeTracers.Has(types.PHPTracer), + }, + { + progID: uint32(support.ProgUnwindPython), + name: "unwind_python", + enable: cfg.IncludeTracers.Has(types.PythonTracer), + }, + { + progID: uint32(support.ProgUnwindRuby), + name: "unwind_ruby", + enable: cfg.IncludeTracers.Has(types.RubyTracer), + }, + { + progID: uint32(support.ProgUnwindV8), + name: "unwind_v8", + enable: cfg.IncludeTracers.Has(types.V8Tracer), + }, + { + progID: uint32(support.ProgUnwindDotnet), + name: "unwind_dotnet", + enable: cfg.IncludeTracers.Has(types.DotnetTracer), + }, + } + + if err = loadPerfUnwinders(coll, ebpfProgs, ebpfMaps["perf_progs"], tailCallProgs, + 
cfg.BPFVerifierLogLevel); err != nil { + return nil, nil, fmt.Errorf("failed to load perf eBPF programs: %v", err) + } + + if cfg.OffCPUThreshold < support.OffCPUThresholdMax { + if err = loadKProbeUnwinders(coll, ebpfProgs, ebpfMaps["kprobe_progs"], tailCallProgs, + cfg.BPFVerifierLogLevel, ebpfMaps["perf_progs"].FD()); err != nil { + return nil, nil, fmt.Errorf("failed to load kprobe eBPF programs: %v", err) + } } - if err = loadSystemConfig(coll, ebpfMaps, kernelSymbols, includeTracers, - filterErrorFrames); err != nil { + if err = loadSystemConfig(coll, ebpfMaps, kernelSymbols, cfg.IncludeTracers, + cfg.OffCPUThreshold, cfg.FilterErrorFrames); err != nil { return nil, nil, fmt.Errorf("failed to load system config: %v", err) } @@ -457,8 +523,8 @@ func removeTemporaryMaps(ebpfMaps map[string]*cebpf.Map) error { } // loadAllMaps loads all eBPF maps that are used in our eBPF programs. -func loadAllMaps(coll *cebpf.CollectionSpec, ebpfMaps map[string]*cebpf.Map, - mapScaleFactor int) error { +func loadAllMaps(coll *cebpf.CollectionSpec, cfg *Config, + ebpfMaps map[string]*cebpf.Map) error { restoreRlimit, err := rlimit.MaximizeMemlock() if err != nil { return fmt.Errorf("failed to adjust rlimit: %v", err) @@ -479,13 +545,20 @@ func loadAllMaps(coll *cebpf.CollectionSpec, ebpfMaps map[string]*cebpf.Map, ) adaption["pid_page_to_mapping_info"] = - 1 << uint32(pidPageMappingInfoSize+mapScaleFactor) + 1 << uint32(pidPageMappingInfoSize+cfg.MapScaleFactor) adaption["stack_delta_page_to_info"] = - 1 << uint32(stackDeltaPageToInfoSize+mapScaleFactor) + 1 << uint32(stackDeltaPageToInfoSize+cfg.MapScaleFactor) + + // To not loose too many scheduling events but also not oversize + // sched_times, calculate a size based on some assumptions. + // On modern systems /proc/sys/kernel/pid_max defaults to 4194304. + // Try to fit this PID space scaled down with cfg.OffCPUThreshold into + // this map. 
+ adaption["sched_times"] = (4194304 / support.OffCPUThresholdMax) * cfg.OffCPUThreshold for i := support.StackDeltaBucketSmallest; i <= support.StackDeltaBucketLargest; i++ { mapName := fmt.Sprintf("exe_id_to_%d_stack_deltas", i) - adaption[mapName] = 1 << uint32(exeIDToStackDeltasSize+mapScaleFactor) + adaption[mapName] = 1 << uint32(exeIDToStackDeltasSize+cfg.MapScaleFactor) } for mapName, mapSpec := range coll.Maps { @@ -493,6 +566,11 @@ func loadAllMaps(coll *cebpf.CollectionSpec, ebpfMaps map[string]*cebpf.Map, log.Debugf("Size of eBPF map %s: %v", mapName, newSize) mapSpec.MaxEntries = newSize } + if mapName == "sched_times" && + cfg.OffCPUThreshold >= support.OffCPUThresholdMax { + // Off CPU Profiling is not enabled. So do not load this map. + continue + } ebpfMap, err := cebpf.NewMap(mapSpec) if err != nil { return fmt.Errorf("failed to load %s: %v", mapName, err) @@ -503,126 +581,170 @@ func loadAllMaps(coll *cebpf.CollectionSpec, ebpfMaps map[string]*cebpf.Map, return nil } -// loadUnwinders just satisfies the proof of concept and loads all eBPF programs -func loadUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program, - tailcallMap *cebpf.Map, includeTracers types.IncludedTracers, +// loadPerfUnwinders loads all perf eBPF Programs and their tail call targets. +func loadPerfUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program, + tailcallMap *cebpf.Map, tailCallProgs []progLoaderHelper, bpfVerifierLogLevel uint32) error { - restoreRlimit, err := rlimit.MaximizeMemlock() - if err != nil { - return fmt.Errorf("failed to adjust rlimit: %v", err) + programOptions := cebpf.ProgramOptions{ + LogLevel: cebpf.LogLevel(bpfVerifierLogLevel), } - defer restoreRlimit() - type prog struct { - // enable tells whether a prog shall be loaded. - enable bool - // name of the eBPF program - name string - // progID defines the ID for the eBPF program that is used as key in the tailcallMap. 
- progID uint32 - // noTailCallTarget indicates if this eBPF program should be added to the tailcallMap. - noTailCallTarget bool + progs := make([]progLoaderHelper, len(tailCallProgs)+2) + copy(progs, tailCallProgs) + progs = append(progs, + progLoaderHelper{ + name: "tracepoint__sched_process_exit", + noTailCallTarget: true, + enable: true, + }, + progLoaderHelper{ + name: "native_tracer_entry", + noTailCallTarget: true, + enable: true, + }) + + for _, unwindProg := range progs { + if !unwindProg.enable { + continue + } + + unwindProgName := unwindProg.name + if !unwindProg.noTailCallTarget { + unwindProgName = "perf_" + unwindProg.name + } + + progSpec, ok := coll.Programs[unwindProgName] + if !ok { + return fmt.Errorf("program %s does not exist", unwindProgName) + } + + if err := loadProgram(ebpfProgs, tailcallMap, unwindProg.progID, progSpec, + programOptions, unwindProg.noTailCallTarget); err != nil { + return err + } } + return nil +} + +// progArrayReferences returns a list of instructions which load a specified tail +// call FD. +func progArrayReferences(perfTailCallMapFD int, insns asm.Instructions) []int { + insNos := []int{} + for i := range insns { + ins := &insns[i] + if asm.OpCode(ins.OpCode.Class()) != asm.OpCode(asm.LdClass) { + continue + } + m := ins.Map() + if m == nil { + continue + } + if perfTailCallMapFD == m.FD() { + insNos = append(insNos, i) + } + } + return insNos +} + +// loadKProbeUnwinders reuses large parts of loadPerfUnwinders. By default all eBPF programs +// are written as perf event eBPF programs. loadKProbeUnwinders dynamically rewrites the +// specification of these programs to kprobe eBPF programs and adjusts tail call maps. 
+func loadKProbeUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program, + tailcallMap *cebpf.Map, tailCallProgs []progLoaderHelper, + bpfVerifierLogLevel uint32, perfTailCallMapFD int) error { programOptions := cebpf.ProgramOptions{ LogLevel: cebpf.LogLevel(bpfVerifierLogLevel), } - for _, unwindProg := range []prog{ - { - progID: uint32(support.ProgUnwindStop), - name: "unwind_stop", - enable: true, - }, - { - progID: uint32(support.ProgUnwindNative), - name: "unwind_native", - enable: true, - }, - { - progID: uint32(support.ProgUnwindHotspot), - name: "unwind_hotspot", - enable: includeTracers.Has(types.HotspotTracer), - }, - { - progID: uint32(support.ProgUnwindPerl), - name: "unwind_perl", - enable: includeTracers.Has(types.PerlTracer), - }, - { - progID: uint32(support.ProgUnwindPHP), - name: "unwind_php", - enable: includeTracers.Has(types.PHPTracer), - }, - { - progID: uint32(support.ProgUnwindPython), - name: "unwind_python", - enable: includeTracers.Has(types.PythonTracer), - }, - { - progID: uint32(support.ProgUnwindRuby), - name: "unwind_ruby", - enable: includeTracers.Has(types.RubyTracer), - }, - { - progID: uint32(support.ProgUnwindV8), - name: "unwind_v8", - enable: includeTracers.Has(types.V8Tracer), - }, - { - progID: uint32(support.ProgUnwindDotnet), - name: "unwind_dotnet", - enable: includeTracers.Has(types.DotnetTracer), - }, - { - name: "tracepoint__sched_process_exit", + progs := make([]progLoaderHelper, len(tailCallProgs)+2) + copy(progs, tailCallProgs) + progs = append(progs, + progLoaderHelper{ + name: "finish_task_switch", noTailCallTarget: true, enable: true, }, - { - name: "native_tracer_entry", + progLoaderHelper{ + name: "tracepoint__sched_switch", noTailCallTarget: true, enable: true, }, - } { + ) + + for _, unwindProg := range progs { if !unwindProg.enable { continue } - // Load the eBPF program into the kernel. If no error is returned, - // the eBPF program can be used/called/triggered from now on. 
- unwinder, err := cebpf.NewProgramWithOptions(coll.Programs[unwindProg.name], - programOptions) - if err != nil { - // These errors tend to have hundreds of lines (or more), - // so we print each line individually. - if ve, ok := err.(*cebpf.VerifierError); ok { - for _, line := range ve.Log { - log.Error(line) - } - } else { - scanner := bufio.NewScanner(strings.NewReader(err.Error())) - for scanner.Scan() { - log.Error(scanner.Text()) - } + unwindProgName := unwindProg.name + if !unwindProg.noTailCallTarget { + unwindProgName = "kprobe_" + unwindProg.name + } + + progSpec, ok := coll.Programs[unwindProgName] + if !ok { + return fmt.Errorf("program %s does not exist", unwindProgName) + } + + // Replace the prog array for the tail calls. + insns := progArrayReferences(perfTailCallMapFD, progSpec.Instructions) + for _, ins := range insns { + if err := progSpec.Instructions[ins].AssociateMap(tailcallMap); err != nil { + return fmt.Errorf("failed to rewrite map ptr: %v", err) } - return fmt.Errorf("failed to load %s", unwindProg.name) } - ebpfProgs[unwindProg.name] = unwinder - fd := uint32(unwinder.FD()) - if unwindProg.noTailCallTarget { - continue + if err := loadProgram(ebpfProgs, tailcallMap, unwindProg.progID, progSpec, + programOptions, unwindProg.noTailCallTarget); err != nil { + return err } - if err := tailcallMap.Update(unsafe.Pointer(&unwindProg.progID), unsafe.Pointer(&fd), - cebpf.UpdateAny); err != nil { - // Every eBPF program that is loaded within loadUnwinders can be the - // destination of a tail call of another eBPF program. If we can not update - // the eBPF map that manages these destinations our unwinding will fail. - return fmt.Errorf("failed to update tailcall map: %v", err) + } + + return nil +} + +// loadProgram loads an eBPF program from progSpec and populates the related maps. 
+func loadProgram(ebpfProgs map[string]*cebpf.Program, tailcallMap *cebpf.Map, + progID uint32, progSpec *cebpf.ProgramSpec, programOptions cebpf.ProgramOptions, + noTailCallTarget bool) error { + restoreRlimit, err := rlimit.MaximizeMemlock() + if err != nil { + return fmt.Errorf("failed to adjust rlimit: %v", err) + } + defer restoreRlimit() + + // Load the eBPF program into the kernel. If no error is returned, + // the eBPF program can be used/called/triggered from now on. + unwinder, err := cebpf.NewProgramWithOptions(progSpec, programOptions) + if err != nil { + // These errors tend to have hundreds of lines (or more), + // so we print each line individually. + if ve, ok := err.(*cebpf.VerifierError); ok { + for _, line := range ve.Log { + log.Error(line) + } + } else { + scanner := bufio.NewScanner(strings.NewReader(err.Error())) + for scanner.Scan() { + log.Error(scanner.Text()) + } } + return fmt.Errorf("failed to load %s", progSpec.Name) } + ebpfProgs[progSpec.Name] = unwinder + if noTailCallTarget { + return nil + } + fd := uint32(unwinder.FD()) + if err := tailcallMap.Update(unsafe.Pointer(&progID), unsafe.Pointer(&fd), + cebpf.UpdateAny); err != nil { + // Every eBPF program that is loaded within loadUnwinders can be the + // destination of a tail call of another eBPF program. If we can not update + // the eBPF map that manages these destinations our unwinding will fail. 
+ return fmt.Errorf("failed to update tailcall map: %v", err) + } return nil } @@ -865,18 +987,27 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) *host.Trace { APMTransactionID: *(*libpf.APMTransactionID)(unsafe.Pointer(&ptr.apm_transaction_id)), PID: pid, TID: libpf.PID(ptr.tid), + Origin: libpf.Origin(ptr.origin), + OffTime: int64(ptr.offtime), KTime: times.KTime(ptr.ktime), CPU: cpu, } + if trace.Origin != support.TraceOriginSampling && trace.Origin != support.TraceOriginOffCPU { + log.Warnf("Skip handling trace from unexpected %d origin", trace.Origin) + return nil + } + // Trace fields included in the hash: // - PID, kernel stack ID, length & frame array // Intentionally excluded: - // - ktime, COMM, APM trace, APM transaction ID + // - ktime, COMM, APM trace, APM transaction ID, Origin and Off Time ptr.comm = [16]C.char{} ptr.apm_trace_id = C.ApmTraceID{} ptr.apm_transaction_id = C.ApmSpanID{} ptr.ktime = 0 + ptr.origin = 0 + ptr.offtime = 0 trace.Hash = host.TraceHash(xxh3.Hash128(raw).Lo) userFrameOffs := 0 @@ -1162,6 +1293,39 @@ func (t *Tracer) StartProbabilisticProfiling(ctx context.Context) { }) } +// StartOffCPUProfiling starts off-cpu profiling by attaching the programs to the hooks. +func (t *Tracer) StartOffCPUProfiling() error { + // Attach the second hook for off-cpu profiling first. + kprobeProg, ok := t.ebpfProgs["finish_task_switch"] + if !ok { + return errors.New("off-cpu program finish_task_switch is not available") + } + + kprobeSymbol, err := t.kernelSymbols.LookupSymbolByPrefix("finish_task_switch") + if err != nil { + return errors.New("failed to find kernel symbol for finish_task_switch") + } + + kprobeLink, err := link.Kprobe(string(kprobeSymbol.Name), kprobeProg, nil) + if err != nil { + return err + } + t.hooks[hookPoint{group: "kprobe", name: "finish_task_switch"}] = kprobeLink + + // Attach the first hook that enables off-cpu profiling. 
+ tpProg, ok := t.ebpfProgs["tracepoint__sched_switch"] + if !ok { + return errors.New("tracepoint__sched_switch is not available") + } + tpLink, err := link.Tracepoint("sched", "sched_switch", tpProg, nil) + if err != nil { + return err + } + t.hooks[hookPoint{group: "sched", name: "sched_switch"}] = tpLink + + return nil +} + // TraceProcessor gets the trace processor. func (t *Tracer) TraceProcessor() tracehandler.TraceProcessor { return t.processManager