forgejo/modules/process/manager_stacktraces.go
zeripath c88547ce71
Add Goroutine stack inspector to admin/monitor (#19207)
Continues on from #19202.

Following the addition of pprof labels we can now more easily understand the relationship between goroutines and the requests that spawn them.

This PR takes advantage of the labels and adds a few others, then provides a mechanism for the monitoring page to query the pprof goroutine profile.

The binary profile that results from this query is immediately piped into the Google pprof library for parsing, and stack traces are then formed for the goroutines.

If the goroutine is within a context or has been created from a goroutine within a process context it will acquire the process description labels for that process. 

The goroutines are mapped to their associated PIDs, and any that do not have an associated PID are placed in a group at the bottom as unbound.

In this way we should be able to more easily examine goroutines that have been stuck.

A manager command `gitea manager processes` is also provided that can export the processes (with or without stacktraces) to the command line.

Signed-off-by: Andrew Thornton <art27@cantab.net>
2022-03-31 19:01:43 +02:00

355 lines
10 KiB
Go

// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package process
import (
"fmt"
"io"
"runtime/pprof"
"sort"
"time"
"github.com/google/pprof/profile"
)
// StackEntry describes a single frame of a stacktrace.
type StackEntry struct {
	// Function is the name of the function executing in this frame.
	Function string
	// File is the source file that defines Function.
	File string
	// Line is the line number recorded for this frame.
	Line int
}
// Label is a single name/value pprof label attached to a goroutine stack.
type Label struct {
	// Name is the label key.
	Name string
	// Value is the label value.
	Value string
}
// Stack is the stacktrace of one or more goroutines. (Several goroutines may
// share an identical stacktrace, in which case they are reported together.)
type Stack struct {
	// Count is the number of goroutines with this stack trace.
	Count int64
	// Description is a short name for the stack; ProcessStacktraces sets it to
	// the function of the last entry, or "(unknown)" when there are no entries.
	Description string
	// Labels holds the pprof labels kept for this stack. Omitted from JSON when empty.
	Labels []*Label `json:",omitempty"`
	// Entry holds the frames of the stacktrace. Omitted from JSON when empty.
	Entry []*StackEntry `json:",omitempty"`
}
// A Process combines the exported view of a managed process with the
// stacktraces of the goroutines associated with it.
type Process struct {
	// PID identifies the process.
	PID IDType
	// ParentPID identifies the parent process; it is empty for a process without a parent.
	ParentPID IDType
	// Description is the human readable description of the process.
	Description string
	// Start is the start time of the process; result lists are ordered by it.
	Start time.Time
	// Type is the process type (compared against SystemProcessType and NoneProcessType).
	Type string
	// Children holds the child processes when a process tree is requested. Omitted from JSON when empty.
	Children []*Process `json:",omitempty"`
	// Stacks holds the goroutine stacks associated with this process. Omitted from JSON when empty.
	Stacks []*Stack `json:",omitempty"`
}
// Processes returns a snapshot of the managed processes in a thread safe manner,
// along with the total number of processes registered at the time of the snapshot.
//
// With flat set every process is returned in a single list; otherwise processes
// whose parent is still registered are nested under that parent's Children and
// only the remaining processes appear at the top level.
//
// With noSystem set, processes of SystemProcessType are left out: in flat mode
// they are skipped, in tree mode each top-level system process is replaced by
// its children.
//
// The returned top-level list is ordered by start time, oldest first.
func (pm *Manager) Processes(flat, noSystem bool) ([]*Process, int) {
	pm.mutex.Lock()
	total := len(pm.processMap)
	result := make([]*Process, 0, total)
	if flat {
		for _, internal := range pm.processMap {
			if !noSystem || internal.Type != SystemProcessType {
				result = append(result, internal.toProcess())
			}
		}
	} else {
		// Exported copies created so far, keyed by PID, so that a process reached
		// both directly and as somebody's parent is only converted once.
		copies := map[IDType]*Process{}
		for _, internal := range pm.processMap {
			current, seen := copies[internal.PID]
			if !seen {
				current = internal.toProcess()
				copies[current.PID] = current
			}

			// A process with no parent, or whose parent is no longer registered,
			// becomes a root of the tree.
			internalParent, parentKnown := pm.processMap[internal.ParentPID]
			if current.ParentPID == "" || !parentKnown {
				result = append(result, current)
				continue
			}

			parent, seen := copies[current.ParentPID]
			if !seen {
				parent = internalParent.toProcess()
				copies[parent.PID] = parent
			}
			parent.Children = append(parent.Children, current)
		}
	}
	pm.mutex.Unlock()

	if noSystem && !flat {
		// Swap-remove every top-level system process and promote its children.
		// The index only advances past entries that are kept, so the element
		// swapped into the current slot is examined as well.
		for idx := 0; idx < len(result); {
			candidate := result[idx]
			if candidate.Type != SystemProcessType {
				idx++
				continue
			}
			last := len(result) - 1
			result[last], result[idx] = result[idx], result[last]
			result = append(result[:last], candidate.Children...)
		}
	}

	// Oldest process first.
	sort.Slice(result, func(a, b int) bool {
		return result[a].Start.Before(result[b].Start)
	})

	return result, total
}
// ProcessStacktraces gets the processes and stacktraces in a thread safe manner.
//
// The process snapshot and the "goroutine" pprof profile are both taken while
// the manager mutex is held. Every sample of the profile becomes a Stack that is
// attached to the Process named by the sample's PID label. Samples whose PID is
// not in the snapshot are attached to a synthesised "(dead process)" entry, and
// samples without a PID label are collected under a single "(unassociated)" entry.
//
// With flat set the processes are returned as a single list; otherwise processes
// whose parent is known are nested under that parent's Children. With noSystem
// set, processes of SystemProcessType and NoneProcessType are removed from the
// top level of the result (in tree mode their children are promoted instead).
//
// It returns the processes ordered newest first (in tree mode the children are
// ordered the same way, recursively), the number of processes registered with
// the manager when the snapshot was taken, the total number of goroutines
// reported by the profile, and an error if the profile could not be parsed or
// contained a label with an unexpected number of values.
func (pm *Manager) ProcessStacktraces(flat, noSystem bool) ([]*Process, int, int64, error) {
	// We cannot use the pm.ProcessMap here because we will release the mutex ...
	processMap := map[IDType]*Process{}

	// Lock the manager
	pm.mutex.Lock()
	processCount := len(pm.processMap)

	// Add a defer to unlock in case there is a panic
	unlocked := false
	defer func() {
		if !unlocked {
			pm.mutex.Unlock()
		}
	}()

	processes := make([]*Process, 0, len(pm.processMap))
	if flat {
		for _, internalProcess := range pm.processMap {
			// Every process goes in to the map (even hidden system processes) so that
			// their goroutines are not mistaken for those of a dead process below.
			process := internalProcess.toProcess()
			processMap[process.PID] = process
			if noSystem && internalProcess.Type == SystemProcessType {
				continue
			}
			processes = append(processes, process)
		}
	} else {
		for _, internalProcess := range pm.processMap {
			process, ok := processMap[internalProcess.PID]
			if !ok {
				process = internalProcess.toProcess()
				processMap[process.PID] = process
			}

			// Check its parent
			if process.ParentPID == "" {
				processes = append(processes, process)
				continue
			}

			internalParentProcess, ok := pm.processMap[internalProcess.ParentPID]
			if ok {
				parentProcess, ok := processMap[process.ParentPID]
				if !ok {
					parentProcess = internalParentProcess.toProcess()
					processMap[parentProcess.PID] = parentProcess
				}
				parentProcess.Children = append(parentProcess.Children, process)
				continue
			}

			// The parent is no longer registered so treat this process as a root.
			processes = append(processes, process)
		}
	}

	// Now from within the lock we need to get the goroutines.
	// Why? If we release the lock then between filling the above map and getting
	// the stacktraces another process could be created which would then look like a dead process below
	reader, writer := io.Pipe()
	defer reader.Close()
	go func() {
		err := pprof.Lookup("goroutine").WriteTo(writer, 0)
		// CloseWithError always returns nil so its result can be safely ignored.
		_ = writer.CloseWithError(err)
	}()
	stacks, err := profile.Parse(reader)
	if err != nil {
		// The deferred function above releases the mutex.
		return nil, 0, 0, err
	}

	// Unlock the mutex
	pm.mutex.Unlock()
	unlocked = true

	goroutineCount := int64(0)

	// Now walk through the "Sample" slice in the goroutines stack
	for _, sample := range stacks.Sample {
		// In the "goroutine" pprof profile each sample represents one or more goroutines
		// with the same labels and stacktraces.

		// We will represent each goroutine by a `Stack`
		stack := &Stack{}

		// Add the non-process associated labels from the goroutine sample to the Stack
		for name, value := range sample.Label {
			if name == DescriptionPProfLabel || name == PIDPProfLabel || (!flat && name == PPIDPProfLabel) || name == ProcessTypePProfLabel {
				continue
			}

			// Labels from the "goroutine" pprof profile only have one value.
			// This is because the underlying representation is a map[string]string
			if len(value) != 1 {
				// Unexpected...
				return nil, 0, 0, fmt.Errorf("label: %s in goroutine stack with unexpected number of values: %v", name, value)
			}

			stack.Labels = append(stack.Labels, &Label{Name: name, Value: value[0]})
		}

		// The number of goroutines that this sample represents is the `stack.Value[0]`
		stack.Count = sample.Value[0]
		goroutineCount += stack.Count

		// Now we want to associate this Stack with a Process.
		var process *Process

		// Try to get the PID from the goroutine labels
		if pidvalue, ok := sample.Label[PIDPProfLabel]; ok && len(pidvalue) == 1 {
			pid := IDType(pidvalue[0])

			// Now try to get the process from our map
			process, ok = processMap[pid]
			if !ok && pid != "" {
				// This means that no process has been found in the process map - but there was a process PID
				// Therefore this goroutine belongs to a dead process and it has escaped control of the process as it
				// should have died with the process context cancellation.

				// We need to create a dead process holder for this process and label it appropriately

				// get the parent PID
				ppid := IDType("")
				if value, ok := sample.Label[PPIDPProfLabel]; ok && len(value) == 1 {
					ppid = IDType(value[0])
				}

				// format the description
				description := "(dead process)"
				if value, ok := sample.Label[DescriptionPProfLabel]; ok && len(value) == 1 {
					description = value[0] + " " + description
				}

				// override the type of the process to NoneProcessType but add the old type as a label on the first stack
				ptype := NoneProcessType
				if value, ok := sample.Label[ProcessTypePProfLabel]; ok && len(value) == 1 {
					stack.Labels = append(stack.Labels, &Label{Name: ProcessTypePProfLabel, Value: value[0]})
				}
				process = &Process{
					PID:         pid,
					ParentPID:   ppid,
					Description: description,
					Type:        ptype,
				}

				// Now add the dead process back to the map and tree so we don't go back through this again.
				processMap[process.PID] = process
				added := false
				if process.ParentPID != "" && !flat {
					if parent, ok := processMap[process.ParentPID]; ok {
						parent.Children = append(parent.Children, process)
						added = true
					}
				}
				if !added {
					processes = append(processes, process)
				}
			}
		}

		if process == nil {
			// This means that the sample we're looking has no PID label
			var ok bool
			process, ok = processMap[""]
			if !ok {
				// this is the first time we've come across an unassociated goroutine so create a "process" to hold them
				process = &Process{
					Description: "(unassociated)",
					Type:        NoneProcessType,
				}
				processMap[process.PID] = process
				processes = append(processes, process)
			}
		}

		// The sample.Location represents a stack trace for this goroutine,
		// however each Location can represent multiple lines (mostly due to inlining)
		// so we need to walk the lines too
		for _, location := range sample.Location {
			for _, line := range location.Line {
				entry := &StackEntry{
					Function: line.Function.Name,
					File:     line.Function.Filename,
					Line:     int(line.Line),
				}
				stack.Entry = append(stack.Entry, entry)
			}
		}

		// Now we need a short-descriptive name to call the stack trace if when it is folded and
		// assuming the stack trace has some lines we'll choose the bottom of the stack (i.e. the
		// initial function that started the stack trace.) The top of the stack is unlikely to
		// be very helpful as a lot of the time it will be runtime.select or some other call into
		// a std library.
		stack.Description = "(unknown)"
		if len(stack.Entry) > 0 {
			stack.Description = stack.Entry[len(stack.Entry)-1].Function
		}

		process.Stacks = append(process.Stacks, stack)
	}

	// restrict to not show system processes
	if noSystem {
		for i := 0; i < len(processes); i++ {
			process := processes[i]
			if process.Type != SystemProcessType && process.Type != NoneProcessType {
				continue
			}
			// Swap-remove the hidden process and promote its children in its place.
			processes[len(processes)-1], processes[i] = processes[i], processes[len(processes)-1]
			processes = append(processes[:len(processes)-1], process.Children...)
			// Re-examine index i as it now holds the element that was swapped in.
			i--
		}
	}

	// Now finally re-sort the processes. Newest process appears first
	after := func(processes []*Process) func(i, j int) bool {
		return func(i, j int) bool {
			left, right := processes[i], processes[j]
			return left.Start.After(right.Start)
		}
	}

	sort.Slice(processes, after(processes))
	if !flat {
		// In tree mode the children of every process need the same ordering.
		var sortChildren func(process *Process)

		sortChildren = func(process *Process) {
			sort.Slice(process.Children, after(process.Children))
			for _, child := range process.Children {
				sortChildren(child)
			}
		}

		// Previously sortChildren was defined but never invoked, which left
		// every Children slice in map-iteration order.
		for _, process := range processes {
			sortChildren(process)
		}
	}

	return processes, processCount, goroutineCount, nil
}