Source file src/runtime/metrics_test.go

     1  // Copyright 2020 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package runtime_test
     6  
     7  import (
     8  	"bytes"
     9  	"internal/abi"
    10  	"internal/goexperiment"
    11  	"internal/profile"
    12  	"internal/testenv"
    13  	"os"
    14  	"reflect"
    15  	"runtime"
    16  	"runtime/debug"
    17  	"runtime/metrics"
    18  	"runtime/pprof"
    19  	"runtime/trace"
    20  	"slices"
    21  	"sort"
    22  	"strings"
    23  	"sync"
    24  	"sync/atomic"
    25  	"syscall"
    26  	"testing"
    27  	"time"
    28  	"unsafe"
    29  )
    30  
    31  func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) {
    32  	all := metrics.All()
    33  	samples := make([]metrics.Sample, len(all))
    34  	descs := make(map[string]metrics.Description)
    35  	for i := range all {
    36  		samples[i].Name = all[i].Name
    37  		descs[all[i].Name] = all[i]
    38  	}
    39  	return descs, samples
    40  }
    41  
    42  func TestReadMetrics(t *testing.T) {
    43  	// Run a GC cycle to get some of the stats to be non-zero.
    44  	runtime.GC()
    45  
    46  	// Set an arbitrary memory limit to check the metric for it
    47  	limit := int64(512 * 1024 * 1024)
    48  	oldLimit := debug.SetMemoryLimit(limit)
    49  	defer debug.SetMemoryLimit(oldLimit)
    50  
    51  	// Set a GC percent to check the metric for it
    52  	gcPercent := 99
    53  	oldGCPercent := debug.SetGCPercent(gcPercent)
    54  	defer debug.SetGCPercent(oldGCPercent)
    55  
    56  	// Tests whether readMetrics produces values aligning
    57  	// with ReadMemStats while the world is stopped.
    58  	var mstats runtime.MemStats
    59  	_, samples := prepareAllMetricsSamples()
    60  	runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples))
    61  
    62  	checkUint64 := func(t *testing.T, m string, got, want uint64) {
    63  		t.Helper()
    64  		if got != want {
    65  			t.Errorf("metric %q: got %d, want %d", m, got, want)
    66  		}
    67  	}
    68  
    69  	// Check to make sure the values we read line up with other values we read.
    70  	var allocsBySize, gcPauses, schedPausesTotalGC *metrics.Float64Histogram
    71  	var tinyAllocs uint64
    72  	var mallocs, frees uint64
    73  	for i := range samples {
    74  		switch name := samples[i].Name; name {
    75  		case "/cgo/go-to-c-calls:calls":
    76  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(runtime.NumCgoCall()))
    77  		case "/memory/classes/heap/free:bytes":
    78  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased)
    79  		case "/memory/classes/heap/released:bytes":
    80  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased)
    81  		case "/memory/classes/heap/objects:bytes":
    82  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc)
    83  		case "/memory/classes/heap/unused:bytes":
    84  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc)
    85  		case "/memory/classes/heap/stacks:bytes":
    86  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse)
    87  		case "/memory/classes/metadata/mcache/free:bytes":
    88  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse)
    89  		case "/memory/classes/metadata/mcache/inuse:bytes":
    90  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse)
    91  		case "/memory/classes/metadata/mspan/free:bytes":
    92  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse)
    93  		case "/memory/classes/metadata/mspan/inuse:bytes":
    94  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse)
    95  		case "/memory/classes/metadata/other:bytes":
    96  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys)
    97  		case "/memory/classes/os-stacks:bytes":
    98  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse)
    99  		case "/memory/classes/other:bytes":
   100  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys)
   101  		case "/memory/classes/profiling/buckets:bytes":
   102  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys)
   103  		case "/memory/classes/total:bytes":
   104  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys)
   105  		case "/gc/heap/allocs-by-size:bytes":
   106  			hist := samples[i].Value.Float64Histogram()
   107  			// Skip size class 0 in BySize, because it's always empty and not represented
   108  			// in the histogram.
   109  			for i, sc := range mstats.BySize[1:] {
   110  				if b, s := hist.Buckets[i+1], float64(sc.Size+1); b != s {
   111  					t.Errorf("bucket does not match size class: got %f, want %f", b, s)
   112  					// The rest of the checks aren't expected to work anyway.
   113  					continue
   114  				}
   115  				if c, m := hist.Counts[i], sc.Mallocs; c != m {
   116  					t.Errorf("histogram counts do not much BySize for class %d: got %d, want %d", i, c, m)
   117  				}
   118  			}
   119  			allocsBySize = hist
   120  		case "/gc/heap/allocs:bytes":
   121  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc)
   122  		case "/gc/heap/frees-by-size:bytes":
   123  			hist := samples[i].Value.Float64Histogram()
   124  			// Skip size class 0 in BySize, because it's always empty and not represented
   125  			// in the histogram.
   126  			for i, sc := range mstats.BySize[1:] {
   127  				if b, s := hist.Buckets[i+1], float64(sc.Size+1); b != s {
   128  					t.Errorf("bucket does not match size class: got %f, want %f", b, s)
   129  					// The rest of the checks aren't expected to work anyway.
   130  					continue
   131  				}
   132  				if c, f := hist.Counts[i], sc.Frees; c != f {
   133  					t.Errorf("histogram counts do not match BySize for class %d: got %d, want %d", i, c, f)
   134  				}
   135  			}
   136  		case "/gc/heap/frees:bytes":
   137  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc-mstats.HeapAlloc)
   138  		case "/gc/heap/tiny/allocs:objects":
   139  			// Currently, MemStats adds tiny alloc count to both Mallocs AND Frees.
   140  			// The reason for this is because MemStats couldn't be extended at the time
   141  			// but there was a desire to have Mallocs at least be a little more representative,
   142  			// while having Mallocs - Frees still represent a live object count.
   143  			// Unfortunately, MemStats doesn't actually export a large allocation count,
   144  			// so it's impossible to pull this number out directly.
   145  			//
   146  			// Check tiny allocation count outside of this loop, by using the allocs-by-size
   147  			// histogram in order to figure out how many large objects there are.
   148  			tinyAllocs = samples[i].Value.Uint64()
   149  			// Because the next two metrics tests are checking against Mallocs and Frees,
   150  			// we can't check them directly for the same reason: we need to account for tiny
   151  			// allocations included in Mallocs and Frees.
   152  		case "/gc/heap/allocs:objects":
   153  			mallocs = samples[i].Value.Uint64()
   154  		case "/gc/heap/frees:objects":
   155  			frees = samples[i].Value.Uint64()
   156  		case "/gc/heap/live:bytes":
   157  			// Check for "obviously wrong" values. We can't check a stronger invariant,
   158  			// such as live <= HeapAlloc, because live is not 100% accurate. It's computed
   159  			// under racy conditions, and some objects may be double-counted (this is
   160  			// intentional and necessary for GC performance).
   161  			//
   162  			// Instead, check against a much more reasonable upper-bound: the amount of
   163  			// mapped heap memory. We can't possibly overcount to the point of exceeding
   164  			// total mapped heap memory, except if there's an accounting bug.
   165  			if live := samples[i].Value.Uint64(); live > mstats.HeapSys {
   166  				t.Errorf("live bytes: %d > heap sys: %d", live, mstats.HeapSys)
   167  			} else if live == 0 {
   168  				// Might happen if we don't call runtime.GC() above.
   169  				t.Error("live bytes is 0")
   170  			}
   171  		case "/gc/gomemlimit:bytes":
   172  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(limit))
   173  		case "/gc/heap/objects:objects":
   174  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
   175  		case "/gc/heap/goal:bytes":
   176  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC)
   177  		case "/gc/gogc:percent":
   178  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(gcPercent))
   179  		case "/gc/cycles/automatic:gc-cycles":
   180  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC))
   181  		case "/gc/cycles/forced:gc-cycles":
   182  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC))
   183  		case "/gc/cycles/total:gc-cycles":
   184  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
   185  		case "/gc/pauses:seconds":
   186  			gcPauses = samples[i].Value.Float64Histogram()
   187  		case "/sched/pauses/total/gc:seconds":
   188  			schedPausesTotalGC = samples[i].Value.Float64Histogram()
   189  		}
   190  	}
   191  
   192  	// Check tinyAllocs.
   193  	nonTinyAllocs := uint64(0)
   194  	for _, c := range allocsBySize.Counts {
   195  		nonTinyAllocs += c
   196  	}
   197  	checkUint64(t, "/gc/heap/tiny/allocs:objects", tinyAllocs, mstats.Mallocs-nonTinyAllocs)
   198  
   199  	// Check allocation and free counts.
   200  	checkUint64(t, "/gc/heap/allocs:objects", mallocs, mstats.Mallocs-tinyAllocs)
   201  	checkUint64(t, "/gc/heap/frees:objects", frees, mstats.Frees-tinyAllocs)
   202  
   203  	// Verify that /gc/pauses:seconds is a copy of /sched/pauses/total/gc:seconds
   204  	if !slices.Equal(gcPauses.Buckets, schedPausesTotalGC.Buckets) {
   205  		t.Errorf("/gc/pauses:seconds buckets %v do not match /sched/pauses/total/gc:seconds buckets %v", gcPauses.Buckets, schedPausesTotalGC.Counts)
   206  	}
   207  	if !slices.Equal(gcPauses.Counts, schedPausesTotalGC.Counts) {
   208  		t.Errorf("/gc/pauses:seconds counts %v do not match /sched/pauses/total/gc:seconds counts %v", gcPauses.Counts, schedPausesTotalGC.Counts)
   209  	}
   210  }
   211  
   212  func TestReadMetricsConsistency(t *testing.T) {
   213  	// Tests whether readMetrics produces consistent, sensible values.
   214  	// The values are read concurrently with the runtime doing other
   215  	// things (e.g. allocating) so what we read can't reasonably compared
   216  	// to other runtime values (e.g. MemStats).
   217  
   218  	// Run a few GC cycles to get some of the stats to be non-zero.
   219  	runtime.GC()
   220  	runtime.GC()
   221  	runtime.GC()
   222  
   223  	// Set GOMAXPROCS high then sleep briefly to ensure we generate
   224  	// some idle time.
   225  	oldmaxprocs := runtime.GOMAXPROCS(10)
   226  	time.Sleep(time.Millisecond)
   227  	runtime.GOMAXPROCS(oldmaxprocs)
   228  
   229  	// Read all the supported metrics through the metrics package.
   230  	descs, samples := prepareAllMetricsSamples()
   231  	metrics.Read(samples)
   232  
   233  	// Check to make sure the values we read make sense.
   234  	var totalVirtual struct {
   235  		got, want uint64
   236  	}
   237  	var objects struct {
   238  		alloc, free             *metrics.Float64Histogram
   239  		allocs, frees           uint64
   240  		allocdBytes, freedBytes uint64
   241  		total, totalBytes       uint64
   242  	}
   243  	var gc struct {
   244  		numGC  uint64
   245  		pauses uint64
   246  	}
   247  	var totalScan struct {
   248  		got, want uint64
   249  	}
   250  	var cpu struct {
   251  		gcAssist    float64
   252  		gcDedicated float64
   253  		gcIdle      float64
   254  		gcPause     float64
   255  		gcTotal     float64
   256  
   257  		idle float64
   258  		user float64
   259  
   260  		scavengeAssist float64
   261  		scavengeBg     float64
   262  		scavengeTotal  float64
   263  
   264  		total float64
   265  	}
   266  	for i := range samples {
   267  		kind := samples[i].Value.Kind()
   268  		if want := descs[samples[i].Name].Kind; kind != want {
   269  			t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want)
   270  			continue
   271  		}
   272  		if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") {
   273  			v := samples[i].Value.Uint64()
   274  			totalVirtual.want += v
   275  
   276  			// None of these stats should ever get this big.
   277  			// If they do, there's probably overflow involved,
   278  			// usually due to bad accounting.
   279  			if int64(v) < 0 {
   280  				t.Errorf("%q has high/negative value: %d", samples[i].Name, v)
   281  			}
   282  		}
   283  		switch samples[i].Name {
   284  		case "/cpu/classes/gc/mark/assist:cpu-seconds":
   285  			cpu.gcAssist = samples[i].Value.Float64()
   286  		case "/cpu/classes/gc/mark/dedicated:cpu-seconds":
   287  			cpu.gcDedicated = samples[i].Value.Float64()
   288  		case "/cpu/classes/gc/mark/idle:cpu-seconds":
   289  			cpu.gcIdle = samples[i].Value.Float64()
   290  		case "/cpu/classes/gc/pause:cpu-seconds":
   291  			cpu.gcPause = samples[i].Value.Float64()
   292  		case "/cpu/classes/gc/total:cpu-seconds":
   293  			cpu.gcTotal = samples[i].Value.Float64()
   294  		case "/cpu/classes/idle:cpu-seconds":
   295  			cpu.idle = samples[i].Value.Float64()
   296  		case "/cpu/classes/scavenge/assist:cpu-seconds":
   297  			cpu.scavengeAssist = samples[i].Value.Float64()
   298  		case "/cpu/classes/scavenge/background:cpu-seconds":
   299  			cpu.scavengeBg = samples[i].Value.Float64()
   300  		case "/cpu/classes/scavenge/total:cpu-seconds":
   301  			cpu.scavengeTotal = samples[i].Value.Float64()
   302  		case "/cpu/classes/total:cpu-seconds":
   303  			cpu.total = samples[i].Value.Float64()
   304  		case "/cpu/classes/user:cpu-seconds":
   305  			cpu.user = samples[i].Value.Float64()
   306  		case "/memory/classes/total:bytes":
   307  			totalVirtual.got = samples[i].Value.Uint64()
   308  		case "/memory/classes/heap/objects:bytes":
   309  			objects.totalBytes = samples[i].Value.Uint64()
   310  		case "/gc/heap/objects:objects":
   311  			objects.total = samples[i].Value.Uint64()
   312  		case "/gc/heap/allocs:bytes":
   313  			objects.allocdBytes = samples[i].Value.Uint64()
   314  		case "/gc/heap/allocs:objects":
   315  			objects.allocs = samples[i].Value.Uint64()
   316  		case "/gc/heap/allocs-by-size:bytes":
   317  			objects.alloc = samples[i].Value.Float64Histogram()
   318  		case "/gc/heap/frees:bytes":
   319  			objects.freedBytes = samples[i].Value.Uint64()
   320  		case "/gc/heap/frees:objects":
   321  			objects.frees = samples[i].Value.Uint64()
   322  		case "/gc/heap/frees-by-size:bytes":
   323  			objects.free = samples[i].Value.Float64Histogram()
   324  		case "/gc/cycles:gc-cycles":
   325  			gc.numGC = samples[i].Value.Uint64()
   326  		case "/gc/pauses:seconds":
   327  			h := samples[i].Value.Float64Histogram()
   328  			gc.pauses = 0
   329  			for i := range h.Counts {
   330  				gc.pauses += h.Counts[i]
   331  			}
   332  		case "/gc/scan/heap:bytes":
   333  			totalScan.want += samples[i].Value.Uint64()
   334  		case "/gc/scan/globals:bytes":
   335  			totalScan.want += samples[i].Value.Uint64()
   336  		case "/gc/scan/stack:bytes":
   337  			totalScan.want += samples[i].Value.Uint64()
   338  		case "/gc/scan/total:bytes":
   339  			totalScan.got = samples[i].Value.Uint64()
   340  		case "/sched/gomaxprocs:threads":
   341  			if got, want := samples[i].Value.Uint64(), uint64(runtime.GOMAXPROCS(-1)); got != want {
   342  				t.Errorf("gomaxprocs doesn't match runtime.GOMAXPROCS: got %d, want %d", got, want)
   343  			}
   344  		case "/sched/goroutines:goroutines":
   345  			if samples[i].Value.Uint64() < 1 {
   346  				t.Error("number of goroutines is less than one")
   347  			}
   348  		}
   349  	}
   350  	// Only check this on Linux where we can be reasonably sure we have a high-resolution timer.
   351  	if runtime.GOOS == "linux" {
   352  		if cpu.gcDedicated <= 0 && cpu.gcAssist <= 0 && cpu.gcIdle <= 0 {
   353  			t.Errorf("found no time spent on GC work: %#v", cpu)
   354  		}
   355  		if cpu.gcPause <= 0 {
   356  			t.Errorf("found no GC pauses: %f", cpu.gcPause)
   357  		}
   358  		if cpu.idle <= 0 {
   359  			t.Errorf("found no idle time: %f", cpu.idle)
   360  		}
   361  		if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.001) {
   362  			t.Errorf("calculated total GC CPU time not within %%0.1 of total: %f vs. %f", total, cpu.gcTotal)
   363  		}
   364  		if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.001) {
   365  			t.Errorf("calculated total scavenge CPU not within %%0.1 of total: %f vs. %f", total, cpu.scavengeTotal)
   366  		}
   367  		if cpu.total <= 0 {
   368  			t.Errorf("found no total CPU time passed")
   369  		}
   370  		if cpu.user <= 0 {
   371  			t.Errorf("found no user time passed")
   372  		}
   373  		if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.001) {
   374  			t.Errorf("calculated total CPU not within %%0.1 of total: %f vs. %f", total, cpu.total)
   375  		}
   376  	}
   377  	if totalVirtual.got != totalVirtual.want {
   378  		t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
   379  	}
   380  	if got, want := objects.allocs-objects.frees, objects.total; got != want {
   381  		t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
   382  	}
   383  	if got, want := objects.allocdBytes-objects.freedBytes, objects.totalBytes; got != want {
   384  		t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
   385  	}
   386  	if b, c := len(objects.alloc.Buckets), len(objects.alloc.Counts); b != c+1 {
   387  		t.Errorf("allocs-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
   388  	}
   389  	if b, c := len(objects.free.Buckets), len(objects.free.Counts); b != c+1 {
   390  		t.Errorf("frees-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
   391  	}
   392  	if len(objects.alloc.Buckets) != len(objects.free.Buckets) {
   393  		t.Error("allocs-by-size and frees-by-size buckets don't match in length")
   394  	} else if len(objects.alloc.Counts) != len(objects.free.Counts) {
   395  		t.Error("allocs-by-size and frees-by-size counts don't match in length")
   396  	} else {
   397  		for i := range objects.alloc.Buckets {
   398  			ba := objects.alloc.Buckets[i]
   399  			bf := objects.free.Buckets[i]
   400  			if ba != bf {
   401  				t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf)
   402  			}
   403  		}
   404  		if !t.Failed() {
   405  			var gotAlloc, gotFree uint64
   406  			want := objects.total
   407  			for i := range objects.alloc.Counts {
   408  				if objects.alloc.Counts[i] < objects.free.Counts[i] {
   409  					t.Errorf("found more allocs than frees in object dist bucket %d", i)
   410  					continue
   411  				}
   412  				gotAlloc += objects.alloc.Counts[i]
   413  				gotFree += objects.free.Counts[i]
   414  			}
   415  			if got := gotAlloc - gotFree; got != want {
   416  				t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
   417  			}
   418  			if gotAlloc != objects.allocs {
   419  				t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotAlloc, objects.allocs)
   420  			}
   421  			if gotFree != objects.frees {
   422  				t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotFree, objects.frees)
   423  			}
   424  		}
   425  	}
   426  	// The current GC has at least 2 pauses per GC.
   427  	// Check to see if that value makes sense.
   428  	if gc.pauses < gc.numGC*2 {
   429  		t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2)
   430  	}
   431  	if totalScan.got <= 0 {
   432  		t.Errorf("scannable GC space is empty: %d", totalScan.got)
   433  	}
   434  	if totalScan.got != totalScan.want {
   435  		t.Errorf("/gc/scan/total:bytes doesn't line up with sum of /gc/scan*: total %d vs. sum %d", totalScan.got, totalScan.want)
   436  	}
   437  }
   438  
   439  func BenchmarkReadMetricsLatency(b *testing.B) {
   440  	stop := applyGCLoad(b)
   441  
   442  	// Spend this much time measuring latencies.
   443  	latencies := make([]time.Duration, 0, 1024)
   444  	_, samples := prepareAllMetricsSamples()
   445  
   446  	// Hit metrics.Read continuously and measure.
   447  	b.ResetTimer()
   448  	for i := 0; i < b.N; i++ {
   449  		start := time.Now()
   450  		metrics.Read(samples)
   451  		latencies = append(latencies, time.Since(start))
   452  	}
   453  	// Make sure to stop the timer before we wait! The load created above
   454  	// is very heavy-weight and not easy to stop, so we could end up
   455  	// confusing the benchmarking framework for small b.N.
   456  	b.StopTimer()
   457  	stop()
   458  
   459  	// Disable the default */op metrics.
   460  	// ns/op doesn't mean anything because it's an average, but we
   461  	// have a sleep in our b.N loop above which skews this significantly.
   462  	b.ReportMetric(0, "ns/op")
   463  	b.ReportMetric(0, "B/op")
   464  	b.ReportMetric(0, "allocs/op")
   465  
   466  	// Sort latencies then report percentiles.
   467  	sort.Slice(latencies, func(i, j int) bool {
   468  		return latencies[i] < latencies[j]
   469  	})
   470  	b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns")
   471  	b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
   472  	b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
   473  }
   474  
   475  var readMetricsSink [1024]interface{}
   476  
   477  func TestReadMetricsCumulative(t *testing.T) {
   478  	// Set up the set of metrics marked cumulative.
   479  	descs := metrics.All()
   480  	var samples [2][]metrics.Sample
   481  	samples[0] = make([]metrics.Sample, len(descs))
   482  	samples[1] = make([]metrics.Sample, len(descs))
   483  	total := 0
   484  	for i := range samples[0] {
   485  		if !descs[i].Cumulative {
   486  			continue
   487  		}
   488  		samples[0][total].Name = descs[i].Name
   489  		total++
   490  	}
   491  	samples[0] = samples[0][:total]
   492  	samples[1] = samples[1][:total]
   493  	copy(samples[1], samples[0])
   494  
   495  	// Start some noise in the background.
   496  	var wg sync.WaitGroup
   497  	wg.Add(1)
   498  	done := make(chan struct{})
   499  	go func() {
   500  		defer wg.Done()
   501  		for {
   502  			// Add more things here that could influence metrics.
   503  			for i := 0; i < 10; i++ {
   504  				runtime.AddCleanup(new(*int), func(_ struct{}) {}, struct{}{})
   505  				runtime.SetFinalizer(new(*int), func(_ **int) {})
   506  			}
   507  			for i := 0; i < len(readMetricsSink); i++ {
   508  				readMetricsSink[i] = make([]byte, 1024)
   509  				select {
   510  				case <-done:
   511  					return
   512  				default:
   513  				}
   514  			}
   515  			runtime.GC()
   516  		}
   517  	}()
   518  
   519  	sum := func(us []uint64) uint64 {
   520  		total := uint64(0)
   521  		for _, u := range us {
   522  			total += u
   523  		}
   524  		return total
   525  	}
   526  
   527  	// Populate the first generation.
   528  	metrics.Read(samples[0])
   529  
   530  	// Check to make sure that these metrics only grow monotonically.
   531  	for gen := 1; gen < 10; gen++ {
   532  		metrics.Read(samples[gen%2])
   533  		for i := range samples[gen%2] {
   534  			name := samples[gen%2][i].Name
   535  			vNew, vOld := samples[gen%2][i].Value, samples[1-(gen%2)][i].Value
   536  
   537  			switch vNew.Kind() {
   538  			case metrics.KindUint64:
   539  				new := vNew.Uint64()
   540  				old := vOld.Uint64()
   541  				if new < old {
   542  					t.Errorf("%s decreased: %d < %d", name, new, old)
   543  				}
   544  			case metrics.KindFloat64:
   545  				new := vNew.Float64()
   546  				old := vOld.Float64()
   547  				if new < old {
   548  					t.Errorf("%s decreased: %f < %f", name, new, old)
   549  				}
   550  			case metrics.KindFloat64Histogram:
   551  				new := sum(vNew.Float64Histogram().Counts)
   552  				old := sum(vOld.Float64Histogram().Counts)
   553  				if new < old {
   554  					t.Errorf("%s counts decreased: %d < %d", name, new, old)
   555  				}
   556  			}
   557  		}
   558  	}
   559  	close(done)
   560  
   561  	wg.Wait()
   562  }
   563  
   564  func withinEpsilon(v1, v2, e float64) bool {
   565  	return v2-v2*e <= v1 && v1 <= v2+v2*e
   566  }
   567  
   568  func TestMutexWaitTimeMetric(t *testing.T) {
   569  	var sample [1]metrics.Sample
   570  	sample[0].Name = "/sync/mutex/wait/total:seconds"
   571  
   572  	locks := []locker2{
   573  		new(mutex),
   574  		new(rwmutexWrite),
   575  		new(rwmutexReadWrite),
   576  		new(rwmutexWriteRead),
   577  	}
   578  	for _, lock := range locks {
   579  		t.Run(reflect.TypeOf(lock).Elem().Name(), func(t *testing.T) {
   580  			metrics.Read(sample[:])
   581  			before := time.Duration(sample[0].Value.Float64() * 1e9)
   582  
   583  			minMutexWaitTime := generateMutexWaitTime(lock)
   584  
   585  			metrics.Read(sample[:])
   586  			after := time.Duration(sample[0].Value.Float64() * 1e9)
   587  
   588  			if wt := after - before; wt < minMutexWaitTime {
   589  				t.Errorf("too little mutex wait time: got %s, want %s", wt, minMutexWaitTime)
   590  			}
   591  		})
   592  	}
   593  }
   594  
   595  // locker2 represents an API surface of two concurrent goroutines
   596  // locking the same resource, but through different APIs. It's intended
   597  // to abstract over the relationship of two Lock calls or an RLock
   598  // and a Lock call.
   599  type locker2 interface {
   600  	Lock1()
   601  	Unlock1()
   602  	Lock2()
   603  	Unlock2()
   604  }
   605  
   606  type mutex struct {
   607  	mu sync.Mutex
   608  }
   609  
   610  func (m *mutex) Lock1()   { m.mu.Lock() }
   611  func (m *mutex) Unlock1() { m.mu.Unlock() }
   612  func (m *mutex) Lock2()   { m.mu.Lock() }
   613  func (m *mutex) Unlock2() { m.mu.Unlock() }
   614  
   615  type rwmutexWrite struct {
   616  	mu sync.RWMutex
   617  }
   618  
   619  func (m *rwmutexWrite) Lock1()   { m.mu.Lock() }
   620  func (m *rwmutexWrite) Unlock1() { m.mu.Unlock() }
   621  func (m *rwmutexWrite) Lock2()   { m.mu.Lock() }
   622  func (m *rwmutexWrite) Unlock2() { m.mu.Unlock() }
   623  
   624  type rwmutexReadWrite struct {
   625  	mu sync.RWMutex
   626  }
   627  
   628  func (m *rwmutexReadWrite) Lock1()   { m.mu.RLock() }
   629  func (m *rwmutexReadWrite) Unlock1() { m.mu.RUnlock() }
   630  func (m *rwmutexReadWrite) Lock2()   { m.mu.Lock() }
   631  func (m *rwmutexReadWrite) Unlock2() { m.mu.Unlock() }
   632  
   633  type rwmutexWriteRead struct {
   634  	mu sync.RWMutex
   635  }
   636  
   637  func (m *rwmutexWriteRead) Lock1()   { m.mu.Lock() }
   638  func (m *rwmutexWriteRead) Unlock1() { m.mu.Unlock() }
   639  func (m *rwmutexWriteRead) Lock2()   { m.mu.RLock() }
   640  func (m *rwmutexWriteRead) Unlock2() { m.mu.RUnlock() }
   641  
   642  // generateMutexWaitTime causes a couple of goroutines
   643  // to block a whole bunch of times on a sync.Mutex, returning
   644  // the minimum amount of time that should be visible in the
   645  // /sync/mutex-wait:seconds metric.
   646  func generateMutexWaitTime(mu locker2) time.Duration {
   647  	// Set up the runtime to always track casgstatus transitions for metrics.
   648  	*runtime.CasGStatusAlwaysTrack = true
   649  
   650  	mu.Lock1()
   651  
   652  	// Start up a goroutine to wait on the lock.
   653  	gc := make(chan *runtime.G)
   654  	done := make(chan bool)
   655  	go func() {
   656  		gc <- runtime.Getg()
   657  
   658  		for {
   659  			mu.Lock2()
   660  			mu.Unlock2()
   661  			if <-done {
   662  				return
   663  			}
   664  		}
   665  	}()
   666  	gp := <-gc
   667  
   668  	// Set the block time high enough so that it will always show up, even
   669  	// on systems with coarse timer granularity.
   670  	const blockTime = 100 * time.Millisecond
   671  
   672  	// Make sure the goroutine spawned above actually blocks on the lock.
   673  	for {
   674  		if runtime.GIsWaitingOnMutex(gp) {
   675  			break
   676  		}
   677  		runtime.Gosched()
   678  	}
   679  
   680  	// Let some amount of time pass.
   681  	time.Sleep(blockTime)
   682  
   683  	// Let the other goroutine acquire the lock.
   684  	mu.Unlock1()
   685  	done <- true
   686  
   687  	// Reset flag.
   688  	*runtime.CasGStatusAlwaysTrack = false
   689  	return blockTime
   690  }
   691  
   692  // See issue #60276.
   693  func TestCPUMetricsSleep(t *testing.T) {
   694  	if runtime.GOOS == "wasip1" {
   695  		// Since wasip1 busy-waits in the scheduler, there's no meaningful idle
   696  		// time. This is accurately reflected in the metrics, but it means this
   697  		// test is basically meaningless on this platform.
   698  		t.Skip("wasip1 currently busy-waits in idle time; test not applicable")
   699  	}
   700  
   701  	names := []string{
   702  		"/cpu/classes/idle:cpu-seconds",
   703  
   704  		"/cpu/classes/gc/mark/assist:cpu-seconds",
   705  		"/cpu/classes/gc/mark/dedicated:cpu-seconds",
   706  		"/cpu/classes/gc/mark/idle:cpu-seconds",
   707  		"/cpu/classes/gc/pause:cpu-seconds",
   708  		"/cpu/classes/gc/total:cpu-seconds",
   709  		"/cpu/classes/scavenge/assist:cpu-seconds",
   710  		"/cpu/classes/scavenge/background:cpu-seconds",
   711  		"/cpu/classes/scavenge/total:cpu-seconds",
   712  		"/cpu/classes/total:cpu-seconds",
   713  		"/cpu/classes/user:cpu-seconds",
   714  	}
   715  	prep := func() []metrics.Sample {
   716  		mm := make([]metrics.Sample, len(names))
   717  		for i := range names {
   718  			mm[i].Name = names[i]
   719  		}
   720  		return mm
   721  	}
   722  	m1, m2 := prep(), prep()
   723  
   724  	const (
   725  		// Expected time spent idle.
   726  		dur = 100 * time.Millisecond
   727  
   728  		// maxFailures is the number of consecutive failures requires to cause the test to fail.
   729  		maxFailures = 10
   730  	)
   731  
   732  	failureIdleTimes := make([]float64, 0, maxFailures)
   733  
   734  	// If the bug we expect is happening, then the Sleep CPU time will be accounted for
   735  	// as user time rather than idle time. In an ideal world we'd expect the whole application
   736  	// to go instantly idle the moment this goroutine goes to sleep, and stay asleep for that
   737  	// duration. However, the Go runtime can easily eat into idle time while this goroutine is
   738  	// blocked in a sleep. For example, slow platforms might spend more time expected in the
   739  	// scheduler. Another example is that a Go runtime background goroutine could run while
   740  	// everything else is idle. Lastly, if a running goroutine is descheduled by the OS, enough
   741  	// time may pass such that the goroutine is ready to wake, even though the runtime couldn't
   742  	// observe itself as idle with nanotime.
   743  	//
   744  	// To deal with all this, we give a half-proc's worth of leniency.
   745  	//
   746  	// We also retry multiple times to deal with the fact that the OS might deschedule us before
   747  	// we yield and go idle. That has a rare enough chance that retries should resolve it.
   748  	// If the issue we expect is happening, it should be persistent.
   749  	minIdleCPUSeconds := dur.Seconds() * (float64(runtime.GOMAXPROCS(-1)) - 0.5)
   750  
   751  	// Let's make sure there's no background scavenge work to do.
   752  	//
   753  	// The runtime.GC calls below ensure the background sweeper
   754  	// will not run during the idle period.
   755  	debug.FreeOSMemory()
   756  
   757  	for retries := 0; retries < maxFailures; retries++ {
   758  		// Read 1.
   759  		runtime.GC() // Update /cpu/classes metrics.
   760  		metrics.Read(m1)
   761  
   762  		// Sleep.
   763  		time.Sleep(dur)
   764  
   765  		// Read 2.
   766  		runtime.GC() // Update /cpu/classes metrics.
   767  		metrics.Read(m2)
   768  
   769  		dt := m2[0].Value.Float64() - m1[0].Value.Float64()
   770  		if dt >= minIdleCPUSeconds {
   771  			// All is well. Test passed.
   772  			return
   773  		}
   774  		failureIdleTimes = append(failureIdleTimes, dt)
   775  		// Try again.
   776  	}
   777  
   778  	// We couldn't observe the expected idle time even once.
   779  	for i, dt := range failureIdleTimes {
   780  		t.Logf("try %2d: idle time = %.5fs\n", i+1, dt)
   781  	}
   782  	t.Logf("try %d breakdown:\n", len(failureIdleTimes))
   783  	for i := range names {
   784  		if m1[i].Value.Kind() == metrics.KindBad {
   785  			continue
   786  		}
   787  		t.Logf("\t%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64())
   788  	}
   789  	t.Errorf(`time.Sleep did not contribute enough to "idle" class: minimum idle time = %.5fs`, minIdleCPUSeconds)
   790  }
   791  
   792  // Call f() and verify that the correct STW metrics increment. If isGC is true,
   793  // fn triggers a GC STW. Otherwise, fn triggers an other STW.
   794  func testSchedPauseMetrics(t *testing.T, fn func(t *testing.T), isGC bool) {
   795  	m := []metrics.Sample{
   796  		{Name: "/sched/pauses/stopping/gc:seconds"},
   797  		{Name: "/sched/pauses/stopping/other:seconds"},
   798  		{Name: "/sched/pauses/total/gc:seconds"},
   799  		{Name: "/sched/pauses/total/other:seconds"},
   800  	}
   801  
   802  	stoppingGC := &m[0]
   803  	stoppingOther := &m[1]
   804  	totalGC := &m[2]
   805  	totalOther := &m[3]
   806  
   807  	sampleCount := func(s *metrics.Sample) uint64 {
   808  		h := s.Value.Float64Histogram()
   809  
   810  		var n uint64
   811  		for _, c := range h.Counts {
   812  			n += c
   813  		}
   814  		return n
   815  	}
   816  
   817  	// Read baseline.
   818  	metrics.Read(m)
   819  
   820  	baselineStartGC := sampleCount(stoppingGC)
   821  	baselineStartOther := sampleCount(stoppingOther)
   822  	baselineTotalGC := sampleCount(totalGC)
   823  	baselineTotalOther := sampleCount(totalOther)
   824  
   825  	fn(t)
   826  
   827  	metrics.Read(m)
   828  
   829  	if isGC {
   830  		if got := sampleCount(stoppingGC); got <= baselineStartGC {
   831  			t.Errorf("/sched/pauses/stopping/gc:seconds sample count %d did not increase from baseline of %d", got, baselineStartGC)
   832  		}
   833  		if got := sampleCount(totalGC); got <= baselineTotalGC {
   834  			t.Errorf("/sched/pauses/total/gc:seconds sample count %d did not increase from baseline of %d", got, baselineTotalGC)
   835  		}
   836  
   837  		if got := sampleCount(stoppingOther); got != baselineStartOther {
   838  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d changed from baseline of %d", got, baselineStartOther)
   839  		}
   840  		if got := sampleCount(totalOther); got != baselineTotalOther {
   841  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d changed from baseline of %d", got, baselineTotalOther)
   842  		}
   843  	} else {
   844  		if got := sampleCount(stoppingGC); got != baselineStartGC {
   845  			t.Errorf("/sched/pauses/stopping/gc:seconds sample count %d changed from baseline of %d", got, baselineStartGC)
   846  		}
   847  		if got := sampleCount(totalGC); got != baselineTotalGC {
   848  			t.Errorf("/sched/pauses/total/gc:seconds sample count %d changed from baseline of %d", got, baselineTotalGC)
   849  		}
   850  
   851  		if got := sampleCount(stoppingOther); got <= baselineStartOther {
   852  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d did not increase from baseline of %d", got, baselineStartOther)
   853  		}
   854  		if got := sampleCount(totalOther); got <= baselineTotalOther {
   855  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d did not increase from baseline of %d", got, baselineTotalOther)
   856  		}
   857  	}
   858  }
   859  
   860  func TestSchedPauseMetrics(t *testing.T) {
   861  	tests := []struct {
   862  		name string
   863  		isGC bool
   864  		fn   func(t *testing.T)
   865  	}{
   866  		{
   867  			name: "runtime.GC",
   868  			isGC: true,
   869  			fn: func(t *testing.T) {
   870  				runtime.GC()
   871  			},
   872  		},
   873  		{
   874  			name: "runtime.GOMAXPROCS",
   875  			fn: func(t *testing.T) {
   876  				if runtime.GOARCH == "wasm" {
   877  					t.Skip("GOMAXPROCS >1 not supported on wasm")
   878  				}
   879  
   880  				n := runtime.GOMAXPROCS(0)
   881  				defer runtime.GOMAXPROCS(n)
   882  
   883  				runtime.GOMAXPROCS(n + 1)
   884  			},
   885  		},
   886  		{
   887  			name: "runtime.GoroutineProfile",
   888  			fn: func(t *testing.T) {
   889  				var s [1]runtime.StackRecord
   890  				runtime.GoroutineProfile(s[:])
   891  			},
   892  		},
   893  		{
   894  			name: "runtime.ReadMemStats",
   895  			fn: func(t *testing.T) {
   896  				var mstats runtime.MemStats
   897  				runtime.ReadMemStats(&mstats)
   898  			},
   899  		},
   900  		{
   901  			name: "runtime.Stack",
   902  			fn: func(t *testing.T) {
   903  				var b [64]byte
   904  				runtime.Stack(b[:], true)
   905  			},
   906  		},
   907  		{
   908  			name: "runtime/debug.WriteHeapDump",
   909  			fn: func(t *testing.T) {
   910  				if runtime.GOOS == "js" {
   911  					t.Skip("WriteHeapDump not supported on js")
   912  				}
   913  
   914  				f, err := os.CreateTemp(t.TempDir(), "heapdumptest")
   915  				if err != nil {
   916  					t.Fatalf("os.CreateTemp failed: %v", err)
   917  				}
   918  				defer os.Remove(f.Name())
   919  				defer f.Close()
   920  				debug.WriteHeapDump(f.Fd())
   921  			},
   922  		},
   923  		{
   924  			name: "runtime/trace.Start",
   925  			fn: func(t *testing.T) {
   926  				if trace.IsEnabled() {
   927  					t.Skip("tracing already enabled")
   928  				}
   929  
   930  				var buf bytes.Buffer
   931  				if err := trace.Start(&buf); err != nil {
   932  					t.Errorf("trace.Start err got %v want nil", err)
   933  				}
   934  				trace.Stop()
   935  			},
   936  		},
   937  	}
   938  
   939  	// These tests count STW pauses, classified based on whether they're related
   940  	// to the GC or not. Disable automatic GC cycles during the test so we don't
   941  	// have an incidental GC pause when we're trying to observe only
   942  	// non-GC-related pauses. This is especially important for the
   943  	// runtime/trace.Start test, since (as of this writing) that will block
   944  	// until any active GC mark phase completes.
   945  	defer debug.SetGCPercent(debug.SetGCPercent(-1))
   946  	runtime.GC()
   947  
   948  	for _, tc := range tests {
   949  		t.Run(tc.name, func(t *testing.T) {
   950  			testSchedPauseMetrics(t, tc.fn, tc.isGC)
   951  		})
   952  	}
   953  }
   954  
   955  func TestRuntimeLockMetricsAndProfile(t *testing.T) {
   956  	old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
   957  	defer runtime.SetMutexProfileFraction(old)
   958  	if old != 0 {
   959  		t.Fatalf("need MutexProfileRate 0, got %d", old)
   960  	}
   961  
   962  	t.Logf("NumCPU %d", runtime.NumCPU())
   963  	t.Logf("GOMAXPROCS %d", runtime.GOMAXPROCS(0))
   964  	if minCPU := 2; runtime.NumCPU() < minCPU {
   965  		t.Skipf("creating and observing contention on runtime-internal locks requires NumCPU >= %d", minCPU)
   966  	}
   967  
   968  	loadProfile := func(t *testing.T) *profile.Profile {
   969  		var w bytes.Buffer
   970  		pprof.Lookup("mutex").WriteTo(&w, 0)
   971  		p, err := profile.Parse(&w)
   972  		if err != nil {
   973  			t.Fatalf("failed to parse profile: %v", err)
   974  		}
   975  		if err := p.CheckValid(); err != nil {
   976  			t.Fatalf("invalid profile: %v", err)
   977  		}
   978  		return p
   979  	}
   980  
   981  	measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
   982  		beforeProfile := loadProfile(t)
   983  		beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
   984  		metrics.Read(beforeMetrics)
   985  
   986  		fn()
   987  
   988  		afterProfile := loadProfile(t)
   989  		afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
   990  		metrics.Read(afterMetrics)
   991  
   992  		sumSamples := func(p *profile.Profile, i int) int64 {
   993  			var sum int64
   994  			for _, s := range p.Sample {
   995  				sum += s.Value[i]
   996  			}
   997  			return sum
   998  		}
   999  
  1000  		metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
  1001  		profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()
  1002  
  1003  		// The internal/profile package does not support compaction; this delta
  1004  		// profile will include separate positive and negative entries.
  1005  		p = afterProfile.Copy()
  1006  		if len(beforeProfile.Sample) > 0 {
  1007  			err := p.Merge(beforeProfile, -1)
  1008  			if err != nil {
  1009  				t.Fatalf("Merge profiles: %v", err)
  1010  			}
  1011  		}
  1012  
  1013  		return metricGrowth, profileGrowth, p
  1014  	}
  1015  
  1016  	testcase := func(strictTiming bool, acceptStacks [][]string, workers int, fn func() bool) func(t *testing.T) (metricGrowth float64, profileGrowth []int64, n, value int64, explain func()) {
  1017  		return func(t *testing.T) (metricGrowth float64, profileGrowth []int64, n, value int64, explain func()) {
  1018  			metricGrowth, totalProfileGrowth, p := measureDelta(t, func() {
  1019  				var started, stopped sync.WaitGroup
  1020  				started.Add(workers)
  1021  				stopped.Add(workers)
  1022  				for i := 0; i < workers; i++ {
  1023  					w := &contentionWorker{
  1024  						before: func() {
  1025  							started.Done()
  1026  							started.Wait()
  1027  						},
  1028  						after: func() {
  1029  							stopped.Done()
  1030  						},
  1031  						fn: fn,
  1032  					}
  1033  					go w.run()
  1034  				}
  1035  				stopped.Wait()
  1036  			})
  1037  
  1038  			if totalProfileGrowth == 0 {
  1039  				t.Errorf("no increase in mutex profile")
  1040  			}
  1041  			if metricGrowth == 0 && strictTiming {
  1042  				// If the critical section is very short, systems with low timer
  1043  				// resolution may be unable to measure it via nanotime.
  1044  				//
  1045  				// This is sampled at 1 per gTrackingPeriod, but the explicit
  1046  				// runtime.mutex tests create 200 contention events. Observing
  1047  				// zero of those has a probability of (7/8)^200 = 2.5e-12 which
  1048  				// is acceptably low (though the calculation has a tenuous
  1049  				// dependency on cheaprandn being a good-enough source of
  1050  				// entropy).
  1051  				t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
  1052  			}
  1053  			// This comparison is possible because the time measurements in support of
  1054  			// runtime/pprof and runtime/metrics for runtime-internal locks are so close
  1055  			// together. It doesn't work as well for user-space contention, where the
  1056  			// involved goroutines are not _Grunnable the whole time and so need to pass
  1057  			// through the scheduler.
  1058  			t.Logf("lock contention growth in runtime/pprof's view  (%fs)", totalProfileGrowth)
  1059  			t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
  1060  
  1061  			acceptStacks = append([][]string(nil), acceptStacks...)
  1062  			for i, stk := range acceptStacks {
  1063  				if goexperiment.StaticLockRanking {
  1064  					if !slices.ContainsFunc(stk, func(s string) bool {
  1065  						return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
  1066  					}) {
  1067  						// stk is a call stack that is still on the user stack when
  1068  						// it calls runtime.unlock. Add the extra function that
  1069  						// we'll see, when the static lock ranking implementation of
  1070  						// runtime.unlockWithRank switches to the system stack.
  1071  						stk = append([]string{"runtime.unlockWithRank"}, stk...)
  1072  					}
  1073  				}
  1074  				acceptStacks[i] = stk
  1075  			}
  1076  
  1077  			var stks [][]string
  1078  			values := make([][2]int64, len(acceptStacks)+1)
  1079  			for _, s := range p.Sample {
  1080  				var have []string
  1081  				for _, loc := range s.Location {
  1082  					for _, line := range loc.Line {
  1083  						have = append(have, line.Function.Name)
  1084  					}
  1085  				}
  1086  				stks = append(stks, have)
  1087  				found := false
  1088  				for i, stk := range acceptStacks {
  1089  					if slices.Equal(have, stk) {
  1090  						values[i][0] += s.Value[0]
  1091  						values[i][1] += s.Value[1]
  1092  						found = true
  1093  						break
  1094  					}
  1095  				}
  1096  				if !found {
  1097  					values[len(values)-1][0] += s.Value[0]
  1098  					values[len(values)-1][1] += s.Value[1]
  1099  				}
  1100  			}
  1101  			profileGrowth = make([]int64, len(acceptStacks)+1)
  1102  			profileGrowth[len(profileGrowth)-1] = values[len(values)-1][1]
  1103  			for i, stk := range acceptStacks {
  1104  				n += values[i][0]
  1105  				value += values[i][1]
  1106  				profileGrowth[i] = values[i][1]
  1107  				t.Logf("stack %v has samples totaling n=%d value=%d", stk, values[i][0], values[i][1])
  1108  			}
  1109  			if n == 0 && value == 0 {
  1110  				t.Logf("profile:\n%s", p)
  1111  				for _, have := range stks {
  1112  					t.Logf("have stack %v", have)
  1113  				}
  1114  				for _, stk := range acceptStacks {
  1115  					t.Errorf("want stack %v", stk)
  1116  				}
  1117  			}
  1118  
  1119  			return metricGrowth, profileGrowth, n, value, func() {
  1120  				t.Logf("profile:\n%s", p)
  1121  			}
  1122  		}
  1123  	}
  1124  
  1125  	name := t.Name()
  1126  
  1127  	t.Run("runtime.lock", func(t *testing.T) {
  1128  		// The goroutine that acquires the lock will only proceed when it
  1129  		// detects that its partner is contended for the lock. That will lead to
  1130  		// live-lock if anything (such as a STW) prevents the partner goroutine
  1131  		// from running. Allowing the contention workers to pause and restart
  1132  		// (to allow a STW to proceed) makes it harder to confirm that we're
  1133  		// counting the correct number of contention events, since some locks
  1134  		// will end up contended twice. Instead, disable the GC.
  1135  		defer debug.SetGCPercent(debug.SetGCPercent(-1))
  1136  
  1137  		mus := make([]runtime.Mutex, 200)
  1138  		var needContention atomic.Int64
  1139  
  1140  		baseDelay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
  1141  		fastDelayMicros := baseDelay.Microseconds()
  1142  		slowDelayMicros := baseDelay.Microseconds() * 4
  1143  
  1144  		const (
  1145  			fastRole = iota
  1146  			slowRole
  1147  			workerCount
  1148  		)
  1149  		if runtime.GOMAXPROCS(0) < workerCount {
  1150  			t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workerCount)
  1151  		}
  1152  
  1153  		minTicks := make([][]int64, workerCount) // lower bound, known-contended time, measured by cputicks
  1154  		maxTicks := make([][]int64, workerCount) // upper bound, total lock() duration, measured by cputicks
  1155  		for i := range minTicks {
  1156  			minTicks[i] = make([]int64, len(mus))
  1157  			maxTicks[i] = make([]int64, len(mus))
  1158  		}
  1159  		var id atomic.Int32
  1160  		fn := func() bool {
  1161  			n := int(needContention.Load())
  1162  			if n < 0 {
  1163  				return false
  1164  			}
  1165  			mu := &mus[n]
  1166  
  1167  			// Each worker has a role: to have a fast or slow critical section.
  1168  			// Rotate the role assignments as we step through the mutex slice so
  1169  			// we don't end up with one M always claiming the same kind of work.
  1170  			id := int(id.Add(1))
  1171  			role := (id + n) % workerCount
  1172  
  1173  			marker, delayMicros := fastMarkerFrame, fastDelayMicros
  1174  			if role == slowRole {
  1175  				marker, delayMicros = slowMarkerFrame, slowDelayMicros
  1176  			}
  1177  
  1178  			// Each lock is used by two different critical sections, one fast
  1179  			// and one slow, identified in profiles by their different "marker"
  1180  			// functions. We expect the profile to blame each for the amount of
  1181  			// delay it inflicts on other users of the lock. We run one worker
  1182  			// of each kind, so any contention in one would be due to the other.
  1183  			//
  1184  			// We measure how long our runtime.lock call takes, which sets an
  1185  			// upper bound on how much blame to expect for the other worker type
  1186  			// in the profile. And if we acquire the lock first, we wait for the
  1187  			// other worker to announce its contention. We measure the
  1188  			// known-contended time, to use as a lower bound on how much blame
  1189  			// we expect of ourselves in the profile. Then we stall for a little
  1190  			// while (different amounts for "fast" versus "slow") before
  1191  			// unlocking the mutex.
  1192  
  1193  			marker(func() {
  1194  				t0 := runtime.Cputicks()
  1195  				runtime.Lock(mu)
  1196  				maxTicks[role][n] = runtime.Cputicks() - t0
  1197  				minTicks[role][n] = 0
  1198  				for int(needContention.Load()) == n {
  1199  					if runtime.MutexContended(mu) {
  1200  						t1 := runtime.Cputicks()
  1201  						// make them wait a little while
  1202  						for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
  1203  							runtime.Usleep(uint32(1 + delayMicros/8))
  1204  						}
  1205  						minTicks[role][n] = runtime.Cputicks() - t1
  1206  						break
  1207  					}
  1208  				}
  1209  				runtime.Unlock(mu)
  1210  				needContention.Store(int64(n - 1))
  1211  			})
  1212  
  1213  			return true
  1214  		}
  1215  
  1216  		stks := make([][]string, 2)
  1217  		for i := range stks {
  1218  			marker := "runtime_test.fastMarkerFrame"
  1219  			if i == slowRole {
  1220  				marker = "runtime_test.slowMarkerFrame"
  1221  			}
  1222  
  1223  			stks[i] = []string{
  1224  				"runtime.unlock",
  1225  				"runtime_test." + name + ".func4.1.1",
  1226  				marker,
  1227  				"runtime_test." + name + ".func4.1",
  1228  				"runtime_test.(*contentionWorker).run",
  1229  			}
  1230  		}
  1231  
  1232  		t.Run("sample-1", func(t *testing.T) {
  1233  			old := runtime.SetMutexProfileFraction(1)
  1234  			defer runtime.SetMutexProfileFraction(old)
  1235  
  1236  			needContention.Store(int64(len(mus) - 1))
  1237  			metricGrowth, profileGrowth, n, _, explain := testcase(true, stks, workerCount, fn)(t)
  1238  			defer func() {
  1239  				if t.Failed() {
  1240  					explain()
  1241  				}
  1242  			}()
  1243  
  1244  			t.Run("metric", func(t *testing.T) {
  1245  				// The runtime/metrics view may be sampled at 1 per
  1246  				// gTrackingPeriod, so we don't have a hard lower bound here.
  1247  				testenv.SkipFlaky(t, 64253)
  1248  
  1249  				if have, want := metricGrowth, baseDelay.Seconds()*float64(len(mus)); have < want {
  1250  					// The test imposes a delay with usleep, verified with calls to
  1251  					// nanotime. Compare against the runtime/metrics package's view
  1252  					// (based on nanotime) rather than runtime/pprof's view (based
  1253  					// on cputicks).
  1254  					t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
  1255  				}
  1256  			})
  1257  			if have, want := n, int64(len(mus)); have != want {
  1258  				t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
  1259  			}
  1260  
  1261  			var slowMinTicks, fastMinTicks int64
  1262  			for role, ticks := range minTicks {
  1263  				for _, delta := range ticks {
  1264  					if role == slowRole {
  1265  						slowMinTicks += delta
  1266  					} else {
  1267  						fastMinTicks += delta
  1268  					}
  1269  				}
  1270  			}
  1271  			var slowMaxTicks, fastMaxTicks int64
  1272  			for role, ticks := range maxTicks {
  1273  				for _, delta := range ticks {
  1274  					if role == slowRole {
  1275  						slowMaxTicks += delta
  1276  					} else {
  1277  						fastMaxTicks += delta
  1278  					}
  1279  				}
  1280  			}
  1281  
  1282  			cpuGHz := float64(runtime.CyclesPerSecond()) / 1e9
  1283  			for _, set := range []struct {
  1284  				name     string
  1285  				profTime int64
  1286  				minTime  int64
  1287  				maxTime  int64
  1288  			}{
  1289  				{
  1290  					name:     "slow",
  1291  					profTime: profileGrowth[slowRole],
  1292  					minTime:  int64(float64(slowMinTicks) / cpuGHz),
  1293  					maxTime:  int64(float64(fastMaxTicks) / cpuGHz),
  1294  				},
  1295  				{
  1296  					name:     "fast",
  1297  					profTime: profileGrowth[fastRole],
  1298  					minTime:  int64(float64(fastMinTicks) / cpuGHz),
  1299  					maxTime:  int64(float64(slowMaxTicks) / cpuGHz),
  1300  				},
  1301  			} {
  1302  				t.Logf("profile's view of delays due to %q critical section:                 %dns", set.name, set.profTime)
  1303  				t.Logf("test's view of known-contended time within %q critical section:      %dns", set.name, set.minTime)
  1304  				t.Logf("test's view of lock duration before critical sections other than %q: %dns", set.name, set.maxTime)
  1305  
  1306  				if set.profTime < set.minTime {
  1307  					t.Errorf("profile undercounted %q critical section", set.name)
  1308  				}
  1309  				if set.profTime > set.maxTime {
  1310  					t.Errorf("profile overcounted %q critical section", set.name)
  1311  				}
  1312  			}
  1313  
  1314  			var totalProfileGrowth float64
  1315  			for _, growth := range profileGrowth {
  1316  				totalProfileGrowth += float64(growth) * time.Nanosecond.Seconds()
  1317  			}
  1318  
  1319  			const slop = 1.5 // account for nanotime vs cputicks
  1320  			t.Run("compare timers", func(t *testing.T) {
  1321  				testenv.SkipFlaky(t, 64253)
  1322  				if totalProfileGrowth > slop*metricGrowth || metricGrowth > slop*totalProfileGrowth {
  1323  					t.Errorf("views differ by more than %fx", slop)
  1324  				}
  1325  			})
  1326  		})
  1327  
  1328  		t.Run("sample-2", func(t *testing.T) {
  1329  			testenv.SkipFlaky(t, 64253)
  1330  
  1331  			old := runtime.SetMutexProfileFraction(2)
  1332  			defer runtime.SetMutexProfileFraction(old)
  1333  
  1334  			needContention.Store(int64(len(mus) - 1))
  1335  			metricGrowth, profileGrowth, n, _, explain := testcase(true, stks, workerCount, fn)(t)
  1336  			defer func() {
  1337  				if t.Failed() {
  1338  					explain()
  1339  				}
  1340  			}()
  1341  
  1342  			// With 100 trials and profile fraction of 2, we expect to capture
  1343  			// 50 samples. Allow the test to pass if we get at least 20 samples;
  1344  			// the CDF of the binomial distribution says there's less than a
  1345  			// 1e-9 chance of that, which is an acceptably low flakiness rate.
  1346  			const samplingSlop = 2.5
  1347  
  1348  			if have, want := metricGrowth, baseDelay.Seconds()*float64(len(mus)); samplingSlop*have < want {
  1349  				// The test imposes a delay with usleep, verified with calls to
  1350  				// nanotime. Compare against the runtime/metrics package's view
  1351  				// (based on nanotime) rather than runtime/pprof's view (based
  1352  				// on cputicks).
  1353  				t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
  1354  			}
  1355  			if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
  1356  				t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
  1357  			}
  1358  
  1359  			var totalProfileGrowth float64
  1360  			for _, growth := range profileGrowth {
  1361  				totalProfileGrowth += float64(growth) * time.Nanosecond.Seconds()
  1362  			}
  1363  
  1364  			const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
  1365  			if totalProfileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*totalProfileGrowth {
  1366  				t.Errorf("views differ by more than %fx", timerSlop)
  1367  			}
  1368  		})
  1369  	})
  1370  
  1371  	t.Run("runtime.semrelease", func(t *testing.T) {
  1372  		testenv.SkipFlaky(t, 64253)
  1373  
  1374  		old := runtime.SetMutexProfileFraction(1)
  1375  		defer runtime.SetMutexProfileFraction(old)
  1376  
  1377  		const workers = 3
  1378  		if runtime.GOMAXPROCS(0) < workers {
  1379  			t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
  1380  		}
  1381  
  1382  		var sem uint32 = 1
  1383  		var tries atomic.Int32
  1384  		tries.Store(10_000_000) // prefer controlled failure to timeout
  1385  		var sawContention atomic.Int32
  1386  		var need int32 = 1
  1387  		fn := func() bool {
  1388  			if sawContention.Load() >= need {
  1389  				return false
  1390  			}
  1391  			if tries.Add(-1) < 0 {
  1392  				return false
  1393  			}
  1394  
  1395  			runtime.Semacquire(&sem)
  1396  			runtime.Semrelease1(&sem, false, 0)
  1397  			if runtime.MutexContended(runtime.SemRootLock(&sem)) {
  1398  				sawContention.Add(1)
  1399  			}
  1400  			return true
  1401  		}
  1402  
  1403  		stks := [][]string{
  1404  			{
  1405  				"runtime.unlock",
  1406  				"runtime.semrelease1",
  1407  				"runtime_test.TestRuntimeLockMetricsAndProfile.func5.1",
  1408  				"runtime_test.(*contentionWorker).run",
  1409  			},
  1410  			{
  1411  				"runtime.unlock",
  1412  				"runtime.semacquire1",
  1413  				"runtime.semacquire",
  1414  				"runtime_test.TestRuntimeLockMetricsAndProfile.func5.1",
  1415  				"runtime_test.(*contentionWorker).run",
  1416  			},
  1417  		}
  1418  
  1419  		// Verify that we get call stack we expect, with anything more than zero
  1420  		// cycles / zero samples. The duration of each contention event is too
  1421  		// small relative to the expected overhead for us to verify its value
  1422  		// more directly. Leave that to the explicit lock/unlock test.
  1423  
  1424  		testcase(false, stks, workers, fn)(t)
  1425  
  1426  		if remaining := tries.Load(); remaining >= 0 {
  1427  			t.Logf("finished test early (%d tries remaining)", remaining)
  1428  		}
  1429  	})
  1430  }
  1431  
  1432  func slowMarkerFrame(fn func()) { fn() }
  1433  func fastMarkerFrame(fn func()) { fn() }
  1434  
  1435  // contentionWorker provides cleaner call stacks for lock contention profile tests
  1436  type contentionWorker struct {
  1437  	before func()
  1438  	fn     func() bool
  1439  	after  func()
  1440  }
  1441  
  1442  func (w *contentionWorker) run() {
  1443  	defer w.after()
  1444  	w.before()
  1445  
  1446  	for w.fn() {
  1447  	}
  1448  }
  1449  
  1450  func TestCPUStats(t *testing.T) {
  1451  	// Run a few GC cycles to get some of the stats to be non-zero.
  1452  	runtime.GC()
  1453  	runtime.GC()
  1454  	runtime.GC()
  1455  
  1456  	// Set GOMAXPROCS high then sleep briefly to ensure we generate
  1457  	// some idle time.
  1458  	oldmaxprocs := runtime.GOMAXPROCS(10)
  1459  	time.Sleep(time.Millisecond)
  1460  	runtime.GOMAXPROCS(oldmaxprocs)
  1461  
  1462  	stats := runtime.ReadCPUStats()
  1463  	gcTotal := stats.GCAssistTime + stats.GCDedicatedTime + stats.GCIdleTime + stats.GCPauseTime
  1464  	if gcTotal != stats.GCTotalTime {
  1465  		t.Errorf("manually computed total does not match GCTotalTime: %d cpu-ns vs. %d cpu-ns", gcTotal, stats.GCTotalTime)
  1466  	}
  1467  	scavTotal := stats.ScavengeAssistTime + stats.ScavengeBgTime
  1468  	if scavTotal != stats.ScavengeTotalTime {
  1469  		t.Errorf("manually computed total does not match ScavengeTotalTime: %d cpu-ns vs. %d cpu-ns", scavTotal, stats.ScavengeTotalTime)
  1470  	}
  1471  	total := gcTotal + scavTotal + stats.IdleTime + stats.UserTime
  1472  	if total != stats.TotalTime {
  1473  		t.Errorf("manually computed overall total does not match TotalTime: %d cpu-ns vs. %d cpu-ns", total, stats.TotalTime)
  1474  	}
  1475  	if total == 0 {
  1476  		t.Error("total time is zero")
  1477  	}
  1478  	if gcTotal == 0 {
  1479  		t.Error("GC total time is zero")
  1480  	}
  1481  	if stats.IdleTime == 0 {
  1482  		t.Error("idle time is zero")
  1483  	}
  1484  }
  1485  
  1486  func TestMetricHeapUnusedLargeObjectOverflow(t *testing.T) {
  1487  	// This test makes sure /memory/classes/heap/unused:bytes
  1488  	// doesn't overflow when allocating and deallocating large
  1489  	// objects. It is a regression test for #67019.
  1490  	done := make(chan struct{})
  1491  	var wg sync.WaitGroup
  1492  	wg.Add(1)
  1493  	go func() {
  1494  		defer wg.Done()
  1495  		for {
  1496  			for range 10 {
  1497  				abi.Escape(make([]byte, 1<<20))
  1498  			}
  1499  			runtime.GC()
  1500  			select {
  1501  			case <-done:
  1502  				return
  1503  			default:
  1504  			}
  1505  		}
  1506  	}()
  1507  	s := []metrics.Sample{
  1508  		{Name: "/memory/classes/heap/unused:bytes"},
  1509  	}
  1510  	for range 1000 {
  1511  		metrics.Read(s)
  1512  		if s[0].Value.Uint64() > 1<<40 {
  1513  			t.Errorf("overflow")
  1514  			break
  1515  		}
  1516  	}
  1517  	done <- struct{}{}
  1518  	wg.Wait()
  1519  }
  1520  
  1521  func TestReadMetricsCleanups(t *testing.T) {
  1522  	runtime.GC()                                                // End any in-progress GC.
  1523  	runtime.BlockUntilEmptyCleanupQueue(int64(1 * time.Second)) // Flush any queued cleanups.
  1524  
  1525  	var before [2]metrics.Sample
  1526  	before[0].Name = "/gc/cleanups/queued:cleanups"
  1527  	before[1].Name = "/gc/cleanups/executed:cleanups"
  1528  	after := before
  1529  
  1530  	metrics.Read(before[:])
  1531  
  1532  	const N = 10
  1533  	for i := 0; i < N; i++ {
  1534  		runtime.AddCleanup(new(*int), func(_ struct{}) {}, struct{}{})
  1535  	}
  1536  
  1537  	runtime.GC()
  1538  	runtime.BlockUntilEmptyCleanupQueue(int64(1 * time.Second))
  1539  
  1540  	metrics.Read(after[:])
  1541  
  1542  	if v0, v1 := before[0].Value.Uint64(), after[0].Value.Uint64(); v0+N != v1 {
  1543  		t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[0].Name, N, v0, v1)
  1544  	}
  1545  	if v0, v1 := before[1].Value.Uint64(), after[1].Value.Uint64(); v0+N != v1 {
  1546  		t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[1].Name, N, v0, v1)
  1547  	}
  1548  }
  1549  
  1550  func TestReadMetricsFinalizers(t *testing.T) {
  1551  	runtime.GC()                                                  // End any in-progress GC.
  1552  	runtime.BlockUntilEmptyFinalizerQueue(int64(1 * time.Second)) // Flush any queued finalizers.
  1553  
  1554  	var before [2]metrics.Sample
  1555  	before[0].Name = "/gc/finalizers/queued:finalizers"
  1556  	before[1].Name = "/gc/finalizers/executed:finalizers"
  1557  	after := before
  1558  
  1559  	metrics.Read(before[:])
  1560  
  1561  	const N = 10
  1562  	for i := 0; i < N; i++ {
  1563  		runtime.SetFinalizer(new(*int), func(_ **int) {})
  1564  	}
  1565  
  1566  	runtime.GC()
  1567  	runtime.GC()
  1568  	runtime.BlockUntilEmptyFinalizerQueue(int64(1 * time.Second))
  1569  
  1570  	metrics.Read(after[:])
  1571  
  1572  	if v0, v1 := before[0].Value.Uint64(), after[0].Value.Uint64(); v0+N != v1 {
  1573  		t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[0].Name, N, v0, v1)
  1574  	}
  1575  	if v0, v1 := before[1].Value.Uint64(), after[1].Value.Uint64(); v0+N != v1 {
  1576  		t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[1].Name, N, v0, v1)
  1577  	}
  1578  }
  1579  
  1580  func TestReadMetricsSched(t *testing.T) {
  1581  	const (
  1582  		notInGo = iota
  1583  		runnable
  1584  		running
  1585  		waiting
  1586  		created
  1587  		threads
  1588  		numSamples
  1589  	)
  1590  	var s [numSamples]metrics.Sample
  1591  	s[notInGo].Name = "/sched/goroutines/not-in-go:goroutines"
  1592  	s[runnable].Name = "/sched/goroutines/runnable:goroutines"
  1593  	s[running].Name = "/sched/goroutines/running:goroutines"
  1594  	s[waiting].Name = "/sched/goroutines/waiting:goroutines"
  1595  	s[created].Name = "/sched/goroutines-created:goroutines"
  1596  	s[threads].Name = "/sched/threads/total:threads"
  1597  
  1598  	logMetrics := func(t *testing.T, s []metrics.Sample) {
  1599  		for i := range s {
  1600  			t.Logf("%s: %d", s[i].Name, s[i].Value.Uint64())
  1601  		}
  1602  	}
  1603  
  1604  	// generalSlack is the amount of goroutines we allow ourselves to be
  1605  	// off by in any given category, either due to background system
  1606  	// goroutines or testing package goroutines.
  1607  	const generalSlack = 4
  1608  
  1609  	// waitingSlack is the max number of blocked goroutines left
  1610  	// from other tests, the testing package, or system
  1611  	// goroutines.
  1612  	const waitingSlack = 100
  1613  
  1614  	// threadsSlack is the maximum number of threads left over
  1615  	// from other tests and the runtime (sysmon, the template thread, etc.)
  1616  	const threadsSlack = 20
  1617  
  1618  	// Make sure GC isn't running, since GC workers interfere with
  1619  	// expected counts.
  1620  	defer debug.SetGCPercent(debug.SetGCPercent(-1))
  1621  	runtime.GC()
  1622  
  1623  	check := func(t *testing.T, s *metrics.Sample, min, max uint64) {
  1624  		val := s.Value.Uint64()
  1625  		if val < min {
  1626  			t.Errorf("%s too low; %d < %d", s.Name, val, min)
  1627  		}
  1628  		if val > max {
  1629  			t.Errorf("%s too high; %d > %d", s.Name, val, max)
  1630  		}
  1631  	}
  1632  	checkEq := func(t *testing.T, s *metrics.Sample, value uint64) {
  1633  		check(t, s, value, value)
  1634  	}
  1635  	spinUntil := func(f func() bool, timeout time.Duration) bool {
  1636  		start := time.Now()
  1637  		for time.Since(start) < timeout {
  1638  			if f() {
  1639  				return true
  1640  			}
  1641  			time.Sleep(time.Millisecond)
  1642  		}
  1643  		return false
  1644  	}
  1645  
  1646  	// Check base values.
  1647  	t.Run("base", func(t *testing.T) {
  1648  		defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
  1649  		metrics.Read(s[:])
  1650  		logMetrics(t, s[:])
  1651  		check(t, &s[notInGo], 0, generalSlack)
  1652  		check(t, &s[runnable], 0, generalSlack)
  1653  		checkEq(t, &s[running], 1)
  1654  		check(t, &s[waiting], 0, waitingSlack)
  1655  	})
  1656  
  1657  	metrics.Read(s[:])
  1658  	createdAfterBase := s[created].Value.Uint64()
  1659  
  1660  	// Force Running count to be high. We'll use these goroutines
  1661  	// for Runnable, too.
  1662  	const count = 10
  1663  	var ready, exit atomic.Uint32
  1664  	for i := 0; i < count-1; i++ {
  1665  		go func() {
  1666  			ready.Add(1)
  1667  			for exit.Load() == 0 {
  1668  				// Spin to get us and keep us running, but check
  1669  				// the exit condition so we exit out early if we're
  1670  				// done.
  1671  				start := time.Now()
  1672  				for time.Since(start) < 10*time.Millisecond && exit.Load() == 0 {
  1673  				}
  1674  				runtime.Gosched()
  1675  			}
  1676  		}()
  1677  	}
  1678  	for ready.Load() < count-1 {
  1679  		runtime.Gosched()
  1680  	}
  1681  
  1682  	// Be careful. We've entered a dangerous state for platforms
  1683  	// that do not return back to the underlying system unless all
  1684  	// goroutines are blocked, like js/wasm, since we have a bunch
  1685  	// of runnable goroutines all spinning. We cannot write anything
  1686  	// out.
  1687  	if testenv.HasParallelism() {
  1688  		t.Run("created", func(t *testing.T) {
  1689  			metrics.Read(s[:])
  1690  			logMetrics(t, s[:])
  1691  			checkEq(t, &s[created], createdAfterBase+count)
  1692  		})
  1693  		t.Run("running", func(t *testing.T) {
  1694  			defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(count + 4))
  1695  			// It can take a little bit for the scheduler to
  1696  			// distribute the goroutines to Ps, so retry for a
  1697  			// while.
  1698  			spinUntil(func() bool {
  1699  				metrics.Read(s[:])
  1700  				return s[running].Value.Uint64() >= count
  1701  			}, time.Second)
  1702  			logMetrics(t, s[:])
  1703  			check(t, &s[running], count, count+4)
  1704  			check(t, &s[threads], count, count+4+threadsSlack)
  1705  		})
  1706  
  1707  		// Force runnable count to be high.
  1708  		t.Run("runnable", func(t *testing.T) {
  1709  			defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
  1710  			metrics.Read(s[:])
  1711  			logMetrics(t, s[:])
  1712  			checkEq(t, &s[running], 1)
  1713  			check(t, &s[runnable], count-1, count+generalSlack)
  1714  		})
  1715  
  1716  		// Done with the running/runnable goroutines.
  1717  		exit.Store(1)
  1718  	} else {
  1719  		// Read metrics and then exit all the other goroutines,
  1720  		// so that system calls may proceed.
  1721  		metrics.Read(s[:])
  1722  
  1723  		// Done with the running/runnable goroutines.
  1724  		exit.Store(1)
  1725  
  1726  		// Now we can check our invariants.
  1727  		t.Run("created", func(t *testing.T) {
  1728  			// Look for count-1 goroutines because we read metrics
  1729  			// *before* t.Run goroutine was created for this sub-test.
  1730  			checkEq(t, &s[created], createdAfterBase+count-1)
  1731  		})
  1732  		t.Run("running", func(t *testing.T) {
  1733  			logMetrics(t, s[:])
  1734  			checkEq(t, &s[running], 1)
  1735  			checkEq(t, &s[threads], 1)
  1736  		})
  1737  		t.Run("runnable", func(t *testing.T) {
  1738  			logMetrics(t, s[:])
  1739  			check(t, &s[runnable], count-1, count+generalSlack)
  1740  		})
  1741  	}
  1742  
  1743  	// Force not-in-go count to be high. This is a little tricky since
  1744  	// we try really hard not to let things block in system calls.
  1745  	// We have to drop to the syscall package to do this reliably.
  1746  	t.Run("not-in-go", func(t *testing.T) {
  1747  		// Block a bunch of goroutines on an OS pipe.
  1748  		pr, pw, err := pipe()
  1749  		if err != nil {
  1750  			switch runtime.GOOS {
  1751  			case "js", "wasip1":
  1752  				t.Skip("creating pipe:", err)
  1753  			}
  1754  			t.Fatal("creating pipe:", err)
  1755  		}
  1756  		for i := 0; i < count; i++ {
  1757  			go syscall.Read(pr, make([]byte, 1))
  1758  		}
  1759  
  1760  		// Let the goroutines block.
  1761  		spinUntil(func() bool {
  1762  			metrics.Read(s[:])
  1763  			return s[notInGo].Value.Uint64() >= count
  1764  		}, time.Second)
  1765  
  1766  		metrics.Read(s[:])
  1767  		logMetrics(t, s[:])
  1768  		check(t, &s[notInGo], count, count+generalSlack)
  1769  
  1770  		syscall.Close(pw)
  1771  		syscall.Close(pr)
  1772  	})
  1773  
  1774  	t.Run("waiting", func(t *testing.T) {
  1775  		// Force waiting count to be high.
  1776  		const waitingCount = 1000
  1777  		stop := make(chan bool)
  1778  		for i := 0; i < waitingCount; i++ {
  1779  			go func() { <-stop }()
  1780  		}
  1781  
  1782  		// Let the goroutines block.
  1783  		spinUntil(func() bool {
  1784  			metrics.Read(s[:])
  1785  			return s[waiting].Value.Uint64() >= waitingCount
  1786  		}, time.Second)
  1787  
  1788  		metrics.Read(s[:])
  1789  		logMetrics(t, s[:])
  1790  		check(t, &s[waiting], waitingCount, waitingCount+waitingSlack)
  1791  
  1792  		close(stop)
  1793  	})
  1794  }
  1795  

View as plain text