Source file src/runtime/trace.go
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack trace are
// captured for most events.
//
// Tracer invariants (to keep the synchronization making sense):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"internal/runtime/atomic"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If not, buffers
	// are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen     atomic.Uintptr // the generation the reader is currently reading for
	flushedGen    atomic.Uintptr // the last completed generation
	headerWritten bool           // whether ReadTrace has emitted trace header

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids
	typeTab   [2]traceTypeTable   // maps type pointers to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample. Because
	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure to make the P field always non-zero: the ID of a real P will
	// start at bit 1, and bit 0 will be set.
	// Samples that arrive while no P is running (such as near syscalls) will
	// set the first header field to 0b10. This careful handling of the first
	// header field allows us to store the ID of the active G directly in the
	// second field, even though that will be 0 when sampling g0.
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// enabled indicates whether tracing is enabled, but it is only an optimization,
	// NOT the source of truth on whether tracing is enabled. Tracing is only truly
	// enabled if gen != 0. This is used as an optimistic fast path check.
	//
	// Transitioning this value from true -> false is easy (once gen is 0)
	// because it's OK for enabled to have a stale "true" value. traceAcquire will
	// always double-check gen.
	//
	// Transitioning this value from false -> true is harder. We need to make sure
	// this is observable as true strictly before gen != 0. To maintain this invariant
	// we only make this transition with the world stopped and use the store to gen
	// as a publication barrier.
	enabled bool

	// enabledWithAllocFree is set if debug.traceallocfree is != 0 when tracing begins.
	// It follows the same synchronization protocol as enabled.
	enabledWithAllocFree bool

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in the syscall-exiting slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64

	// minPageHeapAddr is the minimum address of the page heap when tracing started.
	minPageHeapAddr uint64

	// debugMalloc is the value of debug.malloc before tracing began.
	debugMalloc bool
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exceptions to this rule are goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscalling slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, so when tracing is off the
	// tracing seqlock is left untouched). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//	// <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Grab the minimum page heap address. All Ps are stopped, so it's safe to read this since
	// nothing can allocate heap memory.
	trace.minPageHeapAddr = uint64(mheap_.pages.inUse.ranges[0].base.addr())

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if
	// trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for
	// performance, we need trace.enabled to be read without any synchronization.
	//
	// We ensure this is safe by stopping the world, which acts as a global barrier on almost
	// every M, and explicitly synchronize with any other Ms that could be running concurrently
	// with us. Today, there are only two such cases:
	// - sysmon, which we synchronize with by acquiring sysmonlock.
	// - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
	//
	// After trace.gen is updated, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.enabled = true
	if debug.traceallocfree.Load() != 0 {
		// Enable memory events since the GODEBUG is set.
		trace.debugMalloc = debug.malloc
		trace.enabledWithAllocFree = true
		debug.malloc = true
	}
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *before* traceAcquire (which checks gen), we can be certain that, once exitingSyscall is
	// zero, any goroutine that goes to exit a syscall from then on *must* observe the new gen
	// as well as trace.enabled being set to true.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Dump a snapshot of memory, if enabled.
	if trace.enabledWithAllocFree {
		traceSnapshotMemory(firstGen)
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
//
// traceAdvance is called by golang.org/x/exp/trace using linkname.
//
//go:linkname traceAdvance
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Write an EvFrequency event for this generation.
	//
	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
	// this here and not e.g. on the trace reader.
	traceFrequency(gen)

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		stackID      uint64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status
		// traced bookkeeping might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaitingForGC(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is if we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
				ug.stackID = traceStack(0, gp, gen)
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)

			// Clear trace.enabled. It is totally OK for this value to be stale,
			// because traceAcquire will always double-check gen.
			trace.enabled = false
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a ProcsChange event so we have one on record for each generation.
	// Let's emit it as soon as possible so that downstream tools can rely on the value
	// being there fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. It's fine if the world stops again;
	// we're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				for exp, buf := range mp.trace.buf[gen%2] {
					if buf != nil {
						traceBufFlush(buf, gen)
						mp.trace.buf[gen%2][exp] = nil
					}
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then is exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist, ug.stackID)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	// Flush CPU samples, stacks, and strings for the last generation. This is safe,
	// because we're now certain no M is writing to the last generation.
	//
	// Ordering is important here. traceCPUFlush may generate new stacks and dumping
	// stacks may generate new strings.
	traceCPUFlush(gen)
	trace.stackTab[gen%2].dump(gen)
	trace.typeTab[gen%2].dump(gen)
	trace.stringTab[gen%2].reset(gen)

	// That's it. This generation is done producing buffers.
	systemstack(func() {
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	// Perform status reset on dead Ps because they just appear as idle.
	//
	// Preventing preemption is sufficient to access allp safely. allp is only
	// mutated by GOMAXPROCS calls, which require a STW.
	//
	// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
	// events to indicate whether a P exists, rather than just making its
	// existence implicit.
	mp = acquirem()
	for _, pp := range allp[len(allp):cap(allp)] {
		pp.trace.readyNextGen(traceNextGen(gen))
	}
	releasem(mp)

	if stopTrace {
		// Acquire the shutdown sema to begin the shutdown process.
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()

		// Reset debug.malloc if necessary. Note that this is set in a racy
		// way; that's OK. Some mallocs may still enter into the debug.malloc
		// block, but they won't generate events because tracing is disabled.
		// That is, it's OK if mallocs read a stale debug.malloc or
		// trace.enabledWithAllocFree value.
		if trace.enabledWithAllocFree {
			trace.enabledWithAllocFree = false
			debug.malloc = trace.debugMalloc
		}
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, might as well stay consistent with StartTrace
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both %2 and %3 and we want both patterns to continue when we loop around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
top:
	var buf []byte
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}

	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call heap allocate, which will try to emit a trace
	// event while holding heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine reads trace. This is bad.
		// But we would rather not crash the program because of tracing,
		// because tracing can be enabled at runtime on prod servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.23 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer is flushed
		// into readerGen, flushedGen will rise to meet it. At this point, the tracer
		// is waiting on the reader to finish flushing the last generation so that it
		// can continue to advance.
		if trace.flushedGen.Load() == gen {
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are three conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	// - The trace is shutting down. The trace stopper blocks on the reader
	//   to finish, much like trace advancement.
	//
	// We also want to be careful not to schedule the reader if there's no
	// reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must create it with newWakeableSleep and
// call close to free up resources. Once close is called, a new
// wakeableSleep must be created before another use.
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	f := func(s any, _ uintptr, _ int64) {
		s.(*wakeableSleep).wake()
	}
	s.timer.init(f, s)
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	s.timer.reset(nanotime()+ns, 0)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	s.timer.stop()
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	return
}
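
// A minimal usage sketch of the public API above (StartTrace, ReadTrace,
// StopTrace), assuming a caller outside the runtime and an io.Writer named w
// supplied by that caller. Most clients should use the runtime/trace package,
// which drives this loop for them; this is only meant to illustrate the
// contract documented on ReadTrace and StopTrace:
//
//	if err := runtime.StartTrace(); err != nil {
//		return err // tracing was already enabled or is shutting down
//	}
//	done := make(chan struct{})
//	go func() {
//		defer close(done)
//		for {
//			data := runtime.ReadTrace() // blocks until data is available
//			if data == nil {
//				return // tracing stopped and all buffered data has been returned
//			}
//			// The returned buffer must be consumed (here, written to w)
//			// before the next call to ReadTrace.
//			if _, err := w.Write(data); err != nil {
//				return
//			}
//		}
//	}()
//	// ... run the workload to be traced ...
//	runtime.StopTrace() // returns only after all reads for the trace have completed
//	<-done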