gh-138122: Validate base frame before caching in remote debugging frame cache

pablogsal · pablogsal · commit 224dede57f7c · 2025-12-17T03:11:23.000Z
The frame cache in the remote debugging module was storing frame chains
without validating that they reached the base frame. As a result, when a
frame chain was interrupted or the process state changed during reading,
an incomplete stack could be cached. Subsequent samples that hit the
cache would then produce flamegraphs that didn't reach the bottom of the
call stack.

The fix passes base_frame_addr through to process_frame_chain(), which
already has validation logic to ensure the frame walk terminates at the
expected sentinel frame. By enabling this validation in the caching code
path and tracking whether we've confirmed reaching the base frame, we
now only store complete frame chains in the cache. When extending from
cached data, we trust that the cached frames were already validated at
storage time, maintaining the invariant that cached stacks are always
complete.

An integration test using deeply nested generators that oscillate the
stack depth is added to verify that all sampled stacks contain the entry
point function. This catches regressions where incomplete stacks might
be cached and returned.
diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py
@@ -863,3 +863,100 @@ def test_async_aware_running_sees_only_cpu_task(self):
         self.assertGreater(cpu_percentage, 90.0,
             f"cpu_leaf should dominate samples in 'running' mode, "
             f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})")
+
+
+def _generate_deep_generators_script(chain_depth=20, recurse_depth=150):
+    """Generate a script with deep nested generators for stress testing.
+
+    Returns the source text of a Python script, as a single string, that
+    defines ``chain_depth`` chained ``yield from`` generator functions
+    (``deep_yield_chain_0`` .. ``deep_yield_chain_{chain_depth - 1}``), the
+    last of which delegates to ``recursive_diver``.  ``recursive_diver``
+    keeps delegating to itself until its depth counter, which starts at
+    ``chain_depth``, reaches ``recurse_depth``.  The script then drives
+    these generators from an endless ``run_forever()`` loop.
+    """
+    lines = [
+        'import sys',
+        # NOTE(review): presumably headroom for larger recurse_depth
+        # values than the default - confirm.
+        'sys.setrecursionlimit(5000)',
+        '',
+    ]
+    # Generate chain of yield-from functions: each one yields a marker
+    # tuple for its own level and then delegates to the next level.
+    for i in range(chain_depth - 1):
+        lines.extend([
+            f'def deep_yield_chain_{i}(n):',
+            f'    yield ("L{i}", n)',
+            f'    yield from deep_yield_chain_{i + 1}(n)',
+            '',
+        ])
+    # Last chain function calls recursive_diver, seeding its depth counter
+    # with chain_depth.
+    lines.extend([
+        f'def deep_yield_chain_{chain_depth - 1}(n):',
+        f'    yield ("L{chain_depth - 1}", n)',
+        f'    yield from recursive_diver(n, {chain_depth})',
+        '',
+        # recursive_diver nests one more generator per level until depth
+        # reaches recurse_depth, then yields five "BOTTOM" items.
+        'def recursive_diver(n, depth):',
+        '    yield (f"DIVE_{depth}", n)',
+        f'    if depth < {recurse_depth}:',
+        '        yield from recursive_diver(n, depth + 1)',
+        '    else:',
+        '        for i in range(5):',
+        '            yield (f"BOTTOM_{depth}", i)',
+        '',
+        # Each iteration yields once from this generator itself and then
+        # descends through the whole chain, so the stack depth goes up and
+        # down repeatedly.
+        'def oscillating_generator(iterations=1000):',
+        '    for i in range(iterations):',
+        '        yield ("OSCILLATE", i)',
+        '        yield from deep_yield_chain_0(i)',
+        '',
+        # Entry point: never returns; exhausts a 10-iteration
+        # oscillating_generator over and over.
+        'def run_forever():',
+        '    while True:',
+        '        for _ in oscillating_generator(10):',
+        '            pass',
+        '',
+        # NOTE(review): _test_sock is not defined by this script; it is
+        # presumably injected by the test_subprocess() helper used with
+        # wait_for_working=True - confirm against that helper.
+        '_test_sock.sendall(b"working")',
+        'run_forever()',
+    ])
+    return '\n'.join(lines)
+
+
+@requires_remote_subprocess_debugging()
+class TestDeepGeneratorFrameCache(unittest.TestCase):
+    """Test frame cache consistency with deep oscillating generator stacks."""
+
+    def test_all_stacks_share_same_base_frame(self):
+        """Verify all sampled stacks reach the entry point function.
+
+        When profiling deep generators that oscillate up and down the call
+        stack, every sample should include the entry point function
+        (run_forever) in its call chain. If the frame cache stores
+        incomplete stacks, some samples will be missing this base function,
+        causing broken flamegraphs.
+        """
+        script = _generate_deep_generators_script()
+        with test_subprocess(script, wait_for_working=True) as subproc:
+            # Request a 1 usec sampling interval and keep idle samples
+            # (skip_idle=False).
+            collector = CollapsedStackCollector(sample_interval_usec=1, skip_idle=False)
+
+            # Redirect sys.stdout into a throwaway buffer for the duration
+            # of the sampling run.
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                profiling.sampling.sample.sample(
+                    subproc.process.pid,
+                    collector,
+                    duration_sec=2,
+                )
+
+        samples_with_entry_point = 0
+        samples_without_entry_point = 0
+        total_samples = 0
+
+        # stack_counter keys unpack as (call_tree, thread_id); the value is
+        # the number of samples recorded for that key.
+        for (call_tree, _thread_id), count in collector.stack_counter.items():
+            total_samples += count
+            # Samples with an empty call_tree count toward total_samples
+            # but are placed in neither bucket below.
+            if call_tree:
+                # Index 2 of each frame entry is compared with the function
+                # name; assumes frames are (filename, lineno, funcname)
+                # tuples - TODO confirm against the collector.
+                has_entry_point = any(
+                    frame[2] == "run_forever" for frame in call_tree
+                )
+                if has_entry_point:
+                    samples_with_entry_point += count
+                else:
+                    samples_without_entry_point += count
+
+        # Sanity check: require more than 100 samples so the zero-missing
+        # assertion below is not satisfied by a near-empty run.
+        self.assertGreater(total_samples, 100,
+            f"Expected at least 100 samples, got {total_samples}")
+
+        self.assertEqual(samples_without_entry_point, 0,
+            f"Found {samples_without_entry_point}/{total_samples} samples "
+            f"missing the entry point function 'run_forever'. This indicates "
+            f"incomplete stacks are being returned, likely due to frame cache "
+            f"storing partial stack traces.")
diff --git a/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst
@@ -0,0 +1,4 @@
+Fix incomplete stack traces in the Tachyon profiler's frame cache when
+profiling code with deeply nested generators. The frame cache now validates
+that stack traces reach the base frame before caching, preventing broken
+flamegraphs. Patch by Pablo Galindo.
diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h
@@ -459,6 +459,7 @@ extern int collect_frames_with_cache(
     uintptr_t frame_addr,
     StackChunkList *chunks,
     PyObject *frame_info,
+    uintptr_t base_frame_addr,
     uintptr_t gc_frame,
     uintptr_t last_profiled_frame,
     uint64_t thread_id);
diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c
@@ -537,6 +537,7 @@ collect_frames_with_cache(
     uintptr_t frame_addr,
     StackChunkList *chunks,
     PyObject *frame_info,
+    uintptr_t base_frame_addr,
     uintptr_t gc_frame,
     uintptr_t last_profiled_frame,
     uint64_t thread_id)
@@ -552,8 +553,11 @@ collect_frames_with_cache(
     Py_ssize_t num_addrs = 0;
     Py_ssize_t frames_before = PyList_GET_SIZE(frame_info);
 
+    // Track whether we've validated reaching the base frame (either directly or via cache)
+    int reached_base_frame = 0;
+
     int stopped_at_cached = 0;
-    if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame,
+    if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, base_frame_addr, gc_frame,
                             last_profiled_frame, &stopped_at_cached,
                             addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
         return -1;
@@ -575,24 +579,34 @@ collect_frames_with_cache(
             // Cache miss - continue walking from last_profiled_frame to get the rest
             STATS_INC(unwinder, frame_cache_misses);
             Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info);
-            if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame,
+            if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, base_frame_addr, gc_frame,
                                     0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
                 return -1;
             }
             STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk);
+            // We walked to base frame (process_frame_chain validated it)
+            reached_base_frame = 1;
         } else {
-            // Partial cache hit
+            // Partial cache hit - cache was validated when stored, so we trust it
             STATS_INC(unwinder, frame_cache_partial_hits);
             STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache);
+            reached_base_frame = 1;
+        }
+    } else {
+        // Walked entire chain without stopping at cache - process_frame_chain validated base frame
+        reached_base_frame = 1;
+        if (last_profiled_frame == 0) {
+            // No cache involvement (no last_profiled_frame or cache disabled)
+            STATS_INC(unwinder, frame_cache_misses);
         }
-    } else if (last_profiled_frame == 0) {
-        // No cache involvement (no last_profiled_frame or cache disabled)
-        STATS_INC(unwinder, frame_cache_misses);
     }
 
-    // Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES)
-    if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
-        return -1;
+    // Only store in cache if we reached the base frame (complete stack)
+    // This prevents caching incomplete stacks that would produce broken flamegraphs
+    if (reached_base_frame) {
+        if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
+            return -1;
+        }
     }
 
     return 0;
diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c
@@ -430,7 +430,7 @@ unwind_stack_for_thread(
         uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
             unwinder->debug_offsets.thread_state.last_profiled_frame);
         if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
-                                      gc_frame, last_profiled_frame, tid) < 0) {
+                                      base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) {
             set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
             goto error;
         }

Original file line number	Diff line number	Diff line change
`@@ -430,7 +430,7 @@ unwind_stack_for_thread(`
`430`	`430`	`uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,`
`431`	`431`	`unwinder->debug_offsets.thread_state.last_profiled_frame);`
`432`	`432`	`if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,`
`433`		`- gc_frame, last_profiled_frame, tid) < 0) {`
	`433`	`+ base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) {`
`434`	`434`	`set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");`
`435`	`435`	`goto error;`
`436`	`436`	`}`