From 65312d5fbcccb15f4e61d31c3373a6fb395e9bbd Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 21 Mar 2026 16:57:18 +0100 Subject: [PATCH 01/41] first stab --- Lib/profiling/sampling/__init__.py | 11 +- Lib/profiling/sampling/binary_reader.py | 3 + Lib/profiling/sampling/cli.py | 26 ++- Lib/profiling/sampling/ndjson_collector.py | 216 +++++++++++++++++++++ 4 files changed, 251 insertions(+), 5 deletions(-) create mode 100644 Lib/profiling/sampling/ndjson_collector.py diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 6a0bb5e5c2f387..21d3a773a2ba63 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,6 +9,15 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .ndjson_collector import NdjsonCollector from .string_table import StringTable -__all__ = ("Collector", "PstatsCollector", "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", "StringTable") +__all__ = ( + "Collector", + "PstatsCollector", + "CollapsedStackCollector", + "HeatmapCollector", + "GeckoCollector", + "NdjsonCollector", + "StringTable", +) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index a11be3652597a6..d5bfc0d6130f1a 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,6 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector +from .ndjson_collector import NdjsonCollector from .pstats_collector import PstatsCollector @@ -117,6 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) + elif output_format == 'ndjson': + collector = NdjsonCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 9900415ae8a927..655d5b51cea901 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -20,6 +20,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector, DiffFlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .ndjson_collector import NdjsonCollector from .binary_collector import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -101,6 +102,7 @@ def __call__(self, parser, namespace, values, option_string=None): "diff_flamegraph": "html", "gecko": "json", "heatmap": "html", + "ndjson": "ndjson", "binary": "bin", } @@ -111,6 +113,7 @@ def __call__(self, parser, namespace, values, option_string=None): "diff_flamegraph": DiffFlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "ndjson": NdjsonCollector, "binary": BinaryCollector, } @@ -488,6 +491,13 @@ def _add_format_options(parser, include_compression=True, include_binary=True): action=DiffFlamegraphAction, help="Generate differential flamegraph comparing current profile to `BASELINE` binary file", ) + format_group.add_argument( + "--ndjson", + action="store_const", + const="ndjson", + dest="format", + help="Generate NDJSON snapshot output for external consumers", + ) if include_binary: format_group.add_argument( "--binary", @@ -611,15 +621,18 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=False, - output_file=None, compression='auto', diff_baseline=None): + mode=None, output_file=None, compression='auto', + diff_baseline=None): """Create the appropriate collector based on format type. Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary', 'diff_flamegraph') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', + 'gecko', 'heatmap', 'ndjson', 'binary', 'diff_flamegraph') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + mode: Profiling mode for collectors that expose it in metadata output_file: Output file path (required for binary format) compression: Compression type for binary format ('auto', 'zstd', 'none') diff_baseline: Path to baseline binary file for differential flamegraph @@ -655,6 +668,11 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) + if format_type == "ndjson": + return collector_class( + sample_interval_usec, skip_idle=skip_idle, mode=mode + ) + return collector_class(sample_interval_usec, skip_idle=skip_idle) @@ -1142,7 +1160,7 @@ def _handle_attach(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto'), diff_baseline=args.diff_baseline @@ -1249,7 +1267,7 @@ def _handle_run(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto'), diff_baseline=args.diff_baseline diff --git a/Lib/profiling/sampling/ndjson_collector.py b/Lib/profiling/sampling/ndjson_collector.py new file mode 100644 index 00000000000000..123ec1c5ea9a1c --- /dev/null +++ b/Lib/profiling/sampling/ndjson_collector.py @@ -0,0 +1,216 @@ +"""NDJSON collector.""" + +import json +import uuid +from itertools import batched + +from .constants import ( + PROFILING_MODE_ALL, + PROFILING_MODE_CPU, + PROFILING_MODE_EXCEPTION, + PROFILING_MODE_GIL, + PROFILING_MODE_WALL, +) +from .stack_collector import StackTraceCollector + + +_CHUNK_SIZE = 1000 + +_MODE_NAMES = { + PROFILING_MODE_WALL: "wall", + PROFILING_MODE_CPU: "cpu", + PROFILING_MODE_GIL: "gil", + PROFILING_MODE_ALL: "all", + PROFILING_MODE_EXCEPTION: "exception", +} + + +class NdjsonCollector(StackTraceCollector): + """Collector that exports finalized profiling data as NDJSON.""" + + def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): + super().__init__(sample_interval_usec, skip_idle=skip_idle) + self.run_id = uuid.uuid4().hex + + self._string_to_id = {} + self._strings = [] + + self._frame_to_id = {} + self._frames = [] + + self._frame_self = {} + self._frame_cumulative = {} + self._samples_total = 0 + + self._mode = mode + + def process_frames(self, frames, _thread_id, weight=1): + if not frames: + return + + self._samples_total += weight + + frame_ids = [ + self._get_or_create_frame_id(filename, location, funcname) + for filename, location, funcname, _opcode in frames + ] + leaf_frame_id = frame_ids[0] + + self._frame_self[leaf_frame_id] = ( + self._frame_self.get(leaf_frame_id, 0) + weight + ) + + for frame_id in set(frame_ids): + self._frame_cumulative[frame_id] = ( + self._frame_cumulative.get(frame_id, 0) + weight + ) + + def export(self, filename): + with open(filename, "w", encoding="utf-8") as output: + self._write_message(output, self._build_meta_record()) + self._write_chunked_defs(output, "str_def", self._strings) + self._write_chunked_defs(output, "frame_def", self._frames) + self._write_chunked_agg(output, self._iter_agg_entries()) + self._write_message( + output, + { + "type": "end", + "v": 1, + "run_id": self.run_id, + "samples_total": self._samples_total, + }, + ) + + print(f"NDJSON profile written to {filename}") + + def _build_meta_record(self): + record = { + "type": "meta", + "v": 1, + "run_id": self.run_id, + "sample_interval_usec": self.sample_interval_usec, + } + + if self._mode is not None: + record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode)) + + return record + + def _get_or_create_frame_id(self, filename, location, funcname): + synthetic = location is None + location_fields = self._normalize_export_location(location) + func_str_id = self._intern_string(funcname) + path_str_id = self._intern_string(filename) + + frame_key = ( + path_str_id, + func_str_id, + location_fields["line"], + location_fields.get("end_line"), + location_fields.get("col"), + location_fields.get("end_col"), + synthetic, + ) + + if (frame_id := self._frame_to_id.get(frame_key)) is not None: + return frame_id + + frame_id = len(self._frames) + 1 + frame_record = { + "frame_id": frame_id, + "path_str_id": path_str_id, + "func_str_id": func_str_id, + **location_fields, + } + if synthetic: + frame_record["synthetic"] = True + + self._frame_to_id[frame_key] = frame_id + self._frames.append(frame_record) + return frame_id + + def _intern_string(self, value): + value = str(value) + + if (string_id := self._string_to_id.get(value)) is not None: + return string_id + + string_id = len(self._strings) + 1 + self._string_to_id[value] = string_id + self._strings.append({"str_id": string_id, "value": value}) + return string_id + + @staticmethod + def _normalize_export_location(location): + if location is None: + return {"line": 0} + + if isinstance(location, int): + return {"line": max(location, 0)} + + if not isinstance(location, tuple): + lineno = getattr(location, "lineno", 0) + location = ( + lineno, + getattr(location, "end_lineno", lineno), + getattr(location, "col_offset", -1), + getattr(location, "end_col_offset", -1), + ) + + lineno, end_lineno, col_offset, end_col_offset = location + if not isinstance(lineno, int) or lineno <= 0: + return {"line": 0} + + normalized = {"line": lineno} + if isinstance(end_lineno, int) and end_lineno > 0: + normalized["end_line"] = end_lineno + if isinstance(col_offset, int) and col_offset >= 0: + normalized["col"] = col_offset + if isinstance(end_col_offset, int) and end_col_offset >= 0: + normalized["end_col"] = end_col_offset + return normalized + + def _iter_agg_entries(self): + entries = [] + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + entries.append( + { + "frame_id": frame_id, + "self": self._frame_self.get(frame_id, 0), + "cumulative": self._frame_cumulative.get(frame_id, 0), + } + ) + return entries + + def _write_chunked_defs(self, output, record_type, entries): + for chunk in batched(entries, _CHUNK_SIZE): + self._write_message( + output, + { + "type": record_type, + "v": 1, + "run_id": self.run_id, + "defs": chunk, + }, + ) + + def _write_chunked_agg(self, output, entries): + for chunk in batched(entries, _CHUNK_SIZE): + self._write_message( + output, + { + "type": "agg", + "v": 1, + "run_id": self.run_id, + "kind": "frame", + "scope": "final", + "samples_total": self._samples_total, + "entries": chunk, + }, + ) + + @staticmethod + def _write_message(output, record): + output.write(json.dumps(record, separators=(",", ":"))) + output.write("\n") From dff2ead5e96a49230bb5b4388ae66a035ab4fbd3 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 21 Mar 2026 21:08:18 +0100 Subject: [PATCH 02/41] s/ndjson/jsonl/ --- Lib/profiling/sampling/__init__.py | 4 ++-- Lib/profiling/sampling/binary_reader.py | 6 +++--- Lib/profiling/sampling/cli.py | 16 ++++++++-------- .../{ndjson_collector.py => jsonl_collector.py} | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) rename Lib/profiling/sampling/{ndjson_collector.py => jsonl_collector.py} (97%) diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 21d3a773a2ba63..71579a3903253e 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,7 +9,7 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .string_table import StringTable __all__ = ( @@ -18,6 +18,6 @@ "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", - "NdjsonCollector", + "JsonlCollector", "StringTable", ) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index d5bfc0d6130f1a..8d1d8eef9155eb 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,7 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .pstats_collector import PstatsCollector @@ -118,8 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) - elif output_format == 'ndjson': - collector = NdjsonCollector(interval) + elif output_format == 'jsonl': + collector = JsonlCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 655d5b51cea901..ccefd2402edc8e 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -20,7 +20,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector, DiffFlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .binary_collector import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -102,7 +102,7 @@ def __call__(self, parser, namespace, values, option_string=None): "diff_flamegraph": "html", "gecko": "json", "heatmap": "html", - "ndjson": "ndjson", + "jsonl": "jsonl", "binary": "bin", } @@ -113,7 +113,7 @@ def __call__(self, parser, namespace, values, option_string=None): "diff_flamegraph": DiffFlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, - "ndjson": NdjsonCollector, + "jsonl": JsonlCollector, "binary": BinaryCollector, } @@ -492,11 +492,11 @@ def _add_format_options(parser, include_compression=True, include_binary=True): help="Generate differential flamegraph comparing current profile to `BASELINE` binary file", ) format_group.add_argument( - "--ndjson", + "--jsonl", action="store_const", - const="ndjson", + const="jsonl", dest="format", - help="Generate NDJSON snapshot output for external consumers", + help="Generate JSONL snapshot output for external consumers", ) if include_binary: format_group.add_argument( @@ -627,7 +627,7 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals Args: format_type: The output format ('pstats', 'collapsed', 'flamegraph', - 'gecko', 'heatmap', 'ndjson', 'binary', 'diff_flamegraph') + 'gecko', 'heatmap', 'jsonl', 'binary', 'diff_flamegraph') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format @@ -668,7 +668,7 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) - if format_type == "ndjson": + if format_type == "jsonl": return collector_class( sample_interval_usec, skip_idle=skip_idle, mode=mode ) diff --git a/Lib/profiling/sampling/ndjson_collector.py b/Lib/profiling/sampling/jsonl_collector.py similarity index 97% rename from Lib/profiling/sampling/ndjson_collector.py rename to Lib/profiling/sampling/jsonl_collector.py index 123ec1c5ea9a1c..1d6575425c2616 100644 --- a/Lib/profiling/sampling/ndjson_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -1,4 +1,4 @@ -"""NDJSON collector.""" +"""JSONL collector.""" import json import uuid @@ -25,8 +25,8 @@ } -class NdjsonCollector(StackTraceCollector): - """Collector that exports finalized profiling data as NDJSON.""" +class JsonlCollector(StackTraceCollector): + """Collector that exports finalized profiling data as JSONL.""" def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): super().__init__(sample_interval_usec, skip_idle=skip_idle) @@ -81,7 +81,7 @@ def export(self, filename): }, ) - print(f"NDJSON profile written to {filename}") + print(f"JSONL profile written to {filename}") def _build_meta_record(self): record = { From 23b5df1d374d0a954160ae75d4cee44acea244d9 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sun, 22 Mar 2026 02:51:12 +0100 Subject: [PATCH 03/41] printing to stdout isn't a great idea --- Lib/profiling/sampling/jsonl_collector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 1d6575425c2616..3333b7352c9411 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -81,8 +81,6 @@ def export(self, filename): }, ) - print(f"JSONL profile written to {filename}") - def _build_meta_record(self): record = { "type": "meta", From 9cdb9710b8b6854e5b78b3b207fae6ffe8c1c943 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:50:59 +0200 Subject: [PATCH 04/41] even a basic test --- .../test_sampling_profiler/test_collectors.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 240ec8a195c43b..bac81bafde7533 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -16,6 +16,7 @@ CollapsedStackCollector, FlamegraphCollector, ) + from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.gecko_collector import GeckoCollector from profiling.sampling.collector import extract_lineno, normalize_location from profiling.sampling.opcode_utils import get_opcode_info, format_opcode @@ -1669,6 +1670,86 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) + def test_jsonl_collector_basic(self): + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + collector = JsonlCollector(1000) + run_id = collector.run_id + + self.assertIsNotNone(run_id) + + test_frames1 = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + ) + ], + ) + ] + test_frames2 = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + ) + ], + ) + ] # Same stack + test_frames3 = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [MockFrameInfo("other.py", 5, "other_func")])] + ) + ] + + collector.collect(test_frames1) + collector.collect(test_frames2) + collector.collect(test_frames3) + + with captured_stdout(), captured_stderr(): + collector.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "func1"}, + {"str_id": 2, "value": "file.py"}, + {"str_id": 3, "value": "func2"}, + {"str_id": 4, "value": "other_func"}, + {"str_id": 5, "value": "other.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 10, "end_line": 10}, + {"frame_id": 2, "path_str_id": 2, "func_str_id": 3, + "line": 20, "end_line": 20}, + {"frame_id": 3, "path_str_id": 5, "func_str_id": 4, + "line": 5, "end_line": 5}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 3, + "entries": [{"frame_id": 1, "self": 2, "cumulative": 2}, + {"frame_id": 2, "self": 0, "cumulative": 2}, + {"frame_id": 3, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 3}), + ] + + for exp in expected: + self.assertIn(exp, lines) + class TestRecursiveFunctionHandling(unittest.TestCase): """Tests for correct handling of recursive functions in cumulative stats.""" From 5920559f3e2484bb6ef2ccaf3f5b3a133467e446 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:54:22 +0200 Subject: [PATCH 05/41] separate func for end record --- Lib/profiling/sampling/jsonl_collector.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 3333b7352c9411..59ab3b865c182c 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -71,15 +71,7 @@ def export(self, filename): self._write_chunked_defs(output, "str_def", self._strings) self._write_chunked_defs(output, "frame_def", self._frames) self._write_chunked_agg(output, self._iter_agg_entries()) - self._write_message( - output, - { - "type": "end", - "v": 1, - "run_id": self.run_id, - "samples_total": self._samples_total, - }, - ) + self._write_message(output, self._build_end_record()) def _build_meta_record(self): record = { @@ -94,6 +86,16 @@ def _build_meta_record(self): return record + def _build_end_record(self): + record = { + "type": "end", + "v": 1, + "run_id": self.run_id, + "samples_total": self._samples_total, + } + + return record + def _get_or_create_frame_id(self, filename, location, funcname): synthetic = location is None location_fields = self._normalize_export_location(location) From 28ebd2a64b2ca060c064e6fd207ed03a3af7556f Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 30 Mar 2026 23:14:18 +0200 Subject: [PATCH 06/41] proper name --- .../test_profiling/test_sampling_profiler/test_collectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index bac81bafde7533..2afb9b5a2ca459 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1670,7 +1670,7 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) - def test_jsonl_collector_basic(self): + def test_jsonl_collector_export(self): collapsed_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, collapsed_out) From bc3370b083fe42a2687539ff937c263aca5629fd Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:48:26 +0200 Subject: [PATCH 07/41] test_jsonl_collector_with_location_info --- .../test_sampling_profiler/test_collectors.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 2afb9b5a2ca459..14744be75a0e9b 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2149,6 +2149,56 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) + def test_jsonl_collector_with_location_info(self): + """Test JsonlCollector handles LocationInfo properly.""" + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + collector = JsonlCollector(sample_interval_usec=1000) + run_id = collector.run_id + + # Frame with LocationInfo + frame = MockFrameInfo("test.py", 42, "my_function") + frames = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + + # Should extract lineno from location + with captured_stdout(), captured_stderr(): + collector.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "my_function"}, + {"str_id": 2, "value": "test.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 42, "end_line": 42}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 1, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 1}), + ] + + for exp in expected: + self.assertIn(exp, lines) + class TestOpcodeHandling(unittest.TestCase): """Tests for opcode field handling in collectors.""" From a151578460b53530ca86c00d7c93a2be00d840d9 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:52:15 +0200 Subject: [PATCH 08/41] test synthetic frames --- .../test_sampling_profiler/test_collectors.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 14744be75a0e9b..9d88c5283a44e8 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2149,6 +2149,7 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) + def test_jsonl_collector_with_location_info(self): """Test JsonlCollector handles LocationInfo properly.""" collapsed_out = tempfile.NamedTemporaryFile(delete=False) @@ -2200,6 +2201,59 @@ def jsonl(obj): self.assertIn(exp, lines) + def test_jsonl_collector_with_none_location(self): + """Test JsonlCollector handles None location (synthetic frames).""" + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + collector = JsonlCollector(sample_interval_usec=1000) + run_id = collector.run_id + + # Create frame with None location (like GC frame) + frame = MockFrameInfo("~", 0, "") + frame.location = None # Synthetic frame has no location + frames = [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + + # Should handle None location as synthetic frame + with captured_stdout(), captured_stderr(): + collector.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": ""}, + {"str_id": 2, "value": "~"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 0, "synthetic": True}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 1, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 1}), + ] + + for exp in expected: + self.assertIn(exp, lines) + + class TestOpcodeHandling(unittest.TestCase): """Tests for opcode field handling in collectors.""" From f851de9c9490ac970296ddc97d143efe59393c0a Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:02:08 +0200 Subject: [PATCH 09/41] too many new lines --- .../test_profiling/test_sampling_profiler/test_collectors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 9d88c5283a44e8..4c73c42a811076 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2149,7 +2149,6 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) - def test_jsonl_collector_with_location_info(self): """Test JsonlCollector handles LocationInfo properly.""" collapsed_out = tempfile.NamedTemporaryFile(delete=False) @@ -2200,7 +2199,6 @@ def jsonl(obj): for exp in expected: self.assertIn(exp, lines) - def test_jsonl_collector_with_none_location(self): """Test JsonlCollector handles None location (synthetic frames).""" collapsed_out = tempfile.NamedTemporaryFile(delete=False) From e5831a8cb7c38db37bae5d3211669277595236b5 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:21:42 +0200 Subject: [PATCH 10/41] BUG? confusing... two ways to set skip_idle? --- .../test_sampling_profiler/test_modes.py | 157 +++++++++++++++++- 1 file changed, 155 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 0b38fb4ad4bcf6..67b82eff091d08 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -9,6 +9,7 @@ import profiling.sampling import profiling.sampling.sample from profiling.sampling.pstats_collector import PstatsCollector + from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.cli import main, _parse_mode from profiling.sampling.constants import PROFILING_MODE_EXCEPTION from _remote_debugging import ( @@ -20,9 +21,13 @@ "Test only runs when _remote_debugging is available" ) -from test.support import requires_remote_subprocess_debugging +from test.support import ( + captured_stdout, + captured_stderr, + requires_remote_subprocess_debugging, +) -from .helpers import test_subprocess +from .helpers import close_and_unlink, test_subprocess from .mocks import MockFrameInfo, MockInterpreterInfo @@ -228,6 +233,154 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) + def test_jsonl_collector_rspects_skip_idle(self): + """Test that frames are actually filtered when skip_idle=True.""" + import tempfile + import json + + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + # Create mock frames with different thread statuses + class MockThreadInfoWithStatus: + def __init__(self, thread_id, frame_info, status): + self.thread_id = thread_id + self.frame_info = frame_info + self.status = status + + # Create test data: active thread (HAS_GIL | ON_CPU), idle thread (neither), and another active thread + ACTIVE_STATUS = ( + THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU + ) # Has GIL and on CPU + IDLE_STATUS = 0 # Neither has GIL nor on CPU + + test_frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfoWithStatus( + 1, + [MockFrameInfo("active1.py", 10, "active_func1")], + ACTIVE_STATUS, + ), + MockThreadInfoWithStatus( + 2, + [MockFrameInfo("idle.py", 20, "idle_func")], + IDLE_STATUS, + ), + MockThreadInfoWithStatus( + 3, + [MockFrameInfo("active2.py", 30, "active_func2")], + ACTIVE_STATUS, + ), + ], + ) + ] + + # Test with skip_idle=True - should only process running threads + collector_skip = JsonlCollector( + sample_interval_usec=1000, skip_idle=True + ) + collector_skip.collect(test_frames) + + run_id = collector_skip.run_id + + # Should only have functions from running threads (status 0) + with captured_stdout(), captured_stderr(): + collector_skip.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "active_func1"}, + {"str_id": 2, "value": "active1.py"}, + {"str_id": 3, "value": "idle_func"}, + {"str_id": 4, "value": "idle.py"}, + {"str_id": 5, "value": "active_func2"}, + {"str_id": 6, "value": "active2.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 10, "end_line": 10}, + {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, + "line": 20, "end_line": 20}, + {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, + "line": 30, "end_line": 30}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 3, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, + {"frame_id": 2, "self": 1, "cumulative": 1}, + {"frame_id": 3, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 3}), + ] + + for exp in expected: + self.assertIn(exp, lines) + + # Test with skip_idle=False - should process all threads + collector_no_skip = JsonlCollector( + sample_interval_usec=1000, skip_idle=False + ) + collector_no_skip.collect(test_frames) + + run_id = collector_no_skip.run_id + + # Should have functions from all threads + with captured_stdout(), captured_stderr(): + collector_no_skip.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "active_func1"}, + {"str_id": 2, "value": "active1.py"}, + {"str_id": 3, "value": "idle_func"}, + {"str_id": 4, "value": "idle.py"}, + {"str_id": 5, "value": "active_func2"}, + {"str_id": 6, "value": "active2.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 10, "end_line": 10}, + {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, + "line": 20, "end_line": 20}, + {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, + "line": 30, "end_line": 30}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 3, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, + {"frame_id": 2, "self": 1, "cumulative": 1}, + {"frame_id": 3, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 3}), + ] + + for exp in expected: + self.assertIn(exp, lines) + + # self.assertIn(active1_key, collector_no_skip.result) + # self.assertIn(active2_key, collector_no_skip.result) + # self.assertIn( + # idle_key, collector_no_skip.result + # ) # Idle thread should be included @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): From 9c106a4075ec885fa7e3e89b04d97417bece2e34 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:51:16 +0200 Subject: [PATCH 11/41] ok, thx b4fac15613a16f9cd7b2ee32840523b399f4621f --- .../test_sampling_profiler/test_modes.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 67b82eff091d08..9d792b8d6f20ab 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -305,24 +305,19 @@ def jsonl(obj): jsonl({"type": "str_def", "v": 1, "run_id": run_id, "defs": [{"str_id": 1, "value": "active_func1"}, {"str_id": 2, "value": "active1.py"}, - {"str_id": 3, "value": "idle_func"}, - {"str_id": 4, "value": "idle.py"}, - {"str_id": 5, "value": "active_func2"}, - {"str_id": 6, "value": "active2.py"}]}), + {"str_id": 3, "value": "active_func2"}, + {"str_id": 4, "value": "active2.py"}]}), jsonl({"type": "frame_def", "v": 1, "run_id": run_id, "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, "line": 10, "end_line": 10}, {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, - "line": 20, "end_line": 20}, - {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, "line": 30, "end_line": 30}]}), jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 3, + "kind": "frame", "scope": "final", "samples_total": 2, "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, - {"frame_id": 2, "self": 1, "cumulative": 1}, - {"frame_id": 3, "self": 1, "cumulative": 1}]}), + {"frame_id": 2, "self": 1, "cumulative": 1}]}), jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 3}), + "samples_total": 2}), ] for exp in expected: From 727f05f3eb16f0d0d373d2fc373ae8a0bcdd8910 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:16:36 +0200 Subject: [PATCH 12/41] check if it works fine with (file, loc, func, op) --- .../test_sampling_profiler/test_collectors.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 4c73c42a811076..9982403c329091 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2471,6 +2471,38 @@ def test_gecko_collector_frame_format(self): # Should have recorded 3 functions self.assertEqual(thread["funcTable"]["length"], 3) + def test_jsonl_collector_frame_format(self): + """Test JsonlCollector with 4-element frame format.""" + collector = JsonlCollector(sample_interval_usec=1000) + collector.collect(self._make_sample_frames()) + + with tempfile.NamedTemporaryFile(delete=False) as f: + self.addClassCleanup(close_and_unlink, f) + collector.export(f.name) + + with open(f.name, "r", encoding="utf-8") as fp: + records = [json.loads(line) for line in fp] + + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + + self.assertEqual(len(frame_defs), 3) + + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + funcs = {str_defs[item["func_str_id"]] for item in frame_defs} + + self.assertEqual(paths, {"app.py", "utils.py", "lib.py"}) + self.assertEqual(funcs, {"main", "helper", "process"}) class TestInternalFrameFiltering(unittest.TestCase): """Tests for filtering internal profiler frames from output.""" From 1c6f81a45d61b6b53c558343c0edc9589c649fb2 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:19:44 +0200 Subject: [PATCH 13/41] missing new line --- .../test_profiling/test_sampling_profiler/test_collectors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 9982403c329091..a407e16b9a61be 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2504,6 +2504,7 @@ def test_jsonl_collector_frame_format(self): self.assertEqual(paths, {"app.py", "utils.py", "lib.py"}) self.assertEqual(funcs, {"main", "helper", "process"}) + class TestInternalFrameFiltering(unittest.TestCase): """Tests for filtering internal profiler frames from output.""" From c278f83476b3c412e47721d06040e3cc61c288b5 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:25:20 +0200 Subject: [PATCH 14/41] filter out sync coordinator --- .../test_sampling_profiler/test_collectors.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index a407e16b9a61be..3937fe3b8aa018 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2631,3 +2631,54 @@ def test_collapsed_stack_collector_filters_internal_frames(self): for (call_tree, _), _ in collector.stack_counter.items(): for filename, _, _ in call_tree: self.assertNotIn("_sync_coordinator", filename) + + def test_jsonl_collector_filters_internal_frames(self): + """Test that JsonlCollector filters out internal frames.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(sample_interval_usec=1000) + + frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("app.py", 50, "run"), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo("", 87, "_run_code"), + ], + status=THREAD_STATUS_HAS_GIL, + ) + ], + ) + ] + + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + + self.assertIn("app.py", paths) + self.assertIn("", paths) + + for path in paths: + self.assertNotIn("_sync_coordinator", path) From bf6b4547dca2cc40ecd0535b8081e7a82a1fa950 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:30:12 +0200 Subject: [PATCH 15/41] s/collapsed_out/jsonl_out/, less copying :D --- .../test_sampling_profiler/test_collectors.py | 24 +++++++++---------- .../test_sampling_profiler/test_modes.py | 12 +++++----- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 3937fe3b8aa018..56cc7a2bd1190c 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1671,8 +1671,8 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff_pct"], -50.0) def test_jsonl_collector_export(self): - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(1000) run_id = collector.run_id @@ -1710,10 +1710,10 @@ def test_jsonl_collector_export(self): collector.collect(test_frames3) with captured_stdout(), captured_stderr(): - collector.export(collapsed_out.name) + collector.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") @@ -2151,8 +2151,8 @@ def test_gecko_collector_with_location_info(self): def test_jsonl_collector_with_location_info(self): """Test JsonlCollector handles LocationInfo properly.""" - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) run_id = collector.run_id @@ -2168,10 +2168,10 @@ def test_jsonl_collector_with_location_info(self): # Should extract lineno from location with captured_stdout(), captured_stderr(): - collector.export(collapsed_out.name) + collector.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") @@ -2201,8 +2201,8 @@ def jsonl(obj): def test_jsonl_collector_with_none_location(self): """Test JsonlCollector handles None location (synthetic frames).""" - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) run_id = collector.run_id @@ -2220,10 +2220,10 @@ def test_jsonl_collector_with_none_location(self): # Should handle None location as synthetic frame with captured_stdout(), captured_stderr(): - collector.export(collapsed_out.name) + collector.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 9d792b8d6f20ab..a4c7ed857ce7fb 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -238,8 +238,8 @@ def test_jsonl_collector_rspects_skip_idle(self): import tempfile import json - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) # Create mock frames with different thread statuses class MockThreadInfoWithStatus: @@ -287,10 +287,10 @@ def __init__(self, thread_id, frame_info, status): # Should only have functions from running threads (status 0) with captured_stdout(), captured_stderr(): - collector_skip.export(collapsed_out.name) + collector_skip.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") @@ -333,10 +333,10 @@ def jsonl(obj): # Should have functions from all threads with captured_stdout(), captured_stderr(): - collector_no_skip.export(collapsed_out.name) + collector_no_skip.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") From 9ef96f82e7c7711fa05e0b114a39c46e8f58601b Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:31:24 +0200 Subject: [PATCH 16/41] nicer reading --- .../test_profiling/test_sampling_profiler/test_collectors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 56cc7a2bd1190c..4411bae62fc7ee 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2637,8 +2637,6 @@ def test_jsonl_collector_filters_internal_frames(self): jsonl_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, jsonl_out) - collector = JsonlCollector(sample_interval_usec=1000) - frames = [ MockInterpreterInfo( 0, @@ -2656,6 +2654,7 @@ def test_jsonl_collector_filters_internal_frames(self): ) ] + collector = JsonlCollector(sample_interval_usec=1000) collector.collect(frames) collector.export(jsonl_out.name) From 13dd0f2b03e5b9707d6a250f74a57f73bb366d2a Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:32:23 +0200 Subject: [PATCH 17/41] typo --- Lib/test/test_profiling/test_sampling_profiler/test_modes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index a4c7ed857ce7fb..37cb6c3a5c5ab2 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -233,7 +233,7 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) - def test_jsonl_collector_rspects_skip_idle(self): + def test_jsonl_collector_respects_skip_idle(self): """Test that frames are actually filtered when skip_idle=True.""" import tempfile import json From ae7afe1fa4ddb415ead8dc11833cb7dbcb84ad3e Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:37:39 +0200 Subject: [PATCH 18/41] too much copying, left-over --- Lib/test/test_profiling/test_sampling_profiler/test_modes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 37cb6c3a5c5ab2..2bac26c37091b0 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -371,11 +371,6 @@ def jsonl(obj): for exp in expected: self.assertIn(exp, lines) - # self.assertIn(active1_key, collector_no_skip.result) - # self.assertIn(active2_key, collector_no_skip.result) - # self.assertIn( - # idle_key, collector_no_skip.result - # ) # Idle thread should be included @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): From a8eb46d51ddafd04937cf8b2691fda191a620fd1 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:49:39 +0200 Subject: [PATCH 19/41] just Counter --- Lib/profiling/sampling/jsonl_collector.py | 87 +++++++++++------------ 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 59ab3b865c182c..56539c2a9e2232 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -1,5 +1,6 @@ """JSONL collector.""" +from collections import Counter import json import uuid from itertools import batched @@ -38,8 +39,8 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): self._frame_to_id = {} self._frames = [] - self._frame_self = {} - self._frame_cumulative = {} + self._frame_self = Counter() + self._frame_cumulative = Counter() self._samples_total = 0 self._mode = mode @@ -56,21 +57,39 @@ def process_frames(self, frames, _thread_id, weight=1): ] leaf_frame_id = frame_ids[0] - self._frame_self[leaf_frame_id] = ( - self._frame_self.get(leaf_frame_id, 0) + weight - ) + self._frame_self[leaf_frame_id] += weight for frame_id in set(frame_ids): - self._frame_cumulative[frame_id] = ( - self._frame_cumulative.get(frame_id, 0) + weight - ) + self._frame_cumulative[frame_id] += weight def export(self, filename): with open(filename, "w", encoding="utf-8") as output: self._write_message(output, self._build_meta_record()) - self._write_chunked_defs(output, "str_def", self._strings) - self._write_chunked_defs(output, "frame_def", self._frames) - self._write_chunked_agg(output, self._iter_agg_entries()) + self._write_chunked_records( + output, + {"type": "str_def", "v": 1, "run_id": self.run_id}, + "defs", + self._strings, + ) + self._write_chunked_records( + output, + {"type": "frame_def", "v": 1, "run_id": self.run_id}, + "defs", + self._frames, + ) + self._write_chunked_records( + output, + { + "type": "agg", + "v": 1, + "run_id": self.run_id, + "kind": "frame", + "scope": "final", + "samples_total": self._samples_total, + }, + "entries", + self._iter_agg_entries(), + ) self._write_message(output, self._build_end_record()) def _build_meta_record(self): @@ -171,44 +190,18 @@ def _normalize_export_location(location): return normalized def _iter_agg_entries(self): - entries = [] - for frame_record in self._frames: - frame_id = frame_record["frame_id"] - entries.append( - { - "frame_id": frame_id, - "self": self._frame_self.get(frame_id, 0), - "cumulative": self._frame_cumulative.get(frame_id, 0), - } - ) - return entries - - def _write_chunked_defs(self, output, record_type, entries): - for chunk in batched(entries, _CHUNK_SIZE): - self._write_message( - output, - { - "type": record_type, - "v": 1, - "run_id": self.run_id, - "defs": chunk, - }, - ) + return [ + { + "frame_id": frame_record["frame_id"], + "self": self._frame_self[frame_record["frame_id"]], + "cumulative": self._frame_cumulative[frame_record["frame_id"]], + } + for frame_record in self._frames + ] - def _write_chunked_agg(self, output, entries): + def _write_chunked_records(self, output, base_record, chunk_field, entries): for chunk in batched(entries, _CHUNK_SIZE): - self._write_message( - output, - { - "type": "agg", - "v": 1, - "run_id": self.run_id, - "kind": "frame", - "scope": "final", - "samples_total": self._samples_total, - "entries": chunk, - }, - ) + self._write_message(output, {**base_record, chunk_field: chunk}) @staticmethod def _write_message(output, record): From 9982bb4283798578c2dcbadb0c3752177ee8593a Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:52:11 +0200 Subject: [PATCH 20/41] ruff --- Lib/profiling/sampling/jsonl_collector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 56539c2a9e2232..244501ba446f07 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -199,7 +199,9 @@ def _iter_agg_entries(self): for frame_record in self._frames ] - def _write_chunked_records(self, output, base_record, chunk_field, entries): + def _write_chunked_records( + self, output, base_record, chunk_field, entries + ): for chunk in batched(entries, _CHUNK_SIZE): self._write_message(output, {**base_record, chunk_field: chunk}) From fe298881656505963b2e9d49d3ab3362b7ee3abe Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:56:14 +0200 Subject: [PATCH 21/41] future-proof name --- Lib/profiling/sampling/jsonl_collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 244501ba446f07..7d7b44c8d89407 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -88,7 +88,7 @@ def export(self, filename): "samples_total": self._samples_total, }, "entries", - self._iter_agg_entries(), + self._iter_final_agg_entries(), ) self._write_message(output, self._build_end_record()) @@ -189,7 +189,7 @@ def _normalize_export_location(location): normalized["end_col"] = end_col_offset return normalized - def _iter_agg_entries(self): + def _iter_final_agg_entries(self): return [ { "frame_id": frame_record["frame_id"], From a5192b77d2f56391290780a48771906bc21f64d5 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:56:58 +0200 Subject: [PATCH 22/41] future-proof iter for streaming --- Lib/profiling/sampling/jsonl_collector.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 7d7b44c8d89407..1b318573425edf 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -190,14 +190,13 @@ def _normalize_export_location(location): return normalized def _iter_final_agg_entries(self): - return [ - { - "frame_id": frame_record["frame_id"], - "self": self._frame_self[frame_record["frame_id"]], - "cumulative": self._frame_cumulative[frame_record["frame_id"]], + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + yield { + "frame_id": frame_id, + "self": self._frame_self[frame_id], + "cumulative": self._frame_cumulative[frame_id], } - for frame_record in self._frames - ] def _write_chunked_records( self, output, base_record, chunk_field, entries From 1d53e16b5a634fdea156649cd624b702c5947cfe Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:13:23 +0200 Subject: [PATCH 23/41] truth to be told, this should be layer above --- Lib/profiling/sampling/collector.py | 5 ++- Lib/profiling/sampling/jsonl_collector.py | 42 +++++++------------ .../test_sampling_profiler/test_collectors.py | 5 +++ 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index 08759b611696b7..ad5be46821c096 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -20,13 +20,16 @@ def normalize_location(location): """Normalize location to a 4-tuple format. Args: - location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None + location: tuple (lineno, end_lineno, col_offset, end_col_offset), + an integer line number, or None Returns: tuple: (lineno, end_lineno, col_offset, end_col_offset) """ if location is None: return DEFAULT_LOCATION + if isinstance(location, int): + return (location, location, -1, -1) return location diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 1b318573425edf..6c8f2bc2fd3135 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -12,6 +12,7 @@ PROFILING_MODE_GIL, PROFILING_MODE_WALL, ) +from .collector import normalize_location from .stack_collector import StackTraceCollector @@ -117,7 +118,7 @@ def _build_end_record(self): def _get_or_create_frame_id(self, filename, location, funcname): synthetic = location is None - location_fields = self._normalize_export_location(location) + location_fields = self._location_to_export_fields(location) func_str_id = self._intern_string(funcname) path_str_id = self._intern_string(filename) @@ -160,34 +161,19 @@ def _intern_string(self, value): return string_id @staticmethod - def _normalize_export_location(location): - if location is None: - return {"line": 0} - - if isinstance(location, int): - return {"line": max(location, 0)} - - if not isinstance(location, tuple): - lineno = getattr(location, "lineno", 0) - location = ( - lineno, - getattr(location, "end_lineno", lineno), - getattr(location, "col_offset", -1), - getattr(location, "end_col_offset", -1), - ) + def _location_to_export_fields(location): + lineno, end_lineno, col_offset, end_col_offset = normalize_location( + location + ) - lineno, end_lineno, col_offset, end_col_offset = location - if not isinstance(lineno, int) or lineno <= 0: - return {"line": 0} - - normalized = {"line": lineno} - if isinstance(end_lineno, int) and end_lineno > 0: - normalized["end_line"] = end_lineno - if isinstance(col_offset, int) and col_offset >= 0: - normalized["col"] = col_offset - if isinstance(end_col_offset, int) and end_col_offset >= 0: - normalized["end_col"] = end_col_offset - return normalized + fields = {"line": lineno} + if end_lineno > 0: + fields["end_line"] = end_lineno + if col_offset >= 0: + fields["col"] = col_offset + if end_col_offset >= 0: + fields["end_col"] = end_col_offset + return fields def _iter_final_agg_entries(self): for frame_record in self._frames: diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 4411bae62fc7ee..a458475fc46d58 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1959,6 +1959,11 @@ def test_extract_lineno_from_none(self): """Test extracting lineno from None (synthetic frames).""" self.assertEqual(extract_lineno(None), 0) + def test_normalize_location_with_int(self): + """Test normalize_location expands a legacy integer line number.""" + result = normalize_location(42) + self.assertEqual(result, (42, 42, -1, -1)) + def test_normalize_location_with_location_info(self): """Test normalize_location passes through LocationInfo.""" loc = LocationInfo(10, 15, 0, 5) From 4b477c0dacc1eb0ab93342de4757a6fae41c0ebd Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:20:48 +0200 Subject: [PATCH 24/41] helper --- .../test_sampling_profiler/test_collectors.py | 255 +++++++++--------- .../test_sampling_profiler/test_modes.py | 148 +--------- 2 files changed, 130 insertions(+), 273 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index a458475fc46d58..f1d005dff8c976 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -58,6 +58,25 @@ def find_child_by_name(children, strings, substr): return None +def _jsonl_tables(records): + meta = next(record for record in records if record["type"] == "meta") + end = next(record for record in records if record["type"] == "end") + agg = next(record for record in records if record["type"] == "agg") + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + return meta, str_defs, frame_defs, agg, end + + class TestSampleProfilerComponents(unittest.TestCase): """Unit tests for individual profiler components.""" @@ -1670,14 +1689,12 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) - def test_jsonl_collector_export(self): + def test_jsonl_collector_export_exact_output(self): jsonl_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(1000) - run_id = collector.run_id - - self.assertIsNotNone(run_id) + collector.run_id = "run-123" test_frames1 = [ MockInterpreterInfo( @@ -1709,46 +1726,74 @@ def test_jsonl_collector_export(self): collector.collect(test_frames2) collector.collect(test_frames3) - with captured_stdout(), captured_stderr(): - collector.export(jsonl_out.name) + collector.export(jsonl_out.name) - # Check file contents - with open(jsonl_out.name, "r") as f: + with open(jsonl_out.name, "r", encoding="utf-8") as f: content = f.read() - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "func1"}, - {"str_id": 2, "value": "file.py"}, - {"str_id": 3, "value": "func2"}, - {"str_id": 4, "value": "other_func"}, - {"str_id": 5, "value": "other.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 10, "end_line": 10}, - {"frame_id": 2, "path_str_id": 2, "func_str_id": 3, - "line": 20, "end_line": 20}, - {"frame_id": 3, "path_str_id": 5, "func_str_id": 4, - "line": 5, "end_line": 5}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 3, - "entries": [{"frame_id": 1, "self": 2, "cumulative": 2}, - {"frame_id": 2, "self": 0, "cumulative": 2}, - {"frame_id": 3, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 3}), - ] - - for exp in expected: - self.assertIn(exp, lines) + self.assertEqual( + content, + ( + '{"type":"meta","v":1,"run_id":"run-123","sample_interval_usec":1000}\n' + '{"type":"str_def","v":1,"run_id":"run-123","defs":[{"str_id":1,"value":"func1"},{"str_id":2,"value":"file.py"},{"str_id":3,"value":"func2"},{"str_id":4,"value":"other_func"},{"str_id":5,"value":"other.py"}]}\n' + '{"type":"frame_def","v":1,"run_id":"run-123","defs":[{"frame_id":1,"path_str_id":2,"func_str_id":1,"line":10,"end_line":10},{"frame_id":2,"path_str_id":2,"func_str_id":3,"line":20,"end_line":20},{"frame_id":3,"path_str_id":5,"func_str_id":4,"line":5,"end_line":5}]}\n' + '{"type":"agg","v":1,"run_id":"run-123","kind":"frame","scope":"final","samples_total":3,"entries":[{"frame_id":1,"self":2,"cumulative":2},{"frame_id":2,"self":0,"cumulative":2},{"frame_id":3,"self":1,"cumulative":1}]}\n' + '{"type":"end","v":1,"run_id":"run-123","samples_total":3}\n' + ), + ) + + def test_jsonl_collector_skip_idle_filters_threads(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + active_status = THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU + frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [MockFrameInfo("active1.py", 10, "active_func1")], + status=active_status, + ), + MockThreadInfo( + 2, + [MockFrameInfo("idle.py", 20, "idle_func")], + status=0, + ), + MockThreadInfo( + 3, + [MockFrameInfo("active2.py", 30, "active_func2")], + status=active_status, + ), + ], + ) + ] + + def export_summary(skip_idle): + collector = JsonlCollector(1000, skip_idle=skip_idle) + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, agg_record, _ = _jsonl_tables(records) + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + funcs = {str_defs[item["func_str_id"]] for item in frame_defs} + return paths, funcs, agg_record["samples_total"] + + paths, funcs, samples_total = export_summary(skip_idle=True) + self.assertEqual(paths, {"active1.py", "active2.py"}) + self.assertEqual(funcs, {"active_func1", "active_func2"}) + self.assertEqual(samples_total, 2) + + paths, funcs, samples_total = export_summary(skip_idle=False) + self.assertEqual(paths, {"active1.py", "idle.py", "active2.py"}) + self.assertEqual( + funcs, {"active_func1", "idle_func", "active_func2"} + ) + self.assertEqual(samples_total, 3) class TestRecursiveFunctionHandling(unittest.TestCase): @@ -2160,7 +2205,6 @@ def test_jsonl_collector_with_location_info(self): self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) - run_id = collector.run_id # Frame with LocationInfo frame = MockFrameInfo("test.py", 42, "my_function") @@ -2171,38 +2215,28 @@ def test_jsonl_collector_with_location_info(self): ] collector.collect(frames) - # Should extract lineno from location - with captured_stdout(), captured_stderr(): - collector.export(jsonl_out.name) + collector.export(jsonl_out.name) - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "my_function"}, - {"str_id": 2, "value": "test.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 42, "end_line": 42}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 1, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 1}), - ] - - for exp in expected: - self.assertIn(exp, lines) + meta, str_defs, frame_defs, agg, end = _jsonl_tables(records) + self.assertEqual(meta["sample_interval_usec"], 1000) + self.assertEqual(agg["samples_total"], 1) + self.assertEqual(end["samples_total"], 1) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(str_defs[frame_defs[0]["path_str_id"]], "test.py") + self.assertEqual(str_defs[frame_defs[0]["func_str_id"]], "my_function") + self.assertEqual( + frame_defs[0], + { + "frame_id": 1, + "path_str_id": frame_defs[0]["path_str_id"], + "func_str_id": frame_defs[0]["func_str_id"], + "line": 42, + "end_line": 42, + }, + ) def test_jsonl_collector_with_none_location(self): """Test JsonlCollector handles None location (synthetic frames).""" @@ -2210,7 +2244,6 @@ def test_jsonl_collector_with_none_location(self): self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) - run_id = collector.run_id # Create frame with None location (like GC frame) frame = MockFrameInfo("~", 0, "") @@ -2223,38 +2256,28 @@ def test_jsonl_collector_with_none_location(self): ] collector.collect(frames) - # Should handle None location as synthetic frame - with captured_stdout(), captured_stderr(): - collector.export(jsonl_out.name) - - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() - - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) + collector.export(jsonl_out.name) - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": ""}, - {"str_id": 2, "value": "~"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 0, "synthetic": True}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 1, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 1}), - ] + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] - for exp in expected: - self.assertIn(exp, lines) + meta, str_defs, frame_defs, agg, end = _jsonl_tables(records) + self.assertEqual(meta["sample_interval_usec"], 1000) + self.assertEqual(agg["samples_total"], 1) + self.assertEqual(end["samples_total"], 1) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(str_defs[frame_defs[0]["path_str_id"]], "~") + self.assertEqual(str_defs[frame_defs[0]["func_str_id"]], "") + self.assertEqual( + frame_defs[0], + { + "frame_id": 1, + "path_str_id": frame_defs[0]["path_str_id"], + "func_str_id": frame_defs[0]["func_str_id"], + "line": 0, + "synthetic": True, + }, + ) class TestOpcodeHandling(unittest.TestCase): @@ -2488,18 +2511,7 @@ def test_jsonl_collector_frame_format(self): with open(f.name, "r", encoding="utf-8") as fp: records = [json.loads(line) for line in fp] - str_defs = { - item["str_id"]: item["value"] - for record in records - if record["type"] == "str_def" - for item in record["defs"] - } - frame_defs = [ - item - for record in records - if record["type"] == "frame_def" - for item in record["defs"] - ] + _, str_defs, frame_defs, _, _ = _jsonl_tables(records) self.assertEqual(len(frame_defs), 3) @@ -2666,18 +2678,7 @@ def test_jsonl_collector_filters_internal_frames(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - str_defs = { - item["str_id"]: item["value"] - for record in records - if record["type"] == "str_def" - for item in record["defs"] - } - frame_defs = [ - item - for record in records - if record["type"] == "frame_def" - for item in record["defs"] - ] + _, str_defs, frame_defs, _, _ = _jsonl_tables(records) paths = {str_defs[item["path_str_id"]] for item in frame_defs} diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 2bac26c37091b0..6cd636593e3db1 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -9,7 +9,6 @@ import profiling.sampling import profiling.sampling.sample from profiling.sampling.pstats_collector import PstatsCollector - from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.cli import main, _parse_mode from profiling.sampling.constants import PROFILING_MODE_EXCEPTION from _remote_debugging import ( @@ -21,13 +20,9 @@ "Test only runs when _remote_debugging is available" ) -from test.support import ( - captured_stdout, - captured_stderr, - requires_remote_subprocess_debugging, -) +from test.support import requires_remote_subprocess_debugging -from .helpers import close_and_unlink, test_subprocess +from .helpers import test_subprocess from .mocks import MockFrameInfo, MockInterpreterInfo @@ -233,145 +228,6 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) - def test_jsonl_collector_respects_skip_idle(self): - """Test that frames are actually filtered when skip_idle=True.""" - import tempfile - import json - - jsonl_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, jsonl_out) - - # Create mock frames with different thread statuses - class MockThreadInfoWithStatus: - def __init__(self, thread_id, frame_info, status): - self.thread_id = thread_id - self.frame_info = frame_info - self.status = status - - # Create test data: active thread (HAS_GIL | ON_CPU), idle thread (neither), and another active thread - ACTIVE_STATUS = ( - THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU - ) # Has GIL and on CPU - IDLE_STATUS = 0 # Neither has GIL nor on CPU - - test_frames = [ - MockInterpreterInfo( - 0, - [ - MockThreadInfoWithStatus( - 1, - [MockFrameInfo("active1.py", 10, "active_func1")], - ACTIVE_STATUS, - ), - MockThreadInfoWithStatus( - 2, - [MockFrameInfo("idle.py", 20, "idle_func")], - IDLE_STATUS, - ), - MockThreadInfoWithStatus( - 3, - [MockFrameInfo("active2.py", 30, "active_func2")], - ACTIVE_STATUS, - ), - ], - ) - ] - - # Test with skip_idle=True - should only process running threads - collector_skip = JsonlCollector( - sample_interval_usec=1000, skip_idle=True - ) - collector_skip.collect(test_frames) - - run_id = collector_skip.run_id - - # Should only have functions from running threads (status 0) - with captured_stdout(), captured_stderr(): - collector_skip.export(jsonl_out.name) - - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() - - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "active_func1"}, - {"str_id": 2, "value": "active1.py"}, - {"str_id": 3, "value": "active_func2"}, - {"str_id": 4, "value": "active2.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 10, "end_line": 10}, - {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, - "line": 30, "end_line": 30}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 2, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, - {"frame_id": 2, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 2}), - ] - - for exp in expected: - self.assertIn(exp, lines) - - # Test with skip_idle=False - should process all threads - collector_no_skip = JsonlCollector( - sample_interval_usec=1000, skip_idle=False - ) - collector_no_skip.collect(test_frames) - - run_id = collector_no_skip.run_id - - # Should have functions from all threads - with captured_stdout(), captured_stderr(): - collector_no_skip.export(jsonl_out.name) - - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() - - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "active_func1"}, - {"str_id": 2, "value": "active1.py"}, - {"str_id": 3, "value": "idle_func"}, - {"str_id": 4, "value": "idle.py"}, - {"str_id": 5, "value": "active_func2"}, - {"str_id": 6, "value": "active2.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 10, "end_line": 10}, - {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, - "line": 20, "end_line": 20}, - {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, - "line": 30, "end_line": 30}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 3, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, - {"frame_id": 2, "self": 1, "cumulative": 1}, - {"frame_id": 3, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 3}), - ] - - for exp in expected: - self.assertIn(exp, lines) - - @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): """Test GIL mode filtering functionality (--mode=gil).""" From e14f6f17a1c2062519db71f8d21ffda159c28a41 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:31:15 +0200 Subject: [PATCH 25/41] reorder --- Lib/profiling/sampling/jsonl_collector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 6c8f2bc2fd3135..372205a566afc6 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -116,6 +116,15 @@ def _build_end_record(self): return record + def _iter_final_agg_entries(self): + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + yield { + "frame_id": frame_id, + "self": self._frame_self[frame_id], + "cumulative": self._frame_cumulative[frame_id], + } + def _get_or_create_frame_id(self, filename, location, funcname): synthetic = location is None location_fields = self._location_to_export_fields(location) @@ -175,15 +184,6 @@ def _location_to_export_fields(location): fields["end_col"] = end_col_offset return fields - def _iter_final_agg_entries(self): - for frame_record in self._frames: - frame_id = frame_record["frame_id"] - yield { - "frame_id": frame_id, - "self": self._frame_self[frame_id], - "cumulative": self._frame_cumulative[frame_id], - } - def _write_chunked_records( self, output, base_record, chunk_field, entries ): From cf6aa9e296d42b19ba496bae23ee44b4e26a077f Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:46:44 +0200 Subject: [PATCH 26/41] eh, just copy from heatmap --- Lib/profiling/sampling/jsonl_collector.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 372205a566afc6..146075b00b121c 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -43,6 +43,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): self._frame_self = Counter() self._frame_cumulative = Counter() self._samples_total = 0 + self._seen_frame_ids = set() self._mode = mode @@ -51,17 +52,21 @@ def process_frames(self, frames, _thread_id, weight=1): return self._samples_total += weight + self._seen_frame_ids.clear() - frame_ids = [ - self._get_or_create_frame_id(filename, location, funcname) - for filename, location, funcname, _opcode in frames - ] - leaf_frame_id = frame_ids[0] + for i, (filename, location, funcname, _opcode) in enumerate(frames): + frame_id = self._get_or_create_frame_id(filename, location, funcname) + is_leaf = (i == 0) + count_cumulative = frame_id not in self._seen_frame_ids - self._frame_self[leaf_frame_id] += weight + if count_cumulative: + self._seen_frame_ids.add(frame_id) - for frame_id in set(frame_ids): - self._frame_cumulative[frame_id] += weight + if is_leaf: + self._frame_self[frame_id] += weight + + if count_cumulative: + self._frame_cumulative[frame_id] += weight def export(self, filename): with open(filename, "w", encoding="utf-8") as output: From 1f4c7660eabae2dbc8b07e72cbf3e71f47006deb Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:54:14 +0200 Subject: [PATCH 27/41] smaller chunk; matter of taste --- Lib/profiling/sampling/jsonl_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 146075b00b121c..12d8e4b3e2e77b 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -16,7 +16,7 @@ from .stack_collector import StackTraceCollector -_CHUNK_SIZE = 1000 +_CHUNK_SIZE = 256 _MODE_NAMES = { PROFILING_MODE_WALL: "wall", From ba5712e2d47eb8b9ffaac180619a7b3dc3ccbd4b Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:54:27 +0200 Subject: [PATCH 28/41] test actual chunking --- .../test_sampling_profiler/test_collectors.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index f1d005dff8c976..d34cb33e21b81b 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1795,6 +1795,49 @@ def export_summary(skip_idle): ) self.assertEqual(samples_total, 3) + def test_jsonl_collector_splits_large_exports_into_chunks(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + + for i in range(257): + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [MockFrameInfo(f"file{i}.py", i + 1, f"func{i}")], + ) + ], + ) + ] + ) + + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + run_ids = {record["run_id"] for record in records} + self.assertEqual(len(run_ids), 1) + self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") + + _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables(records) + str_chunks = [record for record in records if record["type"] == "str_def"] + frame_chunks = [record for record in records if record["type"] == "frame_def"] + agg_chunks = [record for record in records if record["type"] == "agg"] + + self.assertEqual([len(record["defs"]) for record in str_chunks], [256, 256, 2]) + self.assertEqual([len(record["defs"]) for record in frame_chunks], [256, 1]) + self.assertEqual([len(record["entries"]) for record in agg_chunks], [256, 1]) + self.assertEqual(len(str_defs), 514) + self.assertEqual(len(frame_defs), 257) + self.assertEqual(agg_record["samples_total"], 257) + self.assertEqual(end_record["samples_total"], 257) + class TestRecursiveFunctionHandling(unittest.TestCase): """Tests for correct handling of recursive functions in cumulative stats.""" From 3cacc309140da450a9963b7d77c8c0e7ebe55ca9 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:00:29 +0200 Subject: [PATCH 29/41] test edge cases --- .../test_sampling_profiler/test_collectors.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index d34cb33e21b81b..8d80ad16982d7d 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1742,6 +1742,80 @@ def test_jsonl_collector_export_exact_output(self): ), ) + def test_jsonl_collector_export_includes_mode_in_meta(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000, mode=PROFILING_MODE_CPU) + collector.collect( + [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [MockFrameInfo("file.py", 10, "func")])], + ) + ] + ) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + meta_record = next(record for record in records if record["type"] == "meta") + self.assertEqual(meta_record["mode"], "cpu") + + def test_jsonl_collector_export_empty_profile(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.run_id = "run-123" + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + self.assertEqual([record["type"] for record in records], ["meta", "end"]) + self.assertEqual(records[0]["sample_interval_usec"], 1000) + self.assertEqual(records[0]["run_id"], "run-123") + self.assertEqual(records[1]["samples_total"], 0) + self.assertEqual(records[1]["run_id"], "run-123") + + def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("recursive.py", 10, "recursive_func"), + MockFrameInfo("recursive.py", 10, "recursive_func"), + MockFrameInfo("recursive.py", 10, "recursive_func"), + ], + ) + ], + ) + ] + ) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, agg_record, end_record = _jsonl_tables(records) + self.assertEqual(len(frame_defs), 1) + self.assertEqual( + agg_record["entries"], + [{"frame_id": frame_defs[0]["frame_id"], "self": 1, "cumulative": 1}], + ) + self.assertEqual(agg_record["samples_total"], 1) + self.assertEqual(end_record["samples_total"], 1) + def test_jsonl_collector_skip_idle_filters_threads(self): jsonl_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, jsonl_out) From 4d48f5838dc826c93682683a3799d96504810090 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:05:30 +0200 Subject: [PATCH 30/41] ruff --- Lib/profiling/sampling/jsonl_collector.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 12d8e4b3e2e77b..a1d37df85c2672 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -55,8 +55,10 @@ def process_frames(self, frames, _thread_id, weight=1): self._seen_frame_ids.clear() for i, (filename, location, funcname, _opcode) in enumerate(frames): - frame_id = self._get_or_create_frame_id(filename, location, funcname) - is_leaf = (i == 0) + frame_id = self._get_or_create_frame_id( + filename, location, funcname + ) + is_leaf = i == 0 count_cumulative = frame_id not in self._seen_frame_ids if count_cumulative: From 3ea253bba7bcb0b3cc0cc607a817873375e233d0 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:16:37 +0200 Subject: [PATCH 31/41] match pep8 --- Lib/test/test_profiling/test_sampling_profiler/test_modes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 6cd636593e3db1..0b38fb4ad4bcf6 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -228,6 +228,7 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) + @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): """Test GIL mode filtering functionality (--mode=gil).""" From 308ca86e221d7ab1a289a490bbe341e89a98f051 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:44:27 +0200 Subject: [PATCH 32/41] style --- Lib/profiling/sampling/binary_reader.py | 2 +- .../test_sampling_profiler/test_collectors.py | 124 +++++++++++++----- 2 files changed, 94 insertions(+), 32 deletions(-) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index 8d1d8eef9155eb..a29dad91ae339d 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -118,7 +118,7 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) - elif output_format == 'jsonl': + elif output_format == "jsonl": collector = JsonlCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 8d80ad16982d7d..b6d09301240e65 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1701,7 +1701,11 @@ def test_jsonl_collector_export_exact_output(self): 0, [ MockThreadInfo( - 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + 1, + [ + MockFrameInfo("file.py", 10, "func1"), + MockFrameInfo("file.py", 20, "func2"), + ], ) ], ) @@ -1711,14 +1715,23 @@ def test_jsonl_collector_export_exact_output(self): 0, [ MockThreadInfo( - 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + 1, + [ + MockFrameInfo("file.py", 10, "func1"), + MockFrameInfo("file.py", 20, "func2"), + ], ) ], ) ] # Same stack test_frames3 = [ MockInterpreterInfo( - 0, [MockThreadInfo(1, [MockFrameInfo("other.py", 5, "other_func")])] + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("other.py", 5, "other_func")] + ) + ], ) ] @@ -1751,7 +1764,11 @@ def test_jsonl_collector_export_includes_mode_in_meta(self): [ MockInterpreterInfo( 0, - [MockThreadInfo(1, [MockFrameInfo("file.py", 10, "func")])], + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func")] + ) + ], ) ] ) @@ -1760,7 +1777,9 @@ def test_jsonl_collector_export_includes_mode_in_meta(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - meta_record = next(record for record in records if record["type"] == "meta") + meta_record = next( + record for record in records if record["type"] == "meta" + ) self.assertEqual(meta_record["mode"], "cpu") def test_jsonl_collector_export_empty_profile(self): @@ -1774,7 +1793,9 @@ def test_jsonl_collector_export_empty_profile(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - self.assertEqual([record["type"] for record in records], ["meta", "end"]) + self.assertEqual( + [record["type"] for record in records], ["meta", "end"] + ) self.assertEqual(records[0]["sample_interval_usec"], 1000) self.assertEqual(records[0]["run_id"], "run-123") self.assertEqual(records[1]["samples_total"], 0) @@ -1793,9 +1814,15 @@ def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): MockThreadInfo( 1, [ - MockFrameInfo("recursive.py", 10, "recursive_func"), - MockFrameInfo("recursive.py", 10, "recursive_func"), - MockFrameInfo("recursive.py", 10, "recursive_func"), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), ], ) ], @@ -1811,7 +1838,13 @@ def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): self.assertEqual(len(frame_defs), 1) self.assertEqual( agg_record["entries"], - [{"frame_id": frame_defs[0]["frame_id"], "self": 1, "cumulative": 1}], + [ + { + "frame_id": frame_defs[0]["frame_id"], + "self": 1, + "cumulative": 1, + } + ], ) self.assertEqual(agg_record["samples_total"], 1) self.assertEqual(end_record["samples_total"], 1) @@ -1864,9 +1897,7 @@ def export_summary(skip_idle): paths, funcs, samples_total = export_summary(skip_idle=False) self.assertEqual(paths, {"active1.py", "idle.py", "active2.py"}) - self.assertEqual( - funcs, {"active_func1", "idle_func", "active_func2"} - ) + self.assertEqual(funcs, {"active_func1", "idle_func", "active_func2"}) self.assertEqual(samples_total, 3) def test_jsonl_collector_splits_large_exports_into_chunks(self): @@ -1883,7 +1914,11 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): [ MockThreadInfo( 1, - [MockFrameInfo(f"file{i}.py", i + 1, f"func{i}")], + [ + MockFrameInfo( + f"file{i}.py", i + 1, f"func{i}" + ) + ], ) ], ) @@ -1899,14 +1934,26 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): self.assertEqual(len(run_ids), 1) self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") - _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables(records) - str_chunks = [record for record in records if record["type"] == "str_def"] - frame_chunks = [record for record in records if record["type"] == "frame_def"] + _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables( + records + ) + str_chunks = [ + record for record in records if record["type"] == "str_def" + ] + frame_chunks = [ + record for record in records if record["type"] == "frame_def" + ] agg_chunks = [record for record in records if record["type"] == "agg"] - self.assertEqual([len(record["defs"]) for record in str_chunks], [256, 256, 2]) - self.assertEqual([len(record["defs"]) for record in frame_chunks], [256, 1]) - self.assertEqual([len(record["entries"]) for record in agg_chunks], [256, 1]) + self.assertEqual( + [len(record["defs"]) for record in str_chunks], [256, 256, 2] + ) + self.assertEqual( + [len(record["defs"]) for record in frame_chunks], [256, 1] + ) + self.assertEqual( + [len(record["entries"]) for record in agg_chunks], [256, 1] + ) self.assertEqual(len(str_defs), 514) self.assertEqual(len(frame_defs), 257) self.assertEqual(agg_record["samples_total"], 257) @@ -2075,7 +2122,9 @@ def test_pstats_collector_cumulative_percentage_cannot_exceed_100(self): cumulative_calls = stats[1] self.assertEqual(cumulative_calls, 10) - def test_pstats_collector_different_lines_same_function_counted_separately(self): + def test_pstats_collector_different_lines_same_function_counted_separately( + self, + ): """Test that different line numbers in same function are tracked separately.""" collector = PstatsCollector(sample_interval_usec=1000) @@ -2282,8 +2331,7 @@ def test_flamegraph_collector_with_location_info(self): frame = MockFrameInfo("app.py", 100, "process_data") frames = [ MockInterpreterInfo( - 0, - [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) @@ -2291,8 +2339,15 @@ def test_flamegraph_collector_with_location_info(self): data = collector._convert_to_flamegraph_format() # Verify the function name includes lineno from location strings = data.get("strings", []) - name_found = any("process_data" in s and "100" in s for s in strings if isinstance(s, str)) - self.assertTrue(name_found, f"Expected to find 'process_data' with line 100 in {strings}") + name_found = any( + "process_data" in s and "100" in s + for s in strings + if isinstance(s, str) + ) + self.assertTrue( + name_found, + f"Expected to find 'process_data' with line 100 in {strings}", + ) def test_gecko_collector_with_location_info(self): """Test GeckoCollector handles LocationInfo properly.""" @@ -2301,8 +2356,7 @@ def test_gecko_collector_with_location_info(self): frame = MockFrameInfo("server.py", 50, "handle_request") frames = [ MockInterpreterInfo( - 0, - [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) @@ -2565,8 +2619,12 @@ def _make_sample_frames(self): 1, [ MockFrameInfo("app.py", 100, "main", opcode=90), - MockFrameInfo("utils.py", 50, "helper", opcode=100), - MockFrameInfo("lib.py", 25, "process", opcode=None), + MockFrameInfo( + "utils.py", 50, "helper", opcode=100 + ), + MockFrameInfo( + "lib.py", 25, "process", opcode=None + ), ], status=THREAD_STATUS_HAS_GIL, ) @@ -2724,7 +2782,9 @@ def test_flamegraph_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo( + "/lib/_sync_coordinator.py", 100, "main" + ), MockFrameInfo("", 87, "_run_code"), ], status=THREAD_STATUS_HAS_GIL, @@ -2752,7 +2812,9 @@ def test_collapsed_stack_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo( + "/lib/_sync_coordinator.py", 100, "main" + ), ], status=THREAD_STATUS_HAS_GIL, ) From 0db38a1bbe2f11702638c78f50a97c59efb5b68a Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:49:26 +0200 Subject: [PATCH 33/41] too defensive --- Lib/profiling/sampling/jsonl_collector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index a1d37df85c2672..187c4175da6816 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -48,9 +48,6 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): self._mode = mode def process_frames(self, frames, _thread_id, weight=1): - if not frames: - return - self._samples_total += weight self._seen_frame_ids.clear() From 4c768b4f135cfc07549d30220c6a04e160a04a2f Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:16:15 +0200 Subject: [PATCH 34/41] too many style changes --- .../test_sampling_profiler/test_collectors.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index b6d09301240e65..3134bc1a946f9e 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2122,9 +2122,7 @@ def test_pstats_collector_cumulative_percentage_cannot_exceed_100(self): cumulative_calls = stats[1] self.assertEqual(cumulative_calls, 10) - def test_pstats_collector_different_lines_same_function_counted_separately( - self, - ): + def test_pstats_collector_different_lines_same_function_counted_separately(self): """Test that different line numbers in same function are tracked separately.""" collector = PstatsCollector(sample_interval_usec=1000) @@ -2331,7 +2329,8 @@ def test_flamegraph_collector_with_location_info(self): frame = MockFrameInfo("app.py", 100, "process_data") frames = [ MockInterpreterInfo( - 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) @@ -2339,15 +2338,8 @@ def test_flamegraph_collector_with_location_info(self): data = collector._convert_to_flamegraph_format() # Verify the function name includes lineno from location strings = data.get("strings", []) - name_found = any( - "process_data" in s and "100" in s - for s in strings - if isinstance(s, str) - ) - self.assertTrue( - name_found, - f"Expected to find 'process_data' with line 100 in {strings}", - ) + name_found = any("process_data" in s and "100" in s for s in strings if isinstance(s, str)) + self.assertTrue(name_found, f"Expected to find 'process_data' with line 100 in {strings}") def test_gecko_collector_with_location_info(self): """Test GeckoCollector handles LocationInfo properly.""" @@ -2356,7 +2348,8 @@ def test_gecko_collector_with_location_info(self): frame = MockFrameInfo("server.py", 50, "handle_request") frames = [ MockInterpreterInfo( - 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) From 5e86f4f8e7cbeeaaf139347a99f0df60d649277e Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:18:14 +0200 Subject: [PATCH 35/41] less style --- .../test_sampling_profiler/test_collectors.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 3134bc1a946f9e..1bb9884c1e0ed8 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2612,12 +2612,8 @@ def _make_sample_frames(self): 1, [ MockFrameInfo("app.py", 100, "main", opcode=90), - MockFrameInfo( - "utils.py", 50, "helper", opcode=100 - ), - MockFrameInfo( - "lib.py", 25, "process", opcode=None - ), + MockFrameInfo("utils.py", 50, "helper", opcode=100), + MockFrameInfo("lib.py", 25, "process", opcode=None), ], status=THREAD_STATUS_HAS_GIL, ) @@ -2805,9 +2801,7 @@ def test_collapsed_stack_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo( - "/lib/_sync_coordinator.py", 100, "main" - ), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), ], status=THREAD_STATUS_HAS_GIL, ) From 25eb558e98a16ca35a0e66e89219d601200955b7 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:20:22 +0200 Subject: [PATCH 36/41] ha! even less style... --- .../test_profiling/test_sampling_profiler/test_collectors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 1bb9884c1e0ed8..51c2eb73a20764 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2771,9 +2771,7 @@ def test_flamegraph_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo( - "/lib/_sync_coordinator.py", 100, "main" - ), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), MockFrameInfo("", 87, "_run_code"), ], status=THREAD_STATUS_HAS_GIL, From d25b4d506b9c93123389ba9b7c65fdb5e673a84f Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:33:18 +0200 Subject: [PATCH 37/41] news --- .../Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst new file mode 100644 index 00000000000000..d2d7e0d98d158b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -0,0 +1,3 @@ +The ``profiling.sampling`` module now supports JSONL output format via +`--jsonl`. Each run emits newline-delimited JSON records suitable for +streaming or agents. From 0c0089afa6f506242ff9d5699466181714e2fbc9 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:33:34 +0200 Subject: [PATCH 38/41] news: proper formatting --- .../next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst index d2d7e0d98d158b..d270cc14288d8a 100644 --- a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -1,3 +1,3 @@ The ``profiling.sampling`` module now supports JSONL output format via -`--jsonl`. Each run emits newline-delimited JSON records suitable for +``--jsonl``. Each run emits newline-delimited JSON records suitable for streaming or agents. From 5690ddf8354667aa904e6647faef07ca722afbb0 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 4 Apr 2026 14:42:54 +0200 Subject: [PATCH 39/41] claim credit! --- .../next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst index d270cc14288d8a..f75102eeae16e6 100644 --- a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -1,3 +1,3 @@ The ``profiling.sampling`` module now supports JSONL output format via ``--jsonl``. Each run emits newline-delimited JSON records suitable for -streaming or agents. +streaming or agents. Patch by Maurycy Pawłowski-Wieroński. From 8e1d83bb97942c7b8c0087847eb6e3714c6b7741 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 4 May 2026 23:51:03 +0100 Subject: [PATCH 40/41] fixup! claim credit! --- Lib/profiling/sampling/cli.py | 2 +- Lib/profiling/sampling/collector.py | 5 +- Lib/profiling/sampling/jsonl_collector.py | 75 +++++++- .../test_sampling_profiler/helpers.py | 26 +++ .../test_binary_format.py | 70 +++++++- .../test_sampling_profiler/test_cli.py | 77 ++++++++- .../test_sampling_profiler/test_collectors.py | 160 +++++++++++++++--- ...-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst | 5 +- Modules/_remote_debugging/binary_io_reader.c | 6 +- 9 files changed, 387 insertions(+), 39 deletions(-) diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index ccefd2402edc8e..0648713edc52af 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -496,7 +496,7 @@ def _add_format_options(parser, include_compression=True, include_binary=True): action="store_const", const="jsonl", dest="format", - help="Generate JSONL snapshot output for external consumers", + help="Generate newline-delimited JSON (JSONL) for programmatic consumers", ) if include_binary: format_group.add_argument( diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index ad5be46821c096..81ec6344ebdea4 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -37,13 +37,16 @@ def extract_lineno(location): """Extract lineno from location. Args: - location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None + location: tuple (lineno, end_lineno, col_offset, end_col_offset), + an integer line number, or None Returns: int: The line number (0 for synthetic frames) """ if location is None: return 0 + if isinstance(location, int): + return location return location[0] def _is_internal_frame(frame): diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 187c4175da6816..bcb98e75d91164 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -1,4 +1,70 @@ -"""JSONL collector.""" +"""JSON Lines (JSONL) collector for the sampling profiler. + +Emits a normalized newline-delimited JSON record stream suitable for +programmatic consumption by external tools, scripts, and agents. Each line +is one JSON object; consumers can parse the file incrementally line by +line, but the producer writes the whole file at the end of the run (it is +not a live/streaming producer). + +Record schema +============= + +Every record is a JSON object with at least ``"type"``, ``"v"`` (record +schema version), and ``"run_id"`` (UUID4 hex tagging the run; allows +demultiplexing concatenated streams). Records appear in this fixed order: + +1. ``meta`` (exactly one, first line):: + + {"type":"meta","v":1,"run_id":"", + "sample_interval_usec":,"mode":"wall|cpu|gil|all|exception"} + + ``mode`` is omitted when not provided. + +2. ``str_def`` (zero or more):: + + {"type":"str_def","v":1,"run_id":"", + "defs":[{"str_id":,"value":""}, ...]} + + Strings (filenames, function names) are interned to keep repeated values + compact. Each chunk holds up to ``_CHUNK_SIZE`` entries. + +3. ``frame_def`` (zero or more):: + + {"type":"frame_def","v":1,"run_id":"", + "defs":[{"frame_id":,"path_str_id":,"func_str_id":, + "line":,"end_line":,"col":,"end_col":, + "synthetic":true}, ...]} + + ``end_line``/``col``/``end_col`` are *omitted* when source location data + is unavailable (a missing key means "not available", not zero or null). + ``synthetic`` is present only on synthetic frames (for example, internal + marker frames whose source location is None) and absent otherwise. + +4. ``agg`` (zero or more):: + + {"type":"agg","v":1,"run_id":"","kind":"frame","scope":"final", + "samples_total":, + "entries":[{"frame_id":,"self":,"cumulative":}, ...]} + + ``self`` counts samples where the frame was the leaf (currently + executing); ``cumulative`` counts samples where the frame appeared + anywhere in the stack (deduped per sample so recursion does not + double-count). ``samples_total`` is the run-wide total, repeated on + each chunk so a streaming consumer always knows the denominator. + +5. ``end`` (exactly one, last line):: + + {"type":"end","v":1,"run_id":"","samples_total":} + + Presence of ``end`` is the consumer's signal that the file is complete. + +Forward compatibility +===================== + +Consumers MUST ignore unknown record ``"type"`` values and unknown object +fields. New fields will be added by adding optional keys; an incompatible +schema change will bump the per-record ``"v"``. +""" from collections import Counter import json @@ -28,7 +94,12 @@ class JsonlCollector(StackTraceCollector): - """Collector that exports finalized profiling data as JSONL.""" + """Collector that exports finalized profiling data as JSONL. + + See the module docstring for the full record schema. The collector + accumulates samples in memory and writes the complete file at + ``export()`` time. + """ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): super().__init__(sample_interval_usec, skip_idle=skip_idle) diff --git a/Lib/test/test_profiling/test_sampling_profiler/helpers.py b/Lib/test/test_profiling/test_sampling_profiler/helpers.py index 0e32d8dd9eabef..87bdf2c7f778a2 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/helpers.py +++ b/Lib/test/test_profiling/test_sampling_profiler/helpers.py @@ -174,3 +174,29 @@ def close_and_unlink(file): """Close a file and unlink it from the filesystem.""" file.close() unlink(file.name) + + +def jsonl_tables(records): + """Extract the canonical sections of a parsed JSONL profile. + + Returns ``(meta, str_defs, frame_defs, agg, end)`` where ``str_defs`` is a + ``{str_id: value}`` dict, ``frame_defs`` is a flat list of all frame + definitions across chunks, and ``agg`` is the first agg record (sufficient + for tests that only emit one chunk). + """ + meta = next(record for record in records if record["type"] == "meta") + end = next(record for record in records if record["type"] == "end") + agg = next(record for record in records if record["type"] == "agg") + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + return meta, str_defs, frame_defs, agg, end diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py index 29f83c843561cd..2d8b705be8c2ea 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -1,5 +1,6 @@ """Tests for binary format round-trip functionality.""" +import json import os import random import tempfile @@ -21,7 +22,7 @@ THREAD_STATUS_MAIN_THREAD, ) from profiling.sampling.binary_collector import BinaryCollector - from profiling.sampling.binary_reader import BinaryReader + from profiling.sampling.binary_reader import BinaryReader, convert_binary_to_format from profiling.sampling.gecko_collector import GeckoCollector ZSTD_AVAILABLE = _remote_debugging.zstd_available() @@ -30,6 +31,8 @@ "Test only runs when _remote_debugging is available" ) +from .helpers import jsonl_tables + def make_frame(filename, lineno, funcname, end_lineno=None, column=None, end_column=None, opcode=None): @@ -1211,5 +1214,70 @@ def test_timestamp_preservation_with_rle(self): self.assertEqual(ts_collector.all_timestamps, expected_timestamps) +class TestBinaryReplayToJsonl(BinaryFormatTestBase): + """Tests for binary -> JSONL replay via convert_binary_to_format.""" + + def _replay_to_jsonl(self, samples, interval=1000): + bin_path = self.create_binary_file(samples, interval=interval) + with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: + jsonl_path = f.name + self.temp_files.append(jsonl_path) + + convert_binary_to_format(bin_path, jsonl_path, "jsonl") + + with open(jsonl_path, "r", encoding="utf-8") as f: + return [json.loads(line) for line in f] + + def test_binary_replay_to_jsonl_basic(self): + """Replay a small .bin to JSONL: meta/end shape, samples_total, run_id.""" + frame = make_frame("hot.py", 99, "hot_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(5) + ] + records = self._replay_to_jsonl(samples, interval=2000) + meta, _, frame_defs, _, end = jsonl_tables(records) + + self.assertEqual(meta["sample_interval_usec"], 2000) + self.assertEqual(end["samples_total"], 5) + + run_ids = {r["run_id"] for r in records} + self.assertEqual(len(run_ids), 1) + self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") + + self.assertEqual(len(frame_defs), 1) + self.assertEqual(frame_defs[0]["line"], 99) + + def test_binary_replay_to_jsonl_rle_weight_propagation(self): + """RLE-batched identical samples land as a single agg entry with the right total.""" + frame = make_frame("rle.py", 42, "rle_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(50) + ] + records = self._replay_to_jsonl(samples) + _, _, _, agg, end = jsonl_tables(records) + + self.assertEqual(end["samples_total"], 50) + self.assertEqual(agg["entries"], [ + {"frame_id": 1, "self": 50, "cumulative": 50}, + ]) + + def test_binary_replay_to_jsonl_omits_unavailable_columns(self): + """Columns the binary recorder did not capture are omitted, not 0.""" + # make_frame defaults column/end_column to 0; pass column=-1 / end_column=-1 + # so the binary side records LOCATION_NOT_AVAILABLE. + frame = make_frame("nocol.py", 7, "no_col", column=-1, end_column=-1) + samples = [[make_interpreter(0, [make_thread(1, [frame])])]] + records = self._replay_to_jsonl(samples) + _, _, frame_defs, _, _ = jsonl_tables(records) + + self.assertEqual(len(frame_defs), 1) + fd = frame_defs[0] + self.assertEqual(fd["line"], 7) + self.assertNotIn("col", fd) + self.assertNotIn("end_col", fd) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_cli.py b/Lib/test/test_profiling/test_sampling_profiler/test_cli.py index c522c50d1fd5fa..9c0734ac804e1b 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_cli.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_cli.py @@ -1,6 +1,7 @@ """Tests for sampling profiler CLI argument parsing and functionality.""" import io +import json import os import subprocess import sys @@ -21,9 +22,19 @@ requires_remote_subprocess_debugging, ) -from profiling.sampling.cli import main -from profiling.sampling.constants import PROFILING_MODE_ALL, PROFILING_MODE_WALL +from profiling.sampling.cli import ( + FORMAT_EXTENSIONS, + _create_collector, + _generate_output_filename, + main, +) +from profiling.sampling.constants import ( + PROFILING_MODE_ALL, + PROFILING_MODE_CPU, + PROFILING_MODE_WALL, +) from profiling.sampling.errors import SamplingScriptNotFoundError, SamplingModuleNotFoundError, SamplingUnknownProcessError +from profiling.sampling.jsonl_collector import JsonlCollector class TestSampleProfilerCLI(unittest.TestCase): def _setup_sync_mocks(self, mock_socket, mock_popen): @@ -912,3 +923,65 @@ def test_cli_replay_reader_errors_exit_cleanly(self): str(cm.exception), "Error: Unsupported format version 2", ) + + def test_cli_jsonl_format_mutually_exclusive_with_pstats(self): + """--jsonl and --pstats cannot be combined (mutually exclusive group).""" + with ( + mock.patch( + "sys.argv", + [ + "profiling.sampling.cli", + "attach", + "12345", + "--jsonl", + "--pstats", + ], + ), + mock.patch("sys.stderr", io.StringIO()), + ): + with self.assertRaises(SystemExit): + main() + + def test_cli_jsonl_extension_in_format_extensions(self): + """FORMAT_EXTENSIONS maps 'jsonl' -> 'jsonl' so default filenames work.""" + self.assertEqual(FORMAT_EXTENSIONS["jsonl"], "jsonl") + self.assertEqual(_generate_output_filename("jsonl", 12345), "jsonl_12345.jsonl") + + def test_cli_jsonl_create_collector_propagates_mode(self): + """_create_collector('jsonl', ..., mode=X) lands X in the meta record.""" + collector = _create_collector( + "jsonl", + sample_interval_usec=1000, + skip_idle=False, + mode=PROFILING_MODE_CPU, + ) + self.assertIsInstance(collector, JsonlCollector) + + with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: + jsonl_path = f.name + self.addCleanup(os.unlink, jsonl_path) + collector.export(jsonl_path) + with open(jsonl_path, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + meta = next(r for r in records if r["type"] == "meta") + self.assertEqual(meta["mode"], "cpu") + + def test_cli_jsonl_rejects_opcodes_combination(self): + """--opcodes is incompatible with --jsonl per opcodes_compatible_formats.""" + test_args = [ + "profiling.sampling.cli", + "attach", + "12345", + "--jsonl", + "--opcodes", + ] + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + mock.patch("profiling.sampling.cli.sample"), + self.assertRaises(SystemExit) as cm, + ): + main() + + self.assertEqual(cm.exception.code, 2) + self.assertIn("--opcodes", mock_stderr.getvalue()) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 51c2eb73a20764..bd3461885281e8 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -39,7 +39,7 @@ from test.support import captured_stdout, captured_stderr from .mocks import MockFrameInfo, MockThreadInfo, MockInterpreterInfo, LocationInfo, make_diff_collector_with_mock_baseline -from .helpers import close_and_unlink +from .helpers import close_and_unlink, jsonl_tables def resolve_name(node, strings): @@ -58,25 +58,6 @@ def find_child_by_name(children, strings, substr): return None -def _jsonl_tables(records): - meta = next(record for record in records if record["type"] == "meta") - end = next(record for record in records if record["type"] == "end") - agg = next(record for record in records if record["type"] == "agg") - str_defs = { - item["str_id"]: item["value"] - for record in records - if record["type"] == "str_def" - for item in record["defs"] - } - frame_defs = [ - item - for record in records - if record["type"] == "frame_def" - for item in record["defs"] - ] - return meta, str_defs, frame_defs, agg, end - - class TestSampleProfilerComponents(unittest.TestCase): """Unit tests for individual profiler components.""" @@ -1834,7 +1815,7 @@ def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - _, _, frame_defs, agg_record, end_record = _jsonl_tables(records) + _, _, frame_defs, agg_record, end_record = jsonl_tables(records) self.assertEqual(len(frame_defs), 1) self.assertEqual( agg_record["entries"], @@ -1885,7 +1866,7 @@ def export_summary(skip_idle): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - _, str_defs, frame_defs, agg_record, _ = _jsonl_tables(records) + _, str_defs, frame_defs, agg_record, _ = jsonl_tables(records) paths = {str_defs[item["path_str_id"]] for item in frame_defs} funcs = {str_defs[item["func_str_id"]] for item in frame_defs} return paths, funcs, agg_record["samples_total"] @@ -1934,7 +1915,7 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): self.assertEqual(len(run_ids), 1) self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") - _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables( + _, str_defs, frame_defs, agg_record, end_record = jsonl_tables( records ) str_chunks = [ @@ -1959,6 +1940,122 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): self.assertEqual(agg_record["samples_total"], 257) self.assertEqual(end_record["samples_total"], 257) + def test_jsonl_collector_respects_weight_for_rle_batched_samples(self): + """weight>1 (from binary replay RLE) is honored in self/cumulative.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + leaf = MockFrameInfo("file.py", 10, "leaf") + non_leaf = MockFrameInfo("file.py", 20, "non_leaf") + + collector.process_frames([leaf, non_leaf], _thread_id=1, weight=5) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, agg, end = jsonl_tables(records) + self.assertEqual(end["samples_total"], 5) + self.assertEqual(agg["samples_total"], 5) + self.assertEqual( + {str_defs[fd["func_str_id"]]: fd["frame_id"] for fd in frame_defs}, + {"leaf": 1, "non_leaf": 2}, + ) + self.assertEqual(agg["entries"], [ + {"frame_id": 1, "self": 5, "cumulative": 5}, + {"frame_id": 2, "self": 0, "cumulative": 5}, + ]) + + def test_jsonl_collector_recursion_with_weight(self): + """Recursion dedup respects weight, not occurrence count.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + recursive = MockFrameInfo("rec.py", 10, "f") + + collector.process_frames([recursive] * 3, _thread_id=1, weight=3) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, agg, _ = jsonl_tables(records) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(agg["entries"], [ + {"frame_id": 1, "self": 3, "cumulative": 3}, + ]) + + def test_jsonl_collector_emits_col_and_end_col_when_present(self): + """All four location fields are emitted when col/end_col are >= 0.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + frame = MockFrameInfo("test.py", 0, "f") + frame.location = LocationInfo(42, 45, 4, 12) + frames = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, _, _ = jsonl_tables(records) + self.assertEqual(frame_defs, [ + { + "frame_id": 1, + "path_str_id": 2, + "func_str_id": 1, + "line": 42, + "end_line": 45, + "col": 4, + "end_col": 12, + }, + ]) + self.assertEqual(str_defs, {1: "f", 2: "test.py"}) + + def test_jsonl_collector_partial_location_elision(self): + """Negative col/end_col/end_line fields are individually elided.""" + # _get_or_create_frame_id interns funcname before filename, so + # func_str_id=1 ("f") and path_str_id=2 ("test.py"). + common = {"frame_id": 1, "path_str_id": 2, "func_str_id": 1} + cases = [ + (LocationInfo(42, 45, -1, 12), + {**common, "line": 42, "end_line": 45, "end_col": 12}), + (LocationInfo(42, 45, 4, -1), + {**common, "line": 42, "end_line": 45, "col": 4}), + (LocationInfo(42, 0, 4, 8), + {**common, "line": 42, "col": 4, "end_col": 8}), + ] + for loc, expected_frame_def in cases: + with self.subTest(location=loc): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + frame = MockFrameInfo("test.py", 0, "f") + frame.location = loc + frames = [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)], + ) + ] + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, _, _ = jsonl_tables(records) + self.assertEqual(frame_defs, [expected_frame_def]) + class TestRecursiveFunctionHandling(unittest.TestCase): """Tests for correct handling of recursive functions in cumulative stats.""" @@ -2168,6 +2265,15 @@ def test_extract_lineno_from_none(self): """Test extracting lineno from None (synthetic frames).""" self.assertEqual(extract_lineno(None), 0) + def test_extract_lineno_from_int(self): + """Test extracting lineno from a bare integer line number. + + Mirrors normalize_location's int contract so callers like the + collapsed/flamegraph collectors do not crash on a bare-int location. + """ + self.assertEqual(extract_lineno(42), 42) + self.assertEqual(extract_lineno(0), 0) + def test_normalize_location_with_int(self): """Test normalize_location expands a legacy integer line number.""" result = normalize_location(42) @@ -2384,7 +2490,7 @@ def test_jsonl_collector_with_location_info(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - meta, str_defs, frame_defs, agg, end = _jsonl_tables(records) + meta, str_defs, frame_defs, agg, end = jsonl_tables(records) self.assertEqual(meta["sample_interval_usec"], 1000) self.assertEqual(agg["samples_total"], 1) self.assertEqual(end["samples_total"], 1) @@ -2425,7 +2531,7 @@ def test_jsonl_collector_with_none_location(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - meta, str_defs, frame_defs, agg, end = _jsonl_tables(records) + meta, str_defs, frame_defs, agg, end = jsonl_tables(records) self.assertEqual(meta["sample_interval_usec"], 1000) self.assertEqual(agg["samples_total"], 1) self.assertEqual(end["samples_total"], 1) @@ -2675,7 +2781,7 @@ def test_jsonl_collector_frame_format(self): with open(f.name, "r", encoding="utf-8") as fp: records = [json.loads(line) for line in fp] - _, str_defs, frame_defs, _, _ = _jsonl_tables(records) + _, str_defs, frame_defs, _, _ = jsonl_tables(records) self.assertEqual(len(frame_defs), 3) @@ -2842,7 +2948,7 @@ def test_jsonl_collector_filters_internal_frames(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - _, str_defs, frame_defs, _, _ = _jsonl_tables(records) + _, str_defs, frame_defs, _, _ = jsonl_tables(records) paths = {str_defs[item["path_str_id"]] for item in frame_defs} diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst index f75102eeae16e6..636f45ae8d6c70 100644 --- a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -1,3 +1,4 @@ The ``profiling.sampling`` module now supports JSONL output format via -``--jsonl``. Each run emits newline-delimited JSON records suitable for -streaming or agents. Patch by Maurycy Pawłowski-Wieroński. +``--jsonl``. Each run emits a newline-delimited JSON file that is +sequentially parseable by external tools, scripts, and programmatic +consumers. Patch by Maurycy Pawłowski-Wieroński. diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c index 6c32ef70ac3f65..a3364ce913923e 100644 --- a/Modules/_remote_debugging/binary_io_reader.c +++ b/Modules/_remote_debugging/binary_io_reader.c @@ -785,9 +785,9 @@ build_frame_list(RemoteDebuggingState *state, BinaryReader *reader, if (frame->lineno != LOCATION_NOT_AVAILABLE) { location = Py_BuildValue("(iiii)", frame->lineno, - frame->end_lineno != LOCATION_NOT_AVAILABLE ? frame->end_lineno : frame->lineno, - frame->column != LOCATION_NOT_AVAILABLE ? frame->column : 0, - frame->end_column != LOCATION_NOT_AVAILABLE ? frame->end_column : 0); + frame->end_lineno, + frame->column, + frame->end_column); if (!location) { Py_DECREF(frame_info); goto error; From fb4a7c85cecbad71ac983a3639ebddde34999f4d Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 4 May 2026 23:57:36 +0100 Subject: [PATCH 41/41] fixup! fixup! claim credit! --- Lib/profiling/sampling/constants.py | 8 ++ Lib/profiling/sampling/jsonl_collector.py | 81 +++++++++---------- .../test_sampling_profiler/helpers.py | 8 +- .../test_binary_format.py | 2 +- .../test_sampling_profiler/test_collectors.py | 44 +++++----- 5 files changed, 73 insertions(+), 70 deletions(-) diff --git a/Lib/profiling/sampling/constants.py b/Lib/profiling/sampling/constants.py index a364d0b8fde1e0..d7c710f943b1b7 100644 --- a/Lib/profiling/sampling/constants.py +++ b/Lib/profiling/sampling/constants.py @@ -11,6 +11,14 @@ PROFILING_MODE_ALL = 3 # Combines GIL + CPU checks PROFILING_MODE_EXCEPTION = 4 # Only samples when thread has an active exception +PROFILING_MODE_NAMES = { + PROFILING_MODE_WALL: "wall", + PROFILING_MODE_CPU: "cpu", + PROFILING_MODE_GIL: "gil", + PROFILING_MODE_ALL: "all", + PROFILING_MODE_EXCEPTION: "exception", +} + # Sort mode constants SORT_MODE_NSAMPLES = 0 SORT_MODE_TOTTIME = 1 diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index bcb98e75d91164..7d26129b80de86 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -15,34 +15,36 @@ 1. ``meta`` (exactly one, first line):: - {"type":"meta","v":1,"run_id":"", + {"type":"meta","v":0,"run_id":"", "sample_interval_usec":,"mode":"wall|cpu|gil|all|exception"} ``mode`` is omitted when not provided. -2. ``str_def`` (zero or more):: +2. ``string_table`` (zero or more):: - {"type":"str_def","v":1,"run_id":"", - "defs":[{"str_id":,"value":""}, ...]} + {"type":"string_table","v":0,"run_id":"", + "strings":[{"str_id":,"value":""}, ...]} Strings (filenames, function names) are interned to keep repeated values - compact. Each chunk holds up to ``_CHUNK_SIZE`` entries. + compact. IDs are zero-based. Each chunk holds up to ``_CHUNK_SIZE`` + entries, and each entry carries its explicit ``str_id`` so consumers do + not need to infer offsets across chunks. -3. ``frame_def`` (zero or more):: +3. ``frame_table`` (zero or more):: - {"type":"frame_def","v":1,"run_id":"", - "defs":[{"frame_id":,"path_str_id":,"func_str_id":, - "line":,"end_line":,"col":,"end_col":, - "synthetic":true}, ...]} + {"type":"frame_table","v":0,"run_id":"", + "frames":[{"frame_id":,"path_str_id":,"func_str_id":, + "line":,"end_line":,"col":, + "end_col":}, ...]} ``end_line``/``col``/``end_col`` are *omitted* when source location data is unavailable (a missing key means "not available", not zero or null). - ``synthetic`` is present only on synthetic frames (for example, internal - marker frames whose source location is None) and absent otherwise. + ``line`` is ``0`` for synthetic frames (for example, internal marker + frames whose source location is None). Frame IDs are zero-based. 4. ``agg`` (zero or more):: - {"type":"agg","v":1,"run_id":"","kind":"frame","scope":"final", + {"type":"agg","v":0,"run_id":"","kind":"frame","scope":"final", "samples_total":, "entries":[{"frame_id":,"self":,"cumulative":}, ...]} @@ -54,7 +56,7 @@ 5. ``end`` (exactly one, last line):: - {"type":"end","v":1,"run_id":"","samples_total":} + {"type":"end","v":0,"run_id":"","samples_total":} Presence of ``end`` is the consumer's signal that the file is complete. @@ -71,26 +73,13 @@ import uuid from itertools import batched -from .constants import ( - PROFILING_MODE_ALL, - PROFILING_MODE_CPU, - PROFILING_MODE_EXCEPTION, - PROFILING_MODE_GIL, - PROFILING_MODE_WALL, -) +from .constants import PROFILING_MODE_NAMES from .collector import normalize_location from .stack_collector import StackTraceCollector _CHUNK_SIZE = 256 - -_MODE_NAMES = { - PROFILING_MODE_WALL: "wall", - PROFILING_MODE_CPU: "cpu", - PROFILING_MODE_GIL: "gil", - PROFILING_MODE_ALL: "all", - PROFILING_MODE_EXCEPTION: "exception", -} +_SCHEMA_VERSION = 0 class JsonlCollector(StackTraceCollector): @@ -143,21 +132,29 @@ def export(self, filename): self._write_message(output, self._build_meta_record()) self._write_chunked_records( output, - {"type": "str_def", "v": 1, "run_id": self.run_id}, - "defs", + { + "type": "string_table", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + }, + "strings", self._strings, ) self._write_chunked_records( output, - {"type": "frame_def", "v": 1, "run_id": self.run_id}, - "defs", + { + "type": "frame_table", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + }, + "frames", self._frames, ) self._write_chunked_records( output, { "type": "agg", - "v": 1, + "v": _SCHEMA_VERSION, "run_id": self.run_id, "kind": "frame", "scope": "final", @@ -171,20 +168,22 @@ def export(self, filename): def _build_meta_record(self): record = { "type": "meta", - "v": 1, + "v": _SCHEMA_VERSION, "run_id": self.run_id, "sample_interval_usec": self.sample_interval_usec, } if self._mode is not None: - record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode)) + record["mode"] = PROFILING_MODE_NAMES.get( + self._mode, str(self._mode) + ) return record def _build_end_record(self): record = { "type": "end", - "v": 1, + "v": _SCHEMA_VERSION, "run_id": self.run_id, "samples_total": self._samples_total, } @@ -201,7 +200,6 @@ def _iter_final_agg_entries(self): } def _get_or_create_frame_id(self, filename, location, funcname): - synthetic = location is None location_fields = self._location_to_export_fields(location) func_str_id = self._intern_string(funcname) path_str_id = self._intern_string(filename) @@ -213,21 +211,18 @@ def _get_or_create_frame_id(self, filename, location, funcname): location_fields.get("end_line"), location_fields.get("col"), location_fields.get("end_col"), - synthetic, ) if (frame_id := self._frame_to_id.get(frame_key)) is not None: return frame_id - frame_id = len(self._frames) + 1 + frame_id = len(self._frames) frame_record = { "frame_id": frame_id, "path_str_id": path_str_id, "func_str_id": func_str_id, **location_fields, } - if synthetic: - frame_record["synthetic"] = True self._frame_to_id[frame_key] = frame_id self._frames.append(frame_record) @@ -239,7 +234,7 @@ def _intern_string(self, value): if (string_id := self._string_to_id.get(value)) is not None: return string_id - string_id = len(self._strings) + 1 + string_id = len(self._strings) self._string_to_id[value] = string_id self._strings.append({"str_id": string_id, "value": value}) return string_id diff --git a/Lib/test/test_profiling/test_sampling_profiler/helpers.py b/Lib/test/test_profiling/test_sampling_profiler/helpers.py index 87bdf2c7f778a2..b07776d415bb29 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/helpers.py +++ b/Lib/test/test_profiling/test_sampling_profiler/helpers.py @@ -190,13 +190,13 @@ def jsonl_tables(records): str_defs = { item["str_id"]: item["value"] for record in records - if record["type"] == "str_def" - for item in record["defs"] + if record["type"] == "string_table" + for item in record["strings"] } frame_defs = [ item for record in records - if record["type"] == "frame_def" - for item in record["defs"] + if record["type"] == "frame_table" + for item in record["frames"] ] return meta, str_defs, frame_defs, agg, end diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py index 2d8b705be8c2ea..fd674589b3135d 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -1260,7 +1260,7 @@ def test_binary_replay_to_jsonl_rle_weight_propagation(self): self.assertEqual(end["samples_total"], 50) self.assertEqual(agg["entries"], [ - {"frame_id": 1, "self": 50, "cumulative": 50}, + {"frame_id": 0, "self": 50, "cumulative": 50}, ]) def test_binary_replay_to_jsonl_omits_unavailable_columns(self): diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index bd3461885281e8..b42e7aa579f40c 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1728,11 +1728,11 @@ def test_jsonl_collector_export_exact_output(self): self.assertEqual( content, ( - '{"type":"meta","v":1,"run_id":"run-123","sample_interval_usec":1000}\n' - '{"type":"str_def","v":1,"run_id":"run-123","defs":[{"str_id":1,"value":"func1"},{"str_id":2,"value":"file.py"},{"str_id":3,"value":"func2"},{"str_id":4,"value":"other_func"},{"str_id":5,"value":"other.py"}]}\n' - '{"type":"frame_def","v":1,"run_id":"run-123","defs":[{"frame_id":1,"path_str_id":2,"func_str_id":1,"line":10,"end_line":10},{"frame_id":2,"path_str_id":2,"func_str_id":3,"line":20,"end_line":20},{"frame_id":3,"path_str_id":5,"func_str_id":4,"line":5,"end_line":5}]}\n' - '{"type":"agg","v":1,"run_id":"run-123","kind":"frame","scope":"final","samples_total":3,"entries":[{"frame_id":1,"self":2,"cumulative":2},{"frame_id":2,"self":0,"cumulative":2},{"frame_id":3,"self":1,"cumulative":1}]}\n' - '{"type":"end","v":1,"run_id":"run-123","samples_total":3}\n' + '{"type":"meta","v":0,"run_id":"run-123","sample_interval_usec":1000}\n' + '{"type":"string_table","v":0,"run_id":"run-123","strings":[{"str_id":0,"value":"func1"},{"str_id":1,"value":"file.py"},{"str_id":2,"value":"func2"},{"str_id":3,"value":"other_func"},{"str_id":4,"value":"other.py"}]}\n' + '{"type":"frame_table","v":0,"run_id":"run-123","frames":[{"frame_id":0,"path_str_id":1,"func_str_id":0,"line":10,"end_line":10},{"frame_id":1,"path_str_id":1,"func_str_id":2,"line":20,"end_line":20},{"frame_id":2,"path_str_id":4,"func_str_id":3,"line":5,"end_line":5}]}\n' + '{"type":"agg","v":0,"run_id":"run-123","kind":"frame","scope":"final","samples_total":3,"entries":[{"frame_id":0,"self":2,"cumulative":2},{"frame_id":1,"self":0,"cumulative":2},{"frame_id":2,"self":1,"cumulative":1}]}\n' + '{"type":"end","v":0,"run_id":"run-123","samples_total":3}\n' ), ) @@ -1919,18 +1919,19 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): records ) str_chunks = [ - record for record in records if record["type"] == "str_def" + record for record in records if record["type"] == "string_table" ] frame_chunks = [ - record for record in records if record["type"] == "frame_def" + record for record in records if record["type"] == "frame_table" ] agg_chunks = [record for record in records if record["type"] == "agg"] self.assertEqual( - [len(record["defs"]) for record in str_chunks], [256, 256, 2] + [len(record["strings"]) for record in str_chunks], + [256, 256, 2], ) self.assertEqual( - [len(record["defs"]) for record in frame_chunks], [256, 1] + [len(record["frames"]) for record in frame_chunks], [256, 1] ) self.assertEqual( [len(record["entries"]) for record in agg_chunks], [256, 1] @@ -1960,11 +1961,11 @@ def test_jsonl_collector_respects_weight_for_rle_batched_samples(self): self.assertEqual(agg["samples_total"], 5) self.assertEqual( {str_defs[fd["func_str_id"]]: fd["frame_id"] for fd in frame_defs}, - {"leaf": 1, "non_leaf": 2}, + {"leaf": 0, "non_leaf": 1}, ) self.assertEqual(agg["entries"], [ - {"frame_id": 1, "self": 5, "cumulative": 5}, - {"frame_id": 2, "self": 0, "cumulative": 5}, + {"frame_id": 0, "self": 5, "cumulative": 5}, + {"frame_id": 1, "self": 0, "cumulative": 5}, ]) def test_jsonl_collector_recursion_with_weight(self): @@ -1984,7 +1985,7 @@ def test_jsonl_collector_recursion_with_weight(self): _, _, frame_defs, agg, _ = jsonl_tables(records) self.assertEqual(len(frame_defs), 1) self.assertEqual(agg["entries"], [ - {"frame_id": 1, "self": 3, "cumulative": 3}, + {"frame_id": 0, "self": 3, "cumulative": 3}, ]) def test_jsonl_collector_emits_col_and_end_col_when_present(self): @@ -2009,22 +2010,22 @@ def test_jsonl_collector_emits_col_and_end_col_when_present(self): _, str_defs, frame_defs, _, _ = jsonl_tables(records) self.assertEqual(frame_defs, [ { - "frame_id": 1, - "path_str_id": 2, - "func_str_id": 1, + "frame_id": 0, + "path_str_id": 1, + "func_str_id": 0, "line": 42, "end_line": 45, "col": 4, "end_col": 12, }, ]) - self.assertEqual(str_defs, {1: "f", 2: "test.py"}) + self.assertEqual(str_defs, {0: "f", 1: "test.py"}) def test_jsonl_collector_partial_location_elision(self): """Negative col/end_col/end_line fields are individually elided.""" # _get_or_create_frame_id interns funcname before filename, so - # func_str_id=1 ("f") and path_str_id=2 ("test.py"). - common = {"frame_id": 1, "path_str_id": 2, "func_str_id": 1} + # func_str_id=0 ("f") and path_str_id=1 ("test.py"). + common = {"frame_id": 0, "path_str_id": 1, "func_str_id": 0} cases = [ (LocationInfo(42, 45, -1, 12), {**common, "line": 42, "end_line": 45, "end_col": 12}), @@ -2500,7 +2501,7 @@ def test_jsonl_collector_with_location_info(self): self.assertEqual( frame_defs[0], { - "frame_id": 1, + "frame_id": 0, "path_str_id": frame_defs[0]["path_str_id"], "func_str_id": frame_defs[0]["func_str_id"], "line": 42, @@ -2541,11 +2542,10 @@ def test_jsonl_collector_with_none_location(self): self.assertEqual( frame_defs[0], { - "frame_id": 1, + "frame_id": 0, "path_str_id": frame_defs[0]["path_str_id"], "func_str_id": frame_defs[0]["func_str_id"], "line": 0, - "synthetic": True, }, )