diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 6a0bb5e5c2f387..71579a3903253e 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,6 +9,15 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .jsonl_collector import JsonlCollector from .string_table import StringTable -__all__ = ("Collector", "PstatsCollector", "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", "StringTable") +__all__ = ( + "Collector", + "PstatsCollector", + "CollapsedStackCollector", + "HeatmapCollector", + "GeckoCollector", + "JsonlCollector", + "StringTable", +) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index a11be3652597a6..a29dad91ae339d 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,6 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector +from .jsonl_collector import JsonlCollector from .pstats_collector import PstatsCollector @@ -117,6 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) + elif output_format == "jsonl": + collector = JsonlCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 9900415ae8a927..0648713edc52af 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -20,6 +20,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector, DiffFlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .jsonl_collector import JsonlCollector from .binary_collector 
import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -101,6 +102,7 @@ def __call__(self, parser, namespace, values, option_string=None): "diff_flamegraph": "html", "gecko": "json", "heatmap": "html", + "jsonl": "jsonl", "binary": "bin", } @@ -111,6 +113,7 @@ def __call__(self, parser, namespace, values, option_string=None): "diff_flamegraph": DiffFlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "jsonl": JsonlCollector, "binary": BinaryCollector, } @@ -488,6 +491,13 @@ def _add_format_options(parser, include_compression=True, include_binary=True): action=DiffFlamegraphAction, help="Generate differential flamegraph comparing current profile to `BASELINE` binary file", ) + format_group.add_argument( + "--jsonl", + action="store_const", + const="jsonl", + dest="format", + help="Generate newline-delimited JSON (JSONL) for programmatic consumers", + ) if include_binary: format_group.add_argument( "--binary", @@ -611,15 +621,18 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=False, - output_file=None, compression='auto', diff_baseline=None): + mode=None, output_file=None, compression='auto', + diff_baseline=None): """Create the appropriate collector based on format type. 
Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary', 'diff_flamegraph') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', + 'gecko', 'heatmap', 'jsonl', 'binary', 'diff_flamegraph') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + mode: Profiling mode for collectors that expose it in metadata output_file: Output file path (required for binary format) compression: Compression type for binary format ('auto', 'zstd', 'none') diff_baseline: Path to baseline binary file for differential flamegraph @@ -655,6 +668,11 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) + if format_type == "jsonl": + return collector_class( + sample_interval_usec, skip_idle=skip_idle, mode=mode + ) + return collector_class(sample_interval_usec, skip_idle=skip_idle) @@ -1142,7 +1160,7 @@ def _handle_attach(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto'), diff_baseline=args.diff_baseline @@ -1249,7 +1267,7 @@ def _handle_run(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto'), diff_baseline=args.diff_baseline diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index 08759b611696b7..81ec6344ebdea4 100644 --- 
a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -20,13 +20,16 @@ def normalize_location(location): """Normalize location to a 4-tuple format. Args: - location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None + location: tuple (lineno, end_lineno, col_offset, end_col_offset), + an integer line number, or None Returns: tuple: (lineno, end_lineno, col_offset, end_col_offset) """ if location is None: return DEFAULT_LOCATION + if isinstance(location, int): + return (location, location, -1, -1) return location @@ -34,13 +37,16 @@ def extract_lineno(location): """Extract lineno from location. Args: - location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None + location: tuple (lineno, end_lineno, col_offset, end_col_offset), + an integer line number, or None Returns: int: The line number (0 for synthetic frames) """ if location is None: return 0 + if isinstance(location, int): + return location return location[0] def _is_internal_frame(frame): diff --git a/Lib/profiling/sampling/constants.py b/Lib/profiling/sampling/constants.py index a364d0b8fde1e0..d7c710f943b1b7 100644 --- a/Lib/profiling/sampling/constants.py +++ b/Lib/profiling/sampling/constants.py @@ -11,6 +11,14 @@ PROFILING_MODE_ALL = 3 # Combines GIL + CPU checks PROFILING_MODE_EXCEPTION = 4 # Only samples when thread has an active exception +PROFILING_MODE_NAMES = { + PROFILING_MODE_WALL: "wall", + PROFILING_MODE_CPU: "cpu", + PROFILING_MODE_GIL: "gil", + PROFILING_MODE_ALL: "all", + PROFILING_MODE_EXCEPTION: "exception", +} + # Sort mode constants SORT_MODE_NSAMPLES = 0 SORT_MODE_TOTTIME = 1 diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py new file mode 100644 index 00000000000000..7d26129b80de86 --- /dev/null +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -0,0 +1,266 @@ +"""JSON Lines (JSONL) collector for the sampling profiler. 
+ +Emits a normalized newline-delimited JSON record stream suitable for +programmatic consumption by external tools, scripts, and agents. Each line +is one JSON object; consumers can parse the file incrementally line by +line, but the producer writes the whole file at the end of the run (it is +not a live/streaming producer). + +Record schema +============= + +Every record is a JSON object with at least ``"type"``, ``"v"`` (record +schema version), and ``"run_id"`` (UUID4 hex tagging the run; allows +demultiplexing concatenated streams). Records appear in this fixed order: + +1. ``meta`` (exactly one, first line):: + + {"type":"meta","v":0,"run_id":"<hex>", + "sample_interval_usec":<int>,"mode":"wall|cpu|gil|all|exception"} + + ``mode`` is omitted when not provided. + +2. ``string_table`` (zero or more):: + + {"type":"string_table","v":0,"run_id":"<hex>", + "strings":[{"str_id":<int>,"value":"<str>"}, ...]} + + Strings (filenames, function names) are interned to keep repeated values + compact. IDs are zero-based. Each chunk holds up to ``_CHUNK_SIZE`` + entries, and each entry carries its explicit ``str_id`` so consumers do + not need to infer offsets across chunks. + +3. ``frame_table`` (zero or more):: + + {"type":"frame_table","v":0,"run_id":"<hex>", + "frames":[{"frame_id":<int>,"path_str_id":<int>,"func_str_id":<int>, + "line":<int>,"end_line":<int>,"col":<int>, + "end_col":<int>}, ...]} + + ``end_line``/``col``/``end_col`` are *omitted* when source location data + is unavailable (a missing key means "not available", not zero or null). + ``line`` is ``0`` for synthetic frames (for example, internal marker + frames whose source location is None). Frame IDs are zero-based. + +4. 
``agg`` (zero or more):: + + {"type":"agg","v":0,"run_id":"<hex>","kind":"frame","scope":"final", + "samples_total":<int>, + "entries":[{"frame_id":<int>,"self":<int>,"cumulative":<int>}, ...]} + + ``self`` counts samples where the frame was the leaf (currently + executing); ``cumulative`` counts samples where the frame appeared + anywhere in the stack (deduped per sample so recursion does not + double-count). ``samples_total`` is the run-wide total, repeated on + each chunk so a streaming consumer always knows the denominator. + +5. ``end`` (exactly one, last line):: + + {"type":"end","v":0,"run_id":"<hex>","samples_total":<int>} + + Presence of ``end`` is the consumer's signal that the file is complete. + +Forward compatibility +===================== + +Consumers MUST ignore unknown record ``"type"`` values and unknown object +fields. New fields will be added by adding optional keys; an incompatible +schema change will bump the per-record ``"v"``. +""" + +from collections import Counter +import json +import uuid +from itertools import batched + +from .constants import PROFILING_MODE_NAMES +from .collector import normalize_location +from .stack_collector import StackTraceCollector + + +_CHUNK_SIZE = 256 +_SCHEMA_VERSION = 0 + + +class JsonlCollector(StackTraceCollector): + """Collector that exports finalized profiling data as JSONL. + + See the module docstring for the full record schema. The collector + accumulates samples in memory and writes the complete file at + ``export()`` time. 
+ """ + + def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): + super().__init__(sample_interval_usec, skip_idle=skip_idle) + self.run_id = uuid.uuid4().hex + + self._string_to_id = {} + self._strings = [] + + self._frame_to_id = {} + self._frames = [] + + self._frame_self = Counter() + self._frame_cumulative = Counter() + self._samples_total = 0 + self._seen_frame_ids = set() + + self._mode = mode + + def process_frames(self, frames, _thread_id, weight=1): + self._samples_total += weight + self._seen_frame_ids.clear() + + for i, (filename, location, funcname, _opcode) in enumerate(frames): + frame_id = self._get_or_create_frame_id( + filename, location, funcname + ) + is_leaf = i == 0 + count_cumulative = frame_id not in self._seen_frame_ids + + if count_cumulative: + self._seen_frame_ids.add(frame_id) + + if is_leaf: + self._frame_self[frame_id] += weight + + if count_cumulative: + self._frame_cumulative[frame_id] += weight + + def export(self, filename): + with open(filename, "w", encoding="utf-8") as output: + self._write_message(output, self._build_meta_record()) + self._write_chunked_records( + output, + { + "type": "string_table", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + }, + "strings", + self._strings, + ) + self._write_chunked_records( + output, + { + "type": "frame_table", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + }, + "frames", + self._frames, + ) + self._write_chunked_records( + output, + { + "type": "agg", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + "kind": "frame", + "scope": "final", + "samples_total": self._samples_total, + }, + "entries", + self._iter_final_agg_entries(), + ) + self._write_message(output, self._build_end_record()) + + def _build_meta_record(self): + record = { + "type": "meta", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + "sample_interval_usec": self.sample_interval_usec, + } + + if self._mode is not None: + record["mode"] = PROFILING_MODE_NAMES.get( + self._mode, str(self._mode) 
+ ) + + return record + + def _build_end_record(self): + record = { + "type": "end", + "v": _SCHEMA_VERSION, + "run_id": self.run_id, + "samples_total": self._samples_total, + } + + return record + + def _iter_final_agg_entries(self): + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + yield { + "frame_id": frame_id, + "self": self._frame_self[frame_id], + "cumulative": self._frame_cumulative[frame_id], + } + + def _get_or_create_frame_id(self, filename, location, funcname): + location_fields = self._location_to_export_fields(location) + func_str_id = self._intern_string(funcname) + path_str_id = self._intern_string(filename) + + frame_key = ( + path_str_id, + func_str_id, + location_fields["line"], + location_fields.get("end_line"), + location_fields.get("col"), + location_fields.get("end_col"), + ) + + if (frame_id := self._frame_to_id.get(frame_key)) is not None: + return frame_id + + frame_id = len(self._frames) + frame_record = { + "frame_id": frame_id, + "path_str_id": path_str_id, + "func_str_id": func_str_id, + **location_fields, + } + + self._frame_to_id[frame_key] = frame_id + self._frames.append(frame_record) + return frame_id + + def _intern_string(self, value): + value = str(value) + + if (string_id := self._string_to_id.get(value)) is not None: + return string_id + + string_id = len(self._strings) + self._string_to_id[value] = string_id + self._strings.append({"str_id": string_id, "value": value}) + return string_id + + @staticmethod + def _location_to_export_fields(location): + lineno, end_lineno, col_offset, end_col_offset = normalize_location( + location + ) + + fields = {"line": lineno} + if end_lineno > 0: + fields["end_line"] = end_lineno + if col_offset >= 0: + fields["col"] = col_offset + if end_col_offset >= 0: + fields["end_col"] = end_col_offset + return fields + + def _write_chunked_records( + self, output, base_record, chunk_field, entries + ): + for chunk in batched(entries, _CHUNK_SIZE): + 
self._write_message(output, {**base_record, chunk_field: chunk}) + + @staticmethod + def _write_message(output, record): + output.write(json.dumps(record, separators=(",", ":"))) + output.write("\n") diff --git a/Lib/test/test_profiling/test_sampling_profiler/helpers.py b/Lib/test/test_profiling/test_sampling_profiler/helpers.py index 0e32d8dd9eabef..b07776d415bb29 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/helpers.py +++ b/Lib/test/test_profiling/test_sampling_profiler/helpers.py @@ -174,3 +174,29 @@ def close_and_unlink(file): """Close a file and unlink it from the filesystem.""" file.close() unlink(file.name) + + +def jsonl_tables(records): + """Extract the canonical sections of a parsed JSONL profile. + + Returns ``(meta, str_defs, frame_defs, agg, end)`` where ``str_defs`` is a + ``{str_id: value}`` dict, ``frame_defs`` is a flat list of all frame + definitions across chunks, and ``agg`` is the first agg record (sufficient + for tests that only emit one chunk). + """ + meta = next(record for record in records if record["type"] == "meta") + end = next(record for record in records if record["type"] == "end") + agg = next(record for record in records if record["type"] == "agg") + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "string_table" + for item in record["strings"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_table" + for item in record["frames"] + ] + return meta, str_defs, frame_defs, agg, end diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py index 29f83c843561cd..fd674589b3135d 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -1,5 +1,6 @@ """Tests for binary format round-trip functionality.""" +import json import os import random import tempfile 
@@ -21,7 +22,7 @@ THREAD_STATUS_MAIN_THREAD, ) from profiling.sampling.binary_collector import BinaryCollector - from profiling.sampling.binary_reader import BinaryReader + from profiling.sampling.binary_reader import BinaryReader, convert_binary_to_format from profiling.sampling.gecko_collector import GeckoCollector ZSTD_AVAILABLE = _remote_debugging.zstd_available() @@ -30,6 +31,8 @@ "Test only runs when _remote_debugging is available" ) +from .helpers import jsonl_tables + def make_frame(filename, lineno, funcname, end_lineno=None, column=None, end_column=None, opcode=None): @@ -1211,5 +1214,70 @@ def test_timestamp_preservation_with_rle(self): self.assertEqual(ts_collector.all_timestamps, expected_timestamps) +class TestBinaryReplayToJsonl(BinaryFormatTestBase): + """Tests for binary -> JSONL replay via convert_binary_to_format.""" + + def _replay_to_jsonl(self, samples, interval=1000): + bin_path = self.create_binary_file(samples, interval=interval) + with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: + jsonl_path = f.name + self.temp_files.append(jsonl_path) + + convert_binary_to_format(bin_path, jsonl_path, "jsonl") + + with open(jsonl_path, "r", encoding="utf-8") as f: + return [json.loads(line) for line in f] + + def test_binary_replay_to_jsonl_basic(self): + """Replay a small .bin to JSONL: meta/end shape, samples_total, run_id.""" + frame = make_frame("hot.py", 99, "hot_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(5) + ] + records = self._replay_to_jsonl(samples, interval=2000) + meta, _, frame_defs, _, end = jsonl_tables(records) + + self.assertEqual(meta["sample_interval_usec"], 2000) + self.assertEqual(end["samples_total"], 5) + + run_ids = {r["run_id"] for r in records} + self.assertEqual(len(run_ids), 1) + self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") + + self.assertEqual(len(frame_defs), 1) + self.assertEqual(frame_defs[0]["line"], 99) + + def 
test_binary_replay_to_jsonl_rle_weight_propagation(self): + """RLE-batched identical samples land as a single agg entry with the right total.""" + frame = make_frame("rle.py", 42, "rle_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(50) + ] + records = self._replay_to_jsonl(samples) + _, _, _, agg, end = jsonl_tables(records) + + self.assertEqual(end["samples_total"], 50) + self.assertEqual(agg["entries"], [ + {"frame_id": 0, "self": 50, "cumulative": 50}, + ]) + + def test_binary_replay_to_jsonl_omits_unavailable_columns(self): + """Columns the binary recorder did not capture are omitted, not 0.""" + # make_frame defaults column/end_column to 0; pass column=-1 / end_column=-1 + # so the binary side records LOCATION_NOT_AVAILABLE. + frame = make_frame("nocol.py", 7, "no_col", column=-1, end_column=-1) + samples = [[make_interpreter(0, [make_thread(1, [frame])])]] + records = self._replay_to_jsonl(samples) + _, _, frame_defs, _, _ = jsonl_tables(records) + + self.assertEqual(len(frame_defs), 1) + fd = frame_defs[0] + self.assertEqual(fd["line"], 7) + self.assertNotIn("col", fd) + self.assertNotIn("end_col", fd) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_cli.py b/Lib/test/test_profiling/test_sampling_profiler/test_cli.py index c522c50d1fd5fa..9c0734ac804e1b 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_cli.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_cli.py @@ -1,6 +1,7 @@ """Tests for sampling profiler CLI argument parsing and functionality.""" import io +import json import os import subprocess import sys @@ -21,9 +22,19 @@ requires_remote_subprocess_debugging, ) -from profiling.sampling.cli import main -from profiling.sampling.constants import PROFILING_MODE_ALL, PROFILING_MODE_WALL +from profiling.sampling.cli import ( + FORMAT_EXTENSIONS, + _create_collector, + _generate_output_filename, + main, +) +from 
profiling.sampling.constants import ( + PROFILING_MODE_ALL, + PROFILING_MODE_CPU, + PROFILING_MODE_WALL, +) from profiling.sampling.errors import SamplingScriptNotFoundError, SamplingModuleNotFoundError, SamplingUnknownProcessError +from profiling.sampling.jsonl_collector import JsonlCollector class TestSampleProfilerCLI(unittest.TestCase): def _setup_sync_mocks(self, mock_socket, mock_popen): @@ -912,3 +923,65 @@ def test_cli_replay_reader_errors_exit_cleanly(self): str(cm.exception), "Error: Unsupported format version 2", ) + + def test_cli_jsonl_format_mutually_exclusive_with_pstats(self): + """--jsonl and --pstats cannot be combined (mutually exclusive group).""" + with ( + mock.patch( + "sys.argv", + [ + "profiling.sampling.cli", + "attach", + "12345", + "--jsonl", + "--pstats", + ], + ), + mock.patch("sys.stderr", io.StringIO()), + ): + with self.assertRaises(SystemExit): + main() + + def test_cli_jsonl_extension_in_format_extensions(self): + """FORMAT_EXTENSIONS maps 'jsonl' -> 'jsonl' so default filenames work.""" + self.assertEqual(FORMAT_EXTENSIONS["jsonl"], "jsonl") + self.assertEqual(_generate_output_filename("jsonl", 12345), "jsonl_12345.jsonl") + + def test_cli_jsonl_create_collector_propagates_mode(self): + """_create_collector('jsonl', ..., mode=X) lands X in the meta record.""" + collector = _create_collector( + "jsonl", + sample_interval_usec=1000, + skip_idle=False, + mode=PROFILING_MODE_CPU, + ) + self.assertIsInstance(collector, JsonlCollector) + + with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: + jsonl_path = f.name + self.addCleanup(os.unlink, jsonl_path) + collector.export(jsonl_path) + with open(jsonl_path, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + meta = next(r for r in records if r["type"] == "meta") + self.assertEqual(meta["mode"], "cpu") + + def test_cli_jsonl_rejects_opcodes_combination(self): + """--opcodes is incompatible with --jsonl per opcodes_compatible_formats.""" + 
test_args = [ + "profiling.sampling.cli", + "attach", + "12345", + "--jsonl", + "--opcodes", + ] + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + mock.patch("profiling.sampling.cli.sample"), + self.assertRaises(SystemExit) as cm, + ): + main() + + self.assertEqual(cm.exception.code, 2) + self.assertIn("--opcodes", mock_stderr.getvalue()) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 240ec8a195c43b..b42e7aa579f40c 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -16,6 +16,7 @@ CollapsedStackCollector, FlamegraphCollector, ) + from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.gecko_collector import GeckoCollector from profiling.sampling.collector import extract_lineno, normalize_location from profiling.sampling.opcode_utils import get_opcode_info, format_opcode @@ -38,7 +39,7 @@ from test.support import captured_stdout, captured_stderr from .mocks import MockFrameInfo, MockThreadInfo, MockInterpreterInfo, LocationInfo, make_diff_collector_with_mock_baseline -from .helpers import close_and_unlink +from .helpers import close_and_unlink, jsonl_tables def resolve_name(node, strings): @@ -1669,6 +1670,393 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) + def test_jsonl_collector_export_exact_output(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.run_id = "run-123" + + test_frames1 = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("file.py", 10, "func1"), + MockFrameInfo("file.py", 20, "func2"), + ], + ) + ], + ) + ] + test_frames2 = [ 
+ MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("file.py", 10, "func1"), + MockFrameInfo("file.py", 20, "func2"), + ], + ) + ], + ) + ] # Same stack + test_frames3 = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("other.py", 5, "other_func")] + ) + ], + ) + ] + + collector.collect(test_frames1) + collector.collect(test_frames2) + collector.collect(test_frames3) + + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + content = f.read() + + self.assertEqual( + content, + ( + '{"type":"meta","v":0,"run_id":"run-123","sample_interval_usec":1000}\n' + '{"type":"string_table","v":0,"run_id":"run-123","strings":[{"str_id":0,"value":"func1"},{"str_id":1,"value":"file.py"},{"str_id":2,"value":"func2"},{"str_id":3,"value":"other_func"},{"str_id":4,"value":"other.py"}]}\n' + '{"type":"frame_table","v":0,"run_id":"run-123","frames":[{"frame_id":0,"path_str_id":1,"func_str_id":0,"line":10,"end_line":10},{"frame_id":1,"path_str_id":1,"func_str_id":2,"line":20,"end_line":20},{"frame_id":2,"path_str_id":4,"func_str_id":3,"line":5,"end_line":5}]}\n' + '{"type":"agg","v":0,"run_id":"run-123","kind":"frame","scope":"final","samples_total":3,"entries":[{"frame_id":0,"self":2,"cumulative":2},{"frame_id":1,"self":0,"cumulative":2},{"frame_id":2,"self":1,"cumulative":1}]}\n' + '{"type":"end","v":0,"run_id":"run-123","samples_total":3}\n' + ), + ) + + def test_jsonl_collector_export_includes_mode_in_meta(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000, mode=PROFILING_MODE_CPU) + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func")] + ) + ], + ) + ] + ) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + meta_record = next( + record for record in 
records if record["type"] == "meta" + ) + self.assertEqual(meta_record["mode"], "cpu") + + def test_jsonl_collector_export_empty_profile(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.run_id = "run-123" + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + self.assertEqual( + [record["type"] for record in records], ["meta", "end"] + ) + self.assertEqual(records[0]["sample_interval_usec"], 1000) + self.assertEqual(records[0]["run_id"], "run-123") + self.assertEqual(records[1]["samples_total"], 0) + self.assertEqual(records[1]["run_id"], "run-123") + + def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + ], + ) + ], + ) + ] + ) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, agg_record, end_record = jsonl_tables(records) + self.assertEqual(len(frame_defs), 1) + self.assertEqual( + agg_record["entries"], + [ + { + "frame_id": frame_defs[0]["frame_id"], + "self": 1, + "cumulative": 1, + } + ], + ) + self.assertEqual(agg_record["samples_total"], 1) + self.assertEqual(end_record["samples_total"], 1) + + def test_jsonl_collector_skip_idle_filters_threads(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + active_status = THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU + frames = [ + 
MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [MockFrameInfo("active1.py", 10, "active_func1")], + status=active_status, + ), + MockThreadInfo( + 2, + [MockFrameInfo("idle.py", 20, "idle_func")], + status=0, + ), + MockThreadInfo( + 3, + [MockFrameInfo("active2.py", 30, "active_func2")], + status=active_status, + ), + ], + ) + ] + + def export_summary(skip_idle): + collector = JsonlCollector(1000, skip_idle=skip_idle) + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, agg_record, _ = jsonl_tables(records) + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + funcs = {str_defs[item["func_str_id"]] for item in frame_defs} + return paths, funcs, agg_record["samples_total"] + + paths, funcs, samples_total = export_summary(skip_idle=True) + self.assertEqual(paths, {"active1.py", "active2.py"}) + self.assertEqual(funcs, {"active_func1", "active_func2"}) + self.assertEqual(samples_total, 2) + + paths, funcs, samples_total = export_summary(skip_idle=False) + self.assertEqual(paths, {"active1.py", "idle.py", "active2.py"}) + self.assertEqual(funcs, {"active_func1", "idle_func", "active_func2"}) + self.assertEqual(samples_total, 3) + + def test_jsonl_collector_splits_large_exports_into_chunks(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + + for i in range(257): + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo( + f"file{i}.py", i + 1, f"func{i}" + ) + ], + ) + ], + ) + ] + ) + + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + run_ids = {record["run_id"] for record in records} + self.assertEqual(len(run_ids), 1) + self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") + 
+ _, str_defs, frame_defs, agg_record, end_record = jsonl_tables( + records + ) + str_chunks = [ + record for record in records if record["type"] == "string_table" + ] + frame_chunks = [ + record for record in records if record["type"] == "frame_table" + ] + agg_chunks = [record for record in records if record["type"] == "agg"] + + self.assertEqual( + [len(record["strings"]) for record in str_chunks], + [256, 256, 2], + ) + self.assertEqual( + [len(record["frames"]) for record in frame_chunks], [256, 1] + ) + self.assertEqual( + [len(record["entries"]) for record in agg_chunks], [256, 1] + ) + self.assertEqual(len(str_defs), 514) + self.assertEqual(len(frame_defs), 257) + self.assertEqual(agg_record["samples_total"], 257) + self.assertEqual(end_record["samples_total"], 257) + + def test_jsonl_collector_respects_weight_for_rle_batched_samples(self): + """weight>1 (from binary replay RLE) is honored in self/cumulative.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + leaf = MockFrameInfo("file.py", 10, "leaf") + non_leaf = MockFrameInfo("file.py", 20, "non_leaf") + + collector.process_frames([leaf, non_leaf], _thread_id=1, weight=5) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, agg, end = jsonl_tables(records) + self.assertEqual(end["samples_total"], 5) + self.assertEqual(agg["samples_total"], 5) + self.assertEqual( + {str_defs[fd["func_str_id"]]: fd["frame_id"] for fd in frame_defs}, + {"leaf": 0, "non_leaf": 1}, + ) + self.assertEqual(agg["entries"], [ + {"frame_id": 0, "self": 5, "cumulative": 5}, + {"frame_id": 1, "self": 0, "cumulative": 5}, + ]) + + def test_jsonl_collector_recursion_with_weight(self): + """Recursion dedup respects weight, not occurrence count.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + 
self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + recursive = MockFrameInfo("rec.py", 10, "f") + + collector.process_frames([recursive] * 3, _thread_id=1, weight=3) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, agg, _ = jsonl_tables(records) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(agg["entries"], [ + {"frame_id": 0, "self": 3, "cumulative": 3}, + ]) + + def test_jsonl_collector_emits_col_and_end_col_when_present(self): + """All four location fields are emitted when col/end_col are >= 0.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + frame = MockFrameInfo("test.py", 0, "f") + frame.location = LocationInfo(42, 45, 4, 12) + frames = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, _, _ = jsonl_tables(records) + self.assertEqual(frame_defs, [ + { + "frame_id": 0, + "path_str_id": 1, + "func_str_id": 0, + "line": 42, + "end_line": 45, + "col": 4, + "end_col": 12, + }, + ]) + self.assertEqual(str_defs, {0: "f", 1: "test.py"}) + + def test_jsonl_collector_partial_location_elision(self): + """Negative col/end_col/end_line fields are individually elided.""" + # _get_or_create_frame_id interns funcname before filename, so + # func_str_id=0 ("f") and path_str_id=1 ("test.py"). 
+ common = {"frame_id": 0, "path_str_id": 1, "func_str_id": 0} + cases = [ + (LocationInfo(42, 45, -1, 12), + {**common, "line": 42, "end_line": 45, "end_col": 12}), + (LocationInfo(42, 45, 4, -1), + {**common, "line": 42, "end_line": 45, "col": 4}), + (LocationInfo(42, 0, 4, 8), + {**common, "line": 42, "col": 4, "end_col": 8}), + ] + for loc, expected_frame_def in cases: + with self.subTest(location=loc): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + frame = MockFrameInfo("test.py", 0, "f") + frame.location = loc + frames = [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)], + ) + ] + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, _, _ = jsonl_tables(records) + self.assertEqual(frame_defs, [expected_frame_def]) + class TestRecursiveFunctionHandling(unittest.TestCase): """Tests for correct handling of recursive functions in cumulative stats.""" @@ -1878,6 +2266,20 @@ def test_extract_lineno_from_none(self): """Test extracting lineno from None (synthetic frames).""" self.assertEqual(extract_lineno(None), 0) + def test_extract_lineno_from_int(self): + """Test extracting lineno from a bare integer line number. + + Mirrors normalize_location's int contract so callers like the + collapsed/flamegraph collectors do not crash on a bare-int location. 
+ """ + self.assertEqual(extract_lineno(42), 42) + self.assertEqual(extract_lineno(0), 0) + + def test_normalize_location_with_int(self): + """Test normalize_location expands a legacy integer line number.""" + result = normalize_location(42) + self.assertEqual(result, (42, 42, -1, -1)) + def test_normalize_location_with_location_info(self): """Test normalize_location passes through LocationInfo.""" loc = LocationInfo(10, 15, 0, 5) @@ -2068,6 +2470,85 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) + def test_jsonl_collector_with_location_info(self): + """Test JsonlCollector handles LocationInfo properly.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(sample_interval_usec=1000) + + # Frame with LocationInfo + frame = MockFrameInfo("test.py", 42, "my_function") + frames = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + meta, str_defs, frame_defs, agg, end = jsonl_tables(records) + self.assertEqual(meta["sample_interval_usec"], 1000) + self.assertEqual(agg["samples_total"], 1) + self.assertEqual(end["samples_total"], 1) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(str_defs[frame_defs[0]["path_str_id"]], "test.py") + self.assertEqual(str_defs[frame_defs[0]["func_str_id"]], "my_function") + self.assertEqual( + frame_defs[0], + { + "frame_id": 0, + "path_str_id": frame_defs[0]["path_str_id"], + "func_str_id": frame_defs[0]["func_str_id"], + "line": 42, + "end_line": 42, + }, + ) + + def test_jsonl_collector_with_none_location(self): + """Test JsonlCollector handles None location (synthetic frames).""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + 
self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(sample_interval_usec=1000) + + # Create frame with None location (like GC frame) + frame = MockFrameInfo("~", 0, "") + frame.location = None # Synthetic frame has no location + frames = [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + meta, str_defs, frame_defs, agg, end = jsonl_tables(records) + self.assertEqual(meta["sample_interval_usec"], 1000) + self.assertEqual(agg["samples_total"], 1) + self.assertEqual(end["samples_total"], 1) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(str_defs[frame_defs[0]["path_str_id"]], "~") + self.assertEqual(str_defs[frame_defs[0]["func_str_id"]], "") + self.assertEqual( + frame_defs[0], + { + "frame_id": 0, + "path_str_id": frame_defs[0]["path_str_id"], + "func_str_id": frame_defs[0]["func_str_id"], + "line": 0, + }, + ) + class TestOpcodeHandling(unittest.TestCase): """Tests for opcode field handling in collectors.""" @@ -2288,6 +2769,28 @@ def test_gecko_collector_frame_format(self): # Should have recorded 3 functions self.assertEqual(thread["funcTable"]["length"], 3) + def test_jsonl_collector_frame_format(self): + """Test JsonlCollector with 4-element frame format.""" + collector = JsonlCollector(sample_interval_usec=1000) + collector.collect(self._make_sample_frames()) + + with tempfile.NamedTemporaryFile(delete=False) as f: + self.addClassCleanup(close_and_unlink, f) + collector.export(f.name) + + with open(f.name, "r", encoding="utf-8") as fp: + records = [json.loads(line) for line in fp] + + _, str_defs, frame_defs, _, _ = jsonl_tables(records) + + self.assertEqual(len(frame_defs), 3) + + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + funcs = {str_defs[item["func_str_id"]] for item in frame_defs} + 
+ self.assertEqual(paths, {"app.py", "utils.py", "lib.py"}) + self.assertEqual(funcs, {"main", "helper", "process"}) + class TestInternalFrameFiltering(unittest.TestCase): """Tests for filtering internal profiler frames from output.""" @@ -2415,3 +2918,42 @@ def test_collapsed_stack_collector_filters_internal_frames(self): for (call_tree, _), _ in collector.stack_counter.items(): for filename, _, _ in call_tree: self.assertNotIn("_sync_coordinator", filename) + + def test_jsonl_collector_filters_internal_frames(self): + """Test that JsonlCollector filters out internal frames.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("app.py", 50, "run"), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo("", 87, "_run_code"), + ], + status=THREAD_STATUS_HAS_GIL, + ) + ], + ) + ] + + collector = JsonlCollector(sample_interval_usec=1000) + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, _, _ = jsonl_tables(records) + + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + + self.assertIn("app.py", paths) + self.assertIn("", paths) + + for path in paths: + self.assertNotIn("_sync_coordinator", path) diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst new file mode 100644 index 00000000000000..636f45ae8d6c70 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -0,0 +1,4 @@ +The ``profiling.sampling`` module now supports JSONL output format via +``--jsonl``. Each run emits a newline-delimited JSON file that is +sequentially parseable by external tools, scripts, and programmatic +consumers. 
Patch by Maurycy Pawłowski-Wieroński. diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c index 6c32ef70ac3f65..a3364ce913923e 100644 --- a/Modules/_remote_debugging/binary_io_reader.c +++ b/Modules/_remote_debugging/binary_io_reader.c @@ -785,9 +785,9 @@ build_frame_list(RemoteDebuggingState *state, BinaryReader *reader, if (frame->lineno != LOCATION_NOT_AVAILABLE) { location = Py_BuildValue("(iiii)", frame->lineno, - frame->end_lineno != LOCATION_NOT_AVAILABLE ? frame->end_lineno : frame->lineno, - frame->column != LOCATION_NOT_AVAILABLE ? frame->column : 0, - frame->end_column != LOCATION_NOT_AVAILABLE ? frame->end_column : 0); + frame->end_lineno, + frame->column, + frame->end_column); if (!location) { Py_DECREF(frame_info); goto error;