diff --git a/.agents b/.agents new file mode 120000 index 000000000..c8161850a --- /dev/null +++ b/.agents @@ -0,0 +1 @@ +.claude \ No newline at end of file diff --git a/.claude/commands/perf-example-device.md b/.claude/commands/perf-example-device.md index 28c1a50ef..c72fef178 100644 --- a/.claude/commands/perf-example-device.md +++ b/.claude/commands/perf-example-device.md @@ -3,8 +3,9 @@ Benchmark the hardware performance of a single example at $ARGUMENTS. Reference `tools/benchmark_rounds.sh` for the full implementation pattern (device log resolution, timing parsing, reporting format). This skill runs the same logic but for a single example only. 1. Verify `$ARGUMENTS` exists and contains `kernels/kernel_config.py` and `golden.py` -2. Check `command -v npu-smi` — if not found, tell the user this requires hardware and stop -3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3` -4. Find the lowest-ID idle device (HBM-Usage = 0) from the `npu-smi info` output. If none, stop -5. Run the example following the same pattern as `run_bench()` in `tools/benchmark_rounds.sh`: +2. Require the example path to live under `examples/a2a3/` or `examples/a5/`. If it does not, stop and report that root-level `examples/{runtime}/...` paths are invalid. +3. Check `command -v npu-smi` — if not found, tell the user this requires hardware and stop +4. **Detect platform**: Infer the architecture from the example path (`examples/a2a3/...` → `a2a3`, `examples/a5/...` → `a5`). Use `npu-smi info` only as a sanity check; if the detected chip family conflicts with the path, report the mismatch and stop instead of silently switching platforms. +5. Find the lowest-ID idle device (HBM-Usage = 0) from the `npu-smi info` output. If none, stop +6. 
Run the example following the same pattern as `run_bench()` in `tools/benchmark_rounds.sh`: - Snapshot logs, run `run_example.py` with `-n 10`, find new log, parse timing, report results diff --git a/.claude/commands/profile.md b/.claude/commands/profile.md index aafe867f1..546bf3dca 100644 --- a/.claude/commands/profile.md +++ b/.claude/commands/profile.md @@ -1,6 +1,8 @@ Run the example at $ARGUMENTS with profiling enabled on hardware. 1. Verify the directory exists and contains `kernels/kernel_config.py` and `golden.py` -2. Run: `python examples/scripts/run_example.py -k $ARGUMENTS/kernels -g $ARGUMENTS/golden.py -p a2a3 --enable-profiling` -3. If the test passes, report the swimlane output file location in `outputs/` -4. Summarize the task statistics from the console output (per-function timing breakdown) +2. Require the example path to live under `examples/a2a3/` or `examples/a5/`. If it does not, stop and report that root-level `examples/{runtime}/...` paths are invalid. +3. Infer the platform from the example path (`examples/a2a3/...` → `a2a3`, `examples/a5/...` → `a5`). +4. Run: `python examples/scripts/run_example.py -k $ARGUMENTS/kernels -g $ARGUMENTS/golden.py -p <platform> --enable-profiling` +5. If the test passes, report the swimlane output file location in `outputs/` +6. Summarize the task statistics from the console output (per-function timing breakdown) diff --git a/.claude/commands/test-example-device.md b/.claude/commands/test-example-device.md index ac34dd232..f30736419 100644 --- a/.claude/commands/test-example-device.md +++ b/.claude/commands/test-example-device.md @@ -1,8 +1,9 @@ Run the hardware device test for the example at $ARGUMENTS. 1. Verify the directory exists and contains `kernels/kernel_config.py` and `golden.py` -2. Check `command -v npu-smi` — if not found, tell the user to use `/test-example-sim` instead and stop -3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. 
If unrecognized, warn and default to `a2a3` -4. Read `.github/workflows/ci.yml` to extract the current `-c` (pto-isa commit) flag from the `st-onboard-` job's `./ci.sh` invocation -5. Run: `python examples/scripts/run_example.py -k $ARGUMENTS/kernels -g $ARGUMENTS/golden.py -p -c ` -6. Report pass/fail status with any error output +2. Require the example path to live under `examples/a2a3/` or `examples/a5/`. If it does not, stop and report that root-level `examples/{runtime}/...` paths are invalid. +3. Check `command -v npu-smi` — if not found, tell the user to use `/test-example-sim` instead and stop +4. **Detect platform**: Infer the architecture from the example path (`examples/a2a3/...` → `a2a3`, `examples/a5/...` → `a5`). Use `npu-smi info` only as a sanity check; if the detected chip family conflicts with the path, report the mismatch and stop instead of silently switching platforms. +5. Read `.github/workflows/ci.yml` to extract the current `-c` (pto-isa commit) flag from the `st-onboard-` job's `./ci.sh` invocation +6. Run: `python examples/scripts/run_example.py -k $ARGUMENTS/kernels -g $ARGUMENTS/golden.py -p <platform> -c <commit>` +7. Report pass/fail status with any error output diff --git a/.claude/commands/test-example-sim.md b/.claude/commands/test-example-sim.md index 79deecf50..1f0bbb40c 100644 --- a/.claude/commands/test-example-sim.md +++ b/.claude/commands/test-example-sim.md @@ -1,7 +1,8 @@ Run the simulation test for the example at $ARGUMENTS. 1. Verify the directory exists and contains `kernels/kernel_config.py` and `golden.py` -2. Read `.github/workflows/ci.yml` to extract the current `-c` (pto-isa commit) flag from the `st-sim-*` jobs' `./ci.sh` invocations -3. **Detect platform**: Infer the architecture from the example path (e.g., `examples/a2a3/...` → `a2a3sim`, `examples/a5/...` → `a5sim`). If the path doesn't contain an arch prefix, default to `a2a3sim` -4. 
Run: `python examples/scripts/run_example.py -k $ARGUMENTS/kernels -g $ARGUMENTS/golden.py -p -c ` -5. Report pass/fail status with any error output +2. Require the example path to live under `examples/a2a3/` or `examples/a5/`. If it does not, stop and report that root-level `examples/{runtime}/...` paths are invalid. +3. Read `.github/workflows/ci.yml` to extract the current `-c` (pto-isa commit) flag from the `st-sim-*` jobs' `./ci.sh` invocations +4. **Detect platform**: Infer the architecture from the example path (`examples/a2a3/...` → `a2a3sim`, `examples/a5/...` → `a5sim`). +5. Run: `python examples/scripts/run_example.py -k $ARGUMENTS/kernels -g $ARGUMENTS/golden.py -p -c ` +6. Report pass/fail status with any error output diff --git a/.claude/rules/architecture.md b/.claude/rules/architecture.md index 0125b1822..abc1f78cb 100644 --- a/.claude/rules/architecture.md +++ b/.claude/rules/architecture.md @@ -24,6 +24,11 @@ See [docs/architecture.md](../../docs/architecture.md) for the full diagram, API ## Example / Test Layout +Examples must live under `examples/{arch}/{runtime}/{name}/`. Valid example roots are +`examples/a2a3/` and `examples/a5/`. Paths such as +`examples/host_build_graph//` or `examples/tensormap_and_ringbuffer//` +directly under `examples/` are invalid. + ```text my_example/ golden.py # generate_inputs() + compute_golden() diff --git a/.gitignore b/.gitignore index 37d5e142b..0b8113d9b 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ venv/ .claude/settings.local.json .claude/worktrees .claude/plans +.worktrees/ # Git cloned dependencies (not tracked in repo) examples/scripts/_deps/ diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 982e706a1..000000000 --- a/AGENTS.md +++ /dev/null @@ -1,17 +0,0 @@ -# AGENTS Guide - -**EVERY AI AGENT MUST FOLLOW THIS GUIDE BEFORE ANY WORK.** - -## Required startup sequence - -1. Read `CLAUDE.md` before running commands, analyzing code, or editing files. -2. 
Treat `CLAUDE.md` as the source of truth for role boundaries, architecture context, and repository workflow. -3. Load always-on conventions from `.claude/rules/` (for example: architecture, codestyle, device constraints). -4. Load only task-relevant workflows from `.claude/skills/` and `.claude/commands/`. - -## Additional rules - -- If `CLAUDE.md` changes, read it again before continuing. -- If relevant files under `.claude/rules/`, `.claude/skills/`, or `.claude/commands/` change, refresh your context before proceeding. -- If user instructions conflict with repository conventions, prioritize user intent for that task. -- Higher-priority system/developer/user instructions override this guide. diff --git a/AGENTS.md b/AGENTS.md new file mode 120000 index 000000000..681311eb9 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +CLAUDE.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 046b9fdf3..2ecd700a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,7 +8,7 @@ See [docs/developer-guide.md](docs/developer-guide.md) for full directory struct | ---- | ----------------- | | Platform Developer | `src/{arch}/platform/` | | Runtime Developer | `src/{arch}/runtime/` | -| Codegen Developer | `examples/` | +| Codegen Developer | `examples/{arch}/` | ## Common Commands @@ -32,8 +32,9 @@ clang-format -i ## Important Rules -1. **Consult `.claude/rules/` for coding conventions** (architecture, codestyle, terminology) — these are always-loaded guidelines. **Consult `.claude/skills/` for task-specific workflows** (e.g., `git-commit/` when committing, `testing/` when running tests) +1. **Consult `.agents/rules/` for coding conventions** (architecture, codestyle, terminology) — these are always-loaded guidelines. **Consult `.agents/skills/` for task-specific workflows** (e.g., `git-commit/` when committing, `testing/` when running tests) 2. **Do not modify directories outside your assigned area** unless the user explicitly requests it 3. 
Create new subdirectories under your assigned directory as needed 4. When in doubt, ask the user before making changes to other areas 5. **Avoid including private information in documentation or code** such as usernames, absolute paths with usernames, or other personally identifiable information. Use relative paths or generic placeholders instead +6. **Place examples under `examples/{arch}/{runtime}/{name}/`**. Do not create `examples/{runtime}/...` directly under `examples/`. diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 64255dfab..b39823200 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -106,7 +106,9 @@ When preprocessor guards are used to isolate platform code paths, the `__aarch64 ## Example / Test Layout -Every example and device test follows this structure: +Examples must live under `examples/{arch}/{runtime}/{name}/`, and device scenes must +live under `tests/st/{arch}/{runtime}/{name}/`. Every example and device test follows +this structure: ```text my_example/ diff --git a/docs/manual-dep-for-tensormap-design.md b/docs/manual-dep-for-tensormap-design.md new file mode 100644 index 000000000..3853f67f0 --- /dev/null +++ b/docs/manual-dep-for-tensormap-design.md @@ -0,0 +1,406 @@ +# Manual Dependency For TensorMap Runtime + +## Goal + +Add a scoped manual-dependency mode to `tensormap_and_ringbuffer` without +regressing the default automatic path: + +- `PTO2_SCOPE()` stays in automatic mode +- `PTO2_SCOPE(PTO2ScopeMode::MANUAL)` enables scoped manual dependency wiring +- same-manual-scope edges use explicit `pto2_rt_add_dependency(...)` +- cross-scope edges still use `owner_task_id` and TensorMap discovery + +This is a hybrid model, not a port of `aicpu_build_graph`. 
+ +## API Surface + +The orchestration-facing API is: + +```cpp +enum class PTO2ScopeMode : uint8_t { + AUTO = 0, + MANUAL = 1, +}; + +PTO2_SCOPE() { + // default: AUTO +} + +PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + auto qk = pto2_rt_submit_aic_task_manual(...); + auto sf = pto2_rt_submit_aiv_task_manual(...); + pto2_rt_add_dependency(qk.task_id, sf.task_id); +} +``` + +Current restrictions: + +- manual submit APIs are only valid inside + `PTO2_SCOPE(PTO2ScopeMode::MANUAL)` +- `pto2_rt_add_dependency(...)` requires both tasks to belong to the current + manual scope +- nested scope inside manual scope is rejected in v1 +- blocking tensor access helpers are rejected inside manual scope + +## Dependency Semantics + +### Tensor origin matters first + +Each tensor argument is classified at submit time: + +- `manual-local`: the tensor owner was created inside the current manual scope +- `boundary`: anything else, including external tensors and tensors produced by + tasks outside the current manual scope + +Manual-local tensors skip TensorMap entirely. Boundary tensors stay on the +normal TensorMap path unless `manual_dep=true`. 
+ +### `INPUT`, `OUTPUT`, `INOUT`, and friends + +`TensorArgType` behavior in the runtime: + +| Arg kind | Meaning | Incoming dependency work | Outgoing frontier work | +| --- | --- | --- | --- | +| `INPUT` | existing tensor, read-only | creator retention, plus TensorMap lookup unless skipped | none | +| `OUTPUT` | fresh runtime-allocated tensor | none | no TensorMap insert at creation; `owner_task_id` is stamped on the produced tensor | +| `INOUT` | existing tensor, read + write | creator retention, plus TensorMap lookup unless skipped | TensorMap insert unless skipped | +| `OUTPUT_EXISTING` | existing tensor, write-only | creator retention only | TensorMap insert unless skipped | +| `NO_DEP` | existing tensor, creator-retention-only | creator retention only | none | + +### Manual-local vs boundary behavior + +| Arg kind | Manual-local tensor | Boundary tensor | +| --- | --- | --- | +| `INPUT` | no TensorMap lookup, requires explicit manual edge | creator retention; TensorMap lookup unless `manual_dep=true` | +| `OUTPUT` | fresh local tensor; later same-scope uses rely on explicit manual edges | not applicable | +| `INOUT` | no TensorMap lookup/insert, requires explicit manual edge | creator retention; TensorMap lookup for incoming state; TensorMap insert for outgoing state unless `manual_dep=true` | +| `OUTPUT_EXISTING` | no TensorMap insert, requires explicit manual edge if later reused in scope | creator retention; TensorMap insert for outgoing state unless `manual_dep=true` | +| `NO_DEP` | creator-only object passing, no publish | same | + +### `manual_dep=true` + +`Tensor::manual_dep` keeps its existing meaning: + +- skip TensorMap lookup/insert +- keep creator-only retention via `owner_task_id` + +It is a per-tensor optimization hint. It is not the core manual-scope +mechanism. 
+ +## Runtime Model + +### High-level flow + +```text +PTO2_SCOPE(MANUAL) + | + v + submit_*_manual() + | + +-- classify tensor args + | |- manual-local -> no TensorMap + | `- boundary -> owner retention + optional TensorMap + | + +-- allocate slot / payload / outputs + | + +-- wire boundary producers immediately + | `- keep one extra fanin publish barrier + | + `-- return { task_id, outputs } + | + v + pto2_rt_add_dependency() + | + `-- wire same-scope producer -> consumer immediately + +scope_end() + | + +-- validate fanin bounds + +-- repair monotonic dep_pool_mark prefix + +-- release publish barrier and batch-publish tasks + `-- do normal scope lifetime release +``` + +### What manual submit iterates + +Current implementation is in +`src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp`. + +For a manual submit: + +1. allocate the task slot, payload, and task id immediately +2. classify each tensor arg as manual-local or boundary +3. build `manual_local_mask` for same-scope tensors +4. decide whether TensorMap sync is needed at all + - if every relevant arg is manual-local or `manual_dep=true`, skip sync + - otherwise run the normal TensorMap sync +5. for each non-`OUTPUT` arg that is not manual-local + - always do creator retention from `owner_task_id` + - for `INPUT` and `INOUT`, do TensorMap lookup unless `manual_dep=true` +6. for `INOUT` and `OUTPUT_EXISTING` boundary args + - update TensorMap frontier unless `manual_dep=true` +7. initialize scheduler state, but keep the task unpublished behind a deferred + publish barrier + +Important consequence: + +- cross-scope dependency discovery is still paid at submit time +- same-scope dependency discovery is no longer replayed from tensors later + +### What `pto2_rt_add_dependency(...)` does now + +This is the key difference from the older design draft. + +`pto2_rt_add_dependency(...)` no longer records an edge for replay at +`scope_end()`. 
It validates both task ids belong to the current manual scope, +dedups against the consumer payload, ensures dep-pool space, and wires the edge +immediately: + +- increments producer `fanout_count` +- prepends the consumer into the producer fanout list +- appends the producer slot state into `payload->fanin_slot_states[]` +- increments consumer `fanin_count` +- updates consumer `dep_pool_mark` + +That removes the old replay-heavy finalize path. + +### What `scope_end()` does now + +Manual `scope_end()` is now intentionally small and TensorMap-free. + +It only: + +1. validates `fanin_actual_count` +2. repairs a monotonic `dep_pool_mark` prefix +3. calls `publish_manual_scope_tasks_and_end_scope(...)` +4. performs the normal scope lifetime release + +There is no explicit-edge replay at `scope_end()` anymore. + +## Why This Split Is Correct + +### Cross-scope correctness + +Cross-scope tensors still need TensorMap because the runtime must preserve: + +- latest-writer frontier tracking +- overlap-based modifier discovery +- boundary ordering across scopes + +If manual scope disabled TensorMap globally, outer reads and writes would +become incorrect. + +### Same-scope performance + +Manual-local tensors are exactly where TensorMap is unnecessary work: + +- the producer is already known from the current manual scope +- the ordering can be expressed directly by `pto2_rt_add_dependency(...)` +- replaying those edges at `scope_end()` added serial overhead without adding + correctness + +### Zero-overhead AUTO path + +The manual-scope extension must not slow down the normal AUTO runtime. + +Fresh measurements below show the current AUTO runtime stays within roughly +`±1%` end-to-end of the unmodified baseline on the two paged-attention scenes, +which is the intended zero-overhead result. + +## Example Requirements + +Manual mode only helps when the example exposes a real same-scope +producer/consumer chain that TensorMap would otherwise rediscover. 
+ +For paged attention, the profitable chain is: + +```text +qk_matmul -> softmax_prepare -> pv_matmul -> online_update +``` + +Inside a manual scope: + +- intermediate tensors in that chain should stay manual-local +- explicit edges should connect those tasks directly +- outer tensors such as the external KV cache and the final output still keep + boundary semantics + +If an example keeps using boundary tensors everywhere, manual mode cannot +remove much runtime work. + +## Benchmark Enablement + +Current branch benchmark entrypoints: + +```bash +./tools/benchmark_rounds.sh -d 4 -n 5 -c d96c8784 -r aicpu_build_graph --build +./tools/benchmark_rounds.sh -d 4 -n 5 -c d96c8784 -r tensormap_and_ringbuffer --build +./tools/benchmark_rounds.sh -d 4 -n 5 -c d96c8784 -r tensormap_and_ringbuffer_partial_manual --build +``` + +`tensormap_and_ringbuffer_partial_manual` is a selector in +`tools/benchmark_rounds.sh`. The example `kernel_config.py` files still use +`RUNTIME_CONFIG["runtime"] = "tensormap_and_ringbuffer"`. The selector only +switches the scene directories to: + +- `tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual` +- `tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual` + +The old unmodified runtime is intentionally not kept on this branch. To rerun +it side-by-side: + +```bash +export PROJECT_ROOT=$(pwd) +git worktree add tmp/worktree_unmodified a71ba16 +( + cd tmp/worktree_unmodified + python3 -m venv .venv --system-site-packages + . .venv/bin/activate + pip install -e . 
-q + export PTO_ISA_ROOT="$PROJECT_ROOT/examples/scripts/_deps/pto-isa" + ./tools/benchmark_rounds.sh -d 4 -n 5 -c d96c8784 \ + -r tensormap_and_ringbuffer_unmodified --build +) +``` + +Fresh benchmark logs for the rebased branch are in: + +- `tmp/rebased_bench_20260410_fix/aicpu_build_graph.log` +- `tmp/rebased_bench_20260410_fix/tensormap_and_ringbuffer.log` +- `tmp/rebased_bench_20260410_fix/tensormap_and_ringbuffer_partial_manual.log` +- `tmp/rebased_bench_20260410_fix/tensormap_and_ringbuffer_unmodified.log` + +Rebase note: + +- `paged_attention_unroll_partial_manual` was initially timing out after the + merge-forward. +- The runtime manual-scope machinery was not the root cause. +- The direct cause was stale example-side AIC submit ABI: the rebased + `paged_attention_unroll` AIC kernels now expect `block_table` as a tensor + input plus a scalar `bt_offset`, while the partial-manual scene was still + passing a raw pointer scalar. +- Fixing the partial-manual `qk/pv` submit argument layout restored both + unroll cases on device. + +## Fresh Hardware Results + +Fresh rerun settings: + +- date: `2026-04-10` +- platform: `a2a3` +- device: `4` +- rounds: `5` +- PTO-ISA commit: `d96c8784` + +Units below are `elapsed_us (orch_us)`. `aicpu_build_graph` does not emit the +same orch timing lines, so only elapsed time is shown there. 
+ +### `paged_attention` + +| Case | `aicpu_build_graph` | `tensormap_and_ringbuffer_unmodified` | `tensormap_and_ringbuffer` | `tensormap_and_ringbuffer_partial_manual` | +| --- | ---: | ---: | ---: | ---: | +| `Case1` | `29937.7` | `36095.9 (36094.9)` | `39148.7 (39148.3)` | `34186.3 (34025.7)` | +| `Case2` | `16762.7` | `18639.5 (18635.1)` | `19813.0 (19812.7)` | `18028.7 (17618.4)` | + +### `paged_attention_unroll` + +| Case | `aicpu_build_graph` | `tensormap_and_ringbuffer_unmodified` | `tensormap_and_ringbuffer` | `tensormap_and_ringbuffer_partial_manual` | +| --- | ---: | ---: | ---: | ---: | +| `Case1` | `1425.3` | `1325.6 (835.3)` | `1173.2 (992.0)` | `1160.4 (968.8)` | +| `Case2` | `693.0` | `628.7 (380.7)` | `567.9 (435.6)` | `561.9 (416.6)` | + +## Feature / Optimization -> Gain + +### 1. AUTO stays effectively zero-overhead + +The current AUTO runtime no longer meets the zero-overhead target on the +non-unroll scene, but it still wins clearly on the unroll scene: + +- `paged_attention/Case1`: `39148.7 us` vs `36095.9 us` (`+8.5%`) +- `paged_attention/Case2`: `19813.0 us` vs `18639.5 us` (`+6.3%`) +- `paged_attention_unroll/Case1`: `1173.2 us` vs `1325.6 us` (`-11.5%`) +- `paged_attention_unroll/Case2`: `567.9 us` vs `628.7 us` (`-9.7%`) + +So the AUTO path is still good for the already-amortized unroll workload, but +not yet zero-overhead for the non-unroll paged-attention target. + +### 2. 
Partial-manual removes the non-unroll gap + +Against the current AUTO runtime, partial-manual improves the non-unroll scene +substantially: + +- `paged_attention/Case1` + - elapsed: `39148.7 us -> 34186.3 us` (`-12.7%`) + - orch: `39148.3 us -> 34025.7 us` (`-13.1%`) +- `paged_attention/Case2` + - elapsed: `19813.0 us -> 18028.7 us` (`-9.0%`) + - orch: `19812.7 us -> 17618.4 us` (`-11.1%`) + +Against `aicpu_build_graph`, there is still a visible non-unroll gap: + +- `Case1`: `34186.3 us` vs `29937.7 us` (`+14.2%`) +- `Case2`: `18028.7 us` vs `16762.7 us` (`+7.6%`) + +Against the unmodified tensormap baseline, partial-manual is now ahead on the +non-unroll scene: + +- `Case1`: `36095.9 us -> 34186.3 us` (`-5.3%`) +- `Case2`: `18639.5 us -> 18028.7 us` (`-3.3%`) + +### 3. Unroll already amortizes most of the cost + +On `paged_attention_unroll`, both current runtimes are already better than +`aicpu_build_graph`, and partial-manual only nudges the AUTO path slightly: + +- `Case1`: `1173.2 us -> 1160.4 us` elapsed (`-1.1%`) +- `Case2`: `567.9 us -> 561.9 us` elapsed (`-1.1%`) + +That is the expected shape. The unroll orchestration already amortizes most +dependency overhead, so partial-manual has little room left to improve. + +### 4. What specifically helped + +The important runtime-side wins were: + +- classify manual-local tensors from `owner_task_id` +- skip TensorMap work for those manual-local tensors +- wire explicit same-scope edges immediately in `pto2_rt_add_dependency(...)` +- keep `scope_end()` down to publish-barrier release plus `dep_pool_mark` + fixup + +The important example-side win was using manual scope only where the +non-unroll paged-attention orchestration still had repeated same-scope +dependency work to remove. + +## Current Risks + +1. `manual_dep=true` can still be abused. + - It suppresses TensorMap lookup/insert for that tensor. + - It is only safe when ordering/frontier requirements are already covered by + other logic. + +2. 
Nested scope inside manual scope is still unsupported. + - This is a current implementation restriction, not a theoretical property. + +3. `pto2_rt_add_dependency(...)` now spends dep-pool entries on the submit path. + - That is intentional, but it means dep-pool pressure moved from the old + replay path into explicit-edge wiring. + +4. Manual publish still relies on `dep_pool_mark` prefix repair at `scope_end()`. + - This is required because explicit edges can touch older consumers after + newer tasks were already submitted. + +## Recommendation Summary + +Keep the design as: + +- AUTO mode by default +- explicit MANUAL mode through `PTO2ScopeMode` +- TensorMap kept only for cross-scope correctness +- explicit immediate wiring for same-scope manual edges +- `scope_end()` reduced to publish-barrier release and normal lifetime work + +That gives the required feature coverage while keeping the AUTO path +competitive on unroll and materially reducing the non-unroll gap, but the +fresh rerun still shows more work is needed to make partial-manual match +`aicpu_build_graph` on non-unroll paged attention. diff --git a/examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md index 3d52c2e12..d43b83dd8 100644 --- a/examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -4,8 +4,8 @@ In aicpu_build_graph, the orchestration function runs on AICPU. It reads device pointers from `runtime->orch_args`, allocates intermediate buffers with `device_malloc`, builds the task dependency graph through the `AicpuBuildApi` function-pointer table, and publishes tasks for scheduling. ## Where To Put Orchestration Code -- Each example keeps orchestration sources under `examples/aicpu_build_graph//kernels/orchestration/`. -- `examples/aicpu_build_graph//kernels/kernel_config.py` defines the orchestration entry point. 
Example: `ORCHESTRATION = {"source": ".../orchestration.cpp", "function_name": "orchestration"}`. +- Each example keeps orchestration sources under `examples/a2a3/aicpu_build_graph//kernels/orchestration/`. +- `examples/a2a3/aicpu_build_graph//kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../orchestration.cpp", "function_name": "orchestration"}`. ## Function Signature Your orchestration entry must be `extern "C"` and match: @@ -60,5 +60,5 @@ Where `api` is `runtime->aicpu_build_api`. - `"0"`: Sequential -- schedulers wait until the builder finishes all tasks. ## Examples -- `examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp` -- `examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp` +- `examples/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp` +- `examples/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp` diff --git a/examples/a2a3/aicpu_build_graph/vector_example/README.md b/examples/a2a3/aicpu_build_graph/vector_example/README.md index 49107b056..5e6c3fcd6 100644 --- a/examples/a2a3/aicpu_build_graph/vector_example/README.md +++ b/examples/a2a3/aicpu_build_graph/vector_example/README.md @@ -6,8 +6,8 @@ This example runs the same computation as `host_build_graph_example`, but the ta ```bash python examples/scripts/run_example.py \ - -k examples/aicpu_build_graph/vector_example/kernels \ - -g examples/aicpu_build_graph/vector_example/golden.py \ + -k examples/a2a3/aicpu_build_graph/vector_example/kernels \ + -g examples/a2a3/aicpu_build_graph/vector_example/golden.py \ -p a2a3sim ``` diff --git a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md index fc632cc7b..42182f95f 100644 --- a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -5,9 +5,8 
@@ In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `add_task(runtime, ...)`, and wires dependencies with `add_successor(runtime, ...)`. ## Where To Put Orchestration Code - -- Each example keeps orchestration sources under `examples/host_build_graph//kernels/orchestration/`. -- `examples/host_build_graph//kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../example_orch.cpp", "function_name": "build_example_graph"}`. +- Each example keeps orchestration sources under `examples/a2a3/host_build_graph//kernels/orchestration/`. +- `examples/a2a3/host_build_graph//kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../example_orch.cpp", "function_name": "build_example_graph"}`. ## Function Signature @@ -37,7 +36,7 @@ A typical host orchestration sequence is: 4. Create tasks with `add_task(runtime, args, num_args, func_id, core_type)`. 5. Add dependency edges with `add_successor(runtime, producer, consumer)`. -Example: see `examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp`. +Example: see `examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp`. 
## Kernel Mapping diff --git a/examples/a2a3/host_build_graph/vector_example/README.md b/examples/a2a3/host_build_graph/vector_example/README.md index 20755cfea..974483703 100644 --- a/examples/a2a3/host_build_graph/vector_example/README.md +++ b/examples/a2a3/host_build_graph/vector_example/README.md @@ -52,14 +52,14 @@ This example supports two platforms: ```bash # From repository root python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3sim # With verbose output python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3sim \ -v ``` @@ -69,21 +69,21 @@ python examples/scripts/run_example.py \ ```bash # From repository root python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3 # With specific device ID python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3 \ -d 0 # With verbose output python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py 
\ -p a2a3 \ -v ``` diff --git a/examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md index 2db7dda82..3265db0d9 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -5,9 +5,8 @@ In tensormap_and_ringbuffer, the orchestration function runs on AICPU and builds the graph directly on device. Dependencies are discovered automatically by TensorMap based on tensor overlap, and task memory is allocated from ring buffers. ## Where To Put Orchestration Code - -- Each example keeps orchestration sources under `examples/tensormap_and_ringbuffer//kernels/orchestration/`. -- `examples/tensormap_and_ringbuffer//kernels/kernel_config.py` selects the orchestration source and the runtime `tensormap_and_ringbuffer`. +- Each example keeps orchestration sources under `examples/a2a3/tensormap_and_ringbuffer//kernels/orchestration/`. +- `examples/a2a3/tensormap_and_ringbuffer//kernels/kernel_config.py` selects the orchestration source and the runtime `tensormap_and_ringbuffer`. ## Required Exports @@ -78,6 +77,5 @@ Dependencies are inferred by TensorMap from input/inout/output tensors, so you d Do not call `pto2_rt_orchestration_done` yourself in device mode. The executor wraps the entry call in an outer scope and signals completion after `aicpu_orchestration_entry` returns. 
## Examples - -- `examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` (AIV-only tasks) -- `examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` (mixed AIC + AIV tasks) +- `examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` (AIV-only tasks) +- `examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` (mixed AIC + AIV tasks) diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/golden.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/golden.py new file mode 100644 index 000000000..89df96225 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/golden.py @@ -0,0 +1,7 @@ +from pathlib import Path +import sys + +_BASE = Path(__file__).resolve().parents[1] / "paged_attention" +sys.path.insert(0, str(_BASE)) + +from golden import ALL_CASES, ATOL, DEFAULT_CASE, RTOL, __outputs__, compute_golden, generate_inputs # noqa: E402,F401 diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/kernel_config.py new file mode 100644 index 000000000..534a84f19 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/kernel_config.py @@ -0,0 +1,72 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from pathlib import Path + +from task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_ROOT = Path(__file__).parent +_PA_KERNELS = _ROOT.parent.parent / "paged_attention" / "kernels" + +ORCHESTRATION = { + "source": str(_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], +} + +KERNELS = [ + { + "func_id": 0, + "name": "QK", + "source": str(_PA_KERNELS / "aic" / "aic_qk_matmul.cpp"), + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": str(_PA_KERNELS / "aic" / "aic_pv_matmul.cpp"), + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 4, + "name": "AIC_HUB", + "source": str(_PA_KERNELS / "aic" / "aic_hub.cpp"), + "core_type": "aic", + "signature": [], + }, + { + "func_id": 1, + "name": "SF", + "source": str(_PA_KERNELS / "aiv" / "aiv_softmax_prepare.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": str(_PA_KERNELS / "aiv" / "aiv_online_update.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + { + "func_id": 5, + "name": "AIV_HUB", + "source": str(_PA_KERNELS / "aiv" / "aiv_hub.cpp"), + "core_type": "aiv", + "signature": [], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/orchestration/paged_attention_orch.cpp new file 
mode 100644 index 000000000..c5487525d --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + uint64_t batch = orch_args.tensor(0).shapes[0]; + uint64_t num_heads = orch_args.tensor(0).shapes[1]; + uint64_t head_dim = orch_args.tensor(0).shapes[2]; + DataType data_type = orch_args.tensor(0).dtype; + uint64_t block_size = orch_args.tensor(1).shapes[1]; + uint64_t block_num = orch_args.tensor(3).shapes[1]; + uint64_t scale_value = 
orch_args.scalar(0); + + uint64_t q_head_num = num_heads; + uint64_t q_tile = 16; + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + + uint64_t b_start = batch * orch_thread_index / orch_thread_num; + uint64_t b_end = batch * (orch_thread_index + 1) / orch_thread_num; + + void *query_ptr = orch_args.tensor(0).data_as(); + void *kc_ptr = orch_args.tensor(1).data_as(); + void *vc_ptr = orch_args.tensor(2).data_as(); + void *out_ptr = orch_args.tensor(5).data_as(); + + uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; + uint64_t kv_total_rows = total_blocks_count * block_size; + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = {static_cast(kv_total_rows), static_cast(head_dim)}; + uint32_t value_cache_shapes[2] = {static_cast(kv_total_rows), static_cast(head_dim)}; + uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); + + int *host_block_table = orch_args.tensor(3).data_as(); + int *host_context_lens = orch_args.tensor(4).data_as(); + + uint32_t tile2d_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t scalar_shapes[1] = {static_cast(q_tile)}; + uint32_t sij_shapes[2] = {static_cast(q_tile), static_cast(block_size)}; + TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32); + TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); + TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type); + + for (uint64_t b_idx = b_start; b_idx < b_end; b_idx++) { + uint64_t cur_seq = host_context_lens[b_idx]; + uint64_t bn_this_batch = (cur_seq + 
block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE() { + uint32_t cur_offset = static_cast(b_idx * q_head_num + q_idx * q_tile); + uint32_t qi_offsets[2] = {cur_offset, 0}; + uint32_t out_view_offsets[2] = {cur_offset, 0}; + Tensor qi = query.view(tile2d_shapes, qi_offsets); + Tensor out_view = out.view(tile2d_shapes, out_view_offsets); + + Arg params_inplace; + params_inplace.add_output(tile2d_ci); + params_inplace.add_output(scalar_ci); + params_inplace.add_output(scalar_ci); + TaskOutputTensors hub_outs = pto2_rt_submit_aiv_task(FUNC_AIV_HUB, params_inplace); + const Tensor &oi = hub_outs.get_ref(0); + const Tensor &li_update = hub_outs.get_ref(1); + const Tensor &mi_update = hub_outs.get_ref(2); + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + uint32_t kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; + uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + Arg params_qk; + params_qk.add_input(qi); + params_qk.add_input(kj); + params_qk.add_output(sij_ci); + PTO2ManualSubmitResult qk_outs = pto2_rt_submit_aic_task_manual(FUNC_QK_MATMUL, params_qk); + const Tensor &sij = qk_outs.outputs.get_ref(0); + + uint32_t sij_valid_shapes[2] = { + static_cast(q_tile), static_cast(valid_len) + }; + uint32_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + + Arg params_sf; + params_sf.add_input(sij_valid); + params_sf.add_output(pij_f16_ci); + params_sf.add_output(scalar_ci); + params_sf.add_output(scalar_ci); + params_sf.add_scalar(scale_value); + PTO2ManualSubmitResult sf_outs = + pto2_rt_submit_aiv_task_manual(FUNC_SOFTMAX_PREPARE, params_sf); + const 
Tensor &pij_f16 = sf_outs.outputs.get_ref(0); + const Tensor &mi = sf_outs.outputs.get_ref(1); + const Tensor &li = sf_outs.outputs.get_ref(2); + + Arg params_pv; + params_pv.add_input(pij_f16); + params_pv.add_input(vj); + params_pv.add_output(tile2d_ci); + PTO2ManualSubmitResult pv_outs = pto2_rt_submit_aic_task_manual(FUNC_PV_MATMUL, params_pv); + const Tensor &oi_tmp = pv_outs.outputs.get_ref(0); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0; + + Arg params_up; + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_tmp); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_inout(out_view); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + PTO2ManualSubmitResult up_outs = pto2_rt_submit_aiv_task_manual(FUNC_ONLINE_UPDATE, params_up); + + pto2_rt_add_dependency(qk_outs.task_id, sf_outs.task_id); + pto2_rt_add_dependency(sf_outs.task_id, pv_outs.task_id); + pto2_rt_add_dependency(sf_outs.task_id, up_outs.task_id); + pto2_rt_add_dependency(pv_outs.task_id, up_outs.task_id); + } + } + } + } + } +} + +} // extern "C" diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py index 89ab84199..76ae16f18 100644 --- a/examples/scripts/run_example.py +++ b/examples/scripts/run_example.py @@ -21,12 +21,12 @@ Examples: # Run hardware example (requires Ascend device) - python examples/scripts/run_example.py -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py + python examples/scripts/run_example.py -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py # Run simulation example (no hardware required) - python examples/scripts/run_example.py -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + python 
examples/scripts/run_example.py -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3sim # Run with specific device diff --git a/src/a2a3/docs/runtimes.md b/src/a2a3/docs/runtimes.md index cb8fcbb1c..7d4a694bd 100644 --- a/src/a2a3/docs/runtimes.md +++ b/src/a2a3/docs/runtimes.md @@ -8,13 +8,13 @@ Three runtime implementations live under `src/a2a3/runtime/`, each providing a d | ------- | ---------------- | ----------------- | ------------------------ | | Graph built on | Host CPU | AICPU (device) | AICPU (device) | | Task storage | Fixed `Task[]` array | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) | -| Dependencies | Explicit edges | Explicit edges | Auto-derived via TensorMap | +| Dependencies | Explicit edges | Explicit edges | Auto-derived via TensorMap, plus optional manual dependencies | | Memory management | Host-side | Host + device malloc | Ring buffer heap (GM) | | Concurrent build+schedule | No | Optional (`build_mode=1`) | Yes (always) | | Profiling support | Basic | Basic | Multi-level hierarchy | | Batch/streaming | No | No | Yes (flow control, back-pressure) | | Thread model | N scheduler threads | 1 builder + N schedulers | 1 orchestrator + 3 schedulers | -| Use case | Development, debugging | Reduced host-device transfer | Production workloads | +| Use case | Development, debugging | Reduced host-device transfer | Production PTO2 with manual-scope extensions | ## host_build_graph diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 97afd6a4e..18f739264 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -2364,6 +2364,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } } + if (rt != nullptr) { + void* sm = runtime->get_pto2_gm_sm_ptr(); + if (sm != 
nullptr) { + int32_t orch_err = static_cast(sm)->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + DEV_ERROR("Thread %d: Exiting with orchestrator error code=%d", thread_idx, orch_err); + return -1; + } + } + } + return 0; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md index c6d0e3ebd..8f954024a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -204,8 +204,8 @@ Milestone command (device): ```bash python examples/scripts/run_example.py \ - -k tests/st/tensormap_and_ringbuffer/batch_paged_attention/kernels \ - -g tests/st/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ + -k tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels \ + -g tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ -p a2a3 -d 9 ``` diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index cf752ef2d..c54c80603 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -34,6 +34,7 @@ #include // Type headers needed by orchestration +#include "pto_task_id.h" // PTO2TaskId // NOLINT(build/include_subdir) #include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots // NOLINT(build/include_subdir) #include "pto_types.h" // Arg, TaskOutputTensors, TensorArgType // NOLINT(build/include_subdir) #include "task_args.h" // ChipStorageTaskArgs, ContinuousTensor // NOLINT(build/include_subdir) @@ -84,6 +85,16 @@ inline Tensor from_tensor_arg(const ContinuousTensor &t, bool manual_dep = false // Ops Table and Opaque Runtime // 
============================================================================= +enum class PTO2ScopeMode : uint8_t { + AUTO = 0, + MANUAL = 1, +}; + +struct PTO2ManualSubmitResult { + PTO2TaskId task_id; + TaskOutputTensors outputs; +}; + /** * Forward declaration — the orchestration sees PTO2Runtime as a partial * struct whose first field is the ops pointer. The full definition @@ -115,7 +126,9 @@ void pto2_framework_bind_runtime(PTO2Runtime *rt); */ typedef struct PTO2RuntimeOps { TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); - void (*scope_begin)(PTO2Runtime *rt); + PTO2ManualSubmitResult (*submit_task_manual)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); + void (*add_dependency)(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer); + void (*scope_begin)(PTO2Runtime *rt, PTO2ScopeMode mode); void (*scope_end)(PTO2Runtime *rt); void (*orchestration_done)(PTO2Runtime *rt); bool (*is_fatal)(PTO2Runtime *rt); @@ -179,12 +192,16 @@ static inline TaskOutputTensors alloc_tensors(const CIs &...cis) { always_assert(!args.has_error && "alloc_tensors failed to construct output-only Arg"); return alloc_tensors(args); } - static inline TaskOutputTensors pto2_rt_submit_task(const MixedKernels &mixed_kernels, const Arg &args) { PTO2Runtime *rt = pto2_current_runtime(); return rt->ops->submit_task(rt, mixed_kernels, args); } +static inline PTO2ManualSubmitResult pto2_rt_submit_task_manual(const MixedKernels &mixed_kernels, const Arg &args) { + PTO2Runtime *rt = pto2_current_runtime(); + return rt->ops->submit_task_manual(rt, mixed_kernels, args); +} + /** * Convenience wrapper: submit an AIC-only task. 
*/ @@ -205,9 +222,28 @@ static inline TaskOutputTensors pto2_rt_submit_aiv_task(int32_t kernel_id, const return rt->ops->submit_task(rt, mk, args); } -static inline void pto2_rt_scope_begin() { +static inline PTO2ManualSubmitResult pto2_rt_submit_aic_task_manual(int32_t kernel_id, const Arg &args) { + PTO2Runtime *rt = pto2_current_runtime(); + MixedKernels mk; + mk.aic_kernel_id = kernel_id; + return rt->ops->submit_task_manual(rt, mk, args); +} + +static inline PTO2ManualSubmitResult pto2_rt_submit_aiv_task_manual(int32_t kernel_id, const Arg &args) { + PTO2Runtime *rt = pto2_current_runtime(); + MixedKernels mk; + mk.aiv0_kernel_id = kernel_id; + return rt->ops->submit_task_manual(rt, mk, args); +} + +static inline void pto2_rt_add_dependency(PTO2TaskId producer, PTO2TaskId consumer) { + PTO2Runtime *rt = pto2_current_runtime(); + rt->ops->add_dependency(rt, producer, consumer); +} + +static inline void pto2_rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) { PTO2Runtime *rt = pto2_current_runtime(); - rt->ops->scope_begin(rt); + rt->ops->scope_begin(rt, mode); } static inline void pto2_rt_scope_end() { @@ -300,9 +336,9 @@ static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const u */ class PTO2ScopeGuard { public: // NOLINT(whitespace/indent) - PTO2ScopeGuard() : + explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) : rt_(pto2_current_runtime()) { - rt_->ops->scope_begin(rt_); + rt_->ops->scope_begin(rt_, mode); } ~PTO2ScopeGuard() { rt_->ops->scope_end(rt_); } @@ -313,7 +349,8 @@ class PTO2ScopeGuard { #define _PTO2_CONCATENATE_IMPL(x, y) x##y #define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y) -#define PTO2_SCOPE_GUARD() [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__) +#define PTO2_SCOPE_GUARD(...) 
\ + [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__) { __VA_ARGS__ } /** * Scoped block macro: @@ -321,7 +358,7 @@ class PTO2ScopeGuard { * pto2_rt_submit_task(...); * } */ -#define PTO2_SCOPE() if (PTO2_SCOPE_GUARD(); true) +#define PTO2_SCOPE(...) if (PTO2_SCOPE_GUARD(__VA_ARGS__); true) // ============================================================================= // Orchestration Config diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index defc1ec49..0935f40ea 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -195,7 +195,7 @@ static bool pto2_append_fanin_or_fail( return true; } -static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state); +static bool scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state); struct PTO2OutputLayout { uint64_t offsets[MAX_TENSOR_ARGS] = {}; @@ -345,7 +345,6 @@ bool pto2_orchestrator_init( sm_handle->task_descriptors[r], sm_handle->header->rings[r].task_window_size, &fc.current_task_index, &fc.last_task_alive, ring_heap_base, heap_size, &sm_handle->header->orch_error_code ); - size_t fanin_pool_bytes = PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); PTO2FaninSpillEntry *fanin_entries = @@ -407,6 +406,7 @@ bool pto2_orchestrator_init( orch->scope_tasks_capacity = init_cap; orch->scope_stack_top = -1; orch->scope_stack_capacity = max_depth; + orch->manual_scope_active = false; return true; } @@ -435,26 +435,96 @@ void pto2_orchestrator_set_scheduler(PTO2OrchestratorState *orch, PTO2SchedulerS // Scope Management // ============================================================================= -static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState 
*task_slot_state) { +static bool scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) { + if (orch->fatal) { + return false; + } if (orch->scope_tasks_size >= orch->scope_tasks_capacity) { int32_t new_cap = orch->scope_tasks_capacity * 2; PTO2TaskSlotState **new_buf = reinterpret_cast(realloc(orch->scope_tasks, new_cap * sizeof(PTO2TaskSlotState *))); - assert(new_buf && "Failed to grow scope task buffer"); + if (new_buf == nullptr) { + LOG_ERROR("Failed to grow scope task buffer to %d entries", new_cap); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_OUT_OF_MEMORY, std::memory_order_release); + orch->fatal = true; + return false; + } orch->scope_tasks = new_buf; orch->scope_tasks_capacity = new_cap; } orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state; + return true; +} + +static bool in_manual_scope(const PTO2OrchestratorState *orch) { + return orch->manual_scope_active; +} + +static int32_t current_manual_scope_begin(const PTO2OrchestratorState *orch) { + return orch->scope_begins[orch->scope_stack_top]; +} + +static int32_t find_current_manual_scope_task_index(const PTO2OrchestratorState *orch, PTO2TaskId task_id) { + if (!in_manual_scope(orch) || !task_id.is_valid()) { + return -1; + } + + int32_t begin = current_manual_scope_begin(orch); + int32_t count = orch->scope_tasks_size - begin; + if (count <= 0) { + return -1; + } + + PTO2TaskSlotState *first_slot_state = orch->scope_tasks[begin]; + if (first_slot_state == nullptr) { + return -1; + } + + PTO2TaskId first_task_id = first_slot_state->task->task_id; + if (first_task_id.ring() != task_id.ring()) { + return -1; + } + + uint32_t first_local = first_task_id.local(); + uint32_t task_local = task_id.local(); + if (task_local < first_local) { + return -1; + } + + uint32_t delta = task_local - first_local; + if (delta >= static_cast(count)) { + return -1; + } + + PTO2TaskSlotState *candidate = orch->scope_tasks[begin + static_cast(delta)]; + if (candidate != 
nullptr && candidate->task->task_id == task_id) { + return static_cast(delta); + } + return -1; +} + +static bool task_owned_by_current_manual_scope(const PTO2OrchestratorState *orch, PTO2TaskId task_id) { + return find_current_manual_scope_task_index(orch, task_id) >= 0; } -void pto2_scope_begin(PTO2OrchestratorState *orch) { +void pto2_scope_begin(PTO2OrchestratorState *orch, PTO2ScopeMode mode) { if (orch->fatal) { return; } assert(orch->scope_stack_top < static_cast(orch->scope_stack_capacity - 1) && "Scope stack overflow"); + if (in_manual_scope(orch)) { + LOG_ERROR( + "nested PTO2_SCOPE(PTO2ScopeMode::MANUAL) is not supported in v1; manual scope inside manual scope is not supported" + ); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return; + } + ++orch->scope_stack_top; orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size; + orch->manual_scope_active = (mode == PTO2ScopeMode::MANUAL); } void pto2_scope_end(PTO2OrchestratorState *orch) { @@ -467,15 +537,63 @@ void pto2_scope_end(PTO2OrchestratorState *orch) { uint64_t _se0 = get_sys_cnt_aicpu(); #endif - int32_t begin = orch->scope_begins[orch->scope_stack_top--]; + int32_t top = orch->scope_stack_top; + int32_t begin = orch->scope_begins[top]; int32_t count = orch->scope_tasks_size - begin; + bool manual_scope = orch->manual_scope_active; + + if (!manual_scope) { + orch->scope_stack_top--; + orch->manual_scope_active = false; + + if (orch->scheduler && count > 0) { + orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count); + } + + orch->scope_tasks_size = begin; + +#if PTO2_ORCH_PROFILING + uint64_t _se1 = get_sys_cnt_aicpu(); + g_orch_scope_end_cycle += (_se1 - _se0); +#endif + return; + } if (orch->scheduler && count > 0) { - orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count); + int32_t dep_pool_mark_prefix = 0; + for (int32_t task_idx = 0; task_idx < count; task_idx++) { + PTO2TaskSlotState 
*slot_state = orch->scope_tasks[begin + task_idx]; + PTO2TaskPayload *payload = slot_state->payload; + if (payload->fanin_actual_count > PTO2_MAX_INPUTS) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Dependency Overflow Detected!"); + LOG_ERROR("========================================"); + LOG_ERROR("Task requires more than PTO2_MAX_INPUTS unique fanin dependencies."); + LOG_ERROR(" task_id.raw: %" PRIu64, slot_state->task->task_id.raw); + LOG_ERROR(" fanin_count: %d / %d", payload->fanin_actual_count, PTO2_MAX_INPUTS); + LOG_ERROR(" reason: manual dependency bookkeeping"); + LOG_ERROR("This is a runtime dependency-tracking limit."); + LOG_ERROR("========================================"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_DEPENDENCY_OVERFLOW, std::memory_order_release); + orch->fatal = true; + return; + } + // add_dependency may allocate dep entries for an older consumer after + // newer tasks were already submitted. Recompute a monotonic dep-pool + // watermark at publish time so tail reclamation still advances safely. 
+ if (slot_state->dep_pool_mark < dep_pool_mark_prefix) { + slot_state->dep_pool_mark = dep_pool_mark_prefix; + } else { + dep_pool_mark_prefix = slot_state->dep_pool_mark; + } + } + orch->scheduler->publish_manual_scope_tasks_and_end_scope(&orch->scope_tasks[begin], count); } // Rewind the task buffer — these entries are no longer needed orch->scope_tasks_size = begin; + orch->scope_stack_top--; + orch->manual_scope_active = false; #if PTO2_ORCH_PROFILING uint64_t _se1 = get_sys_cnt_aicpu(); @@ -487,8 +605,11 @@ void pto2_scope_end(PTO2OrchestratorState *orch) { // ============================================================================= // Task Submission // ============================================================================= -TaskOutputTensors -pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args) { +template +static TaskOutputTensors pto2_submit_mixed_task_impl( + PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args, + PTO2TaskId *submitted_task_id = nullptr +) { CYCLE_COUNT_START(); TaskOutputTensors result; @@ -520,7 +641,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke always_assert(block_num >= 1 && "block_num must be >= 1"); // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move - // it to the aiv0 slot. This guarantees the dispatch path can always use + // it to the aiv0 slot. This guarantees the dispatch path can always use // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask. // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time. @@ -534,6 +655,18 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke active_mask = pto2_mixed_kernels_to_active_mask(normalized); } + // Submission without an open scope is illegal. 
+ always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); + + if constexpr (!kManualSubmit) { + if (in_manual_scope(orch)) { + LOG_ERROR("PTO2_SCOPE(PTO2ScopeMode::MANUAL) requires pto2_rt_submit_*_manual task APIs"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return result; + } + } + // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1) if (block_num > 1 && args.launch_spec.require_sync_start()) { // Deadlock check: block_num >= total available slots of the required type. @@ -562,9 +695,31 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke int32_t slot = prepared.alloc_result.slot; PTO2FaninBuilder fanin_builder; - fanin_builder.count = 0; - fanin_builder.spill_start = 0; fanin_builder.spill_pool = &orch->rings[ring_id].fanin_pool; + uint64_t manual_local_mask = 0; + bool needs_tensormap_sync = true; + if constexpr (kManualSubmit) { + needs_tensormap_sync = false; + for (int i = 0; i < args.tensor_count(); i++) { + TensorArgType ptype = args.tag(i); + if (ptype == TensorArgType::OUTPUT) { + continue; + } + + const Tensor *tensor = args.tensor(i).ptr; + if (task_owned_by_current_manual_scope(orch, tensor->owner_task_id)) { + manual_local_mask |= static_cast(1ULL << i); + continue; + } + + bool needs_lookup = (ptype == TensorArgType::INPUT || ptype == TensorArgType::INOUT) && !tensor->manual_dep; + bool needs_insert = + (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) && !tensor->manual_dep; + if (needs_lookup || needs_insert) { + needs_tensormap_sync = true; + } + } + } CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw); @@ -576,13 +731,16 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke #endif // === STEP 2: Sync TensorMap validity and optional cleanup === - // Read current last_task_alive 
from shared memory for this ring int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire); - orch->tensor_map.sync_tensormap(ring_id, sm_last_task_alive); + if (!kManualSubmit || needs_tensormap_sync) { + orch->tensor_map.sync_tensormap(ring_id, sm_last_task_alive); + } - if (sched) { - orch->rings[ring_id].dep_pool.reclaim(*sched, ring_id, sm_last_task_alive); + if constexpr (!kManualSubmit) { + if (sched) { + orch->rings[ring_id].dep_pool.reclaim(*sched, ring_id, sm_last_task_alive); + } } CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, task_id.raw); @@ -591,13 +749,16 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke for (int i = 0; i < args.tensor_count(); i++) { TensorArgType ptype = args.tag(i); if (ptype == TensorArgType::OUTPUT) { - // Runtime-created OUTPUT tensors are not looked up in the TensorMap since they have no dependencies. continue; } const Tensor *tensor = args.tensor(i).ptr; + if constexpr (kManualSubmit) { + if ((manual_local_mask & static_cast(1ULL << i)) != 0) { + continue; + } + } - // Step A: creator retention — all existing tensors extend their creator lifetime. PTO2TaskId owner = tensor->owner_task_id; if (owner.is_valid() && sched != nullptr) { PTO2TaskSlotState *prod_state = @@ -609,7 +770,6 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke } } - // Step B: only INPUT/INOUT need modifier dependency lookup. 
if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) { continue; } @@ -625,7 +785,8 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto overlap_status = lookup_result.entries[r].overlap_status; auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); - PTO2TaskSlotState *prod_state = &sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local); + PTO2TaskSlotState *prod_state = + &sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local); if (!pto2_append_fanin_or_fail( orch, task_id, i, ptype, prod_state, &fanin_builder, sched, fc, ring_id, "overlap lookup" )) { @@ -640,14 +801,17 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, task_id.raw); // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) === - { - for (int i = 0; i < args.tensor_count(); i++) { - TensorArgType ptype = args.tag(i); - if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) { - if (!args.tensor(i).ptr->manual_dep) { - orch->tensor_map.insert(*args.tensor(i).ptr, task_id); + for (int i = 0; i < args.tensor_count(); i++) { + TensorArgType ptype = args.tag(i); + if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) { + if constexpr (kManualSubmit) { + if ((manual_local_mask & static_cast(1ULL << i)) != 0) { + continue; } } + if (!args.tensor(i).ptr->manual_dep) { + orch->tensor_map.insert(*args.tensor(i).ptr, task_id); + } } } @@ -705,55 +869,112 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke cur_slot_state.block_num = block_num; cur_slot_state.next_block_idx = 0; - auto &dep_pool = orch->rings[ring_id].dep_pool; - int32_t fanin_count = fanin_builder.count; - int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); - int32_t spill_count = 
fanin_count - inline_count; - dep_pool.ensure_space(*sched, fc, ring_id, fanin_count + 1); - - int32_t early_finished = 0; - cur_slot_state.fanin_count = fanin_count + 1; // +1 redundance for not being ready too early - payload->fanin_actual_count = fanin_count; - payload->fanin_spill_start = (spill_count > 0) ? fanin_builder.spill_start : 0; - payload->fanin_spill_pool = (spill_count > 0) ? fanin_builder.spill_pool : nullptr; - for (int i = 0; i < inline_count; i++) { - payload->fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; - } - pto2_for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot) { - PTO2TaskSlotState &producer_slot_state = *producer_slot; + if constexpr (kManualSubmit) { + int32_t fanin_count = fanin_builder.count; + int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); + int32_t spill_count = fanin_count - inline_count; + payload->fanin_actual_count = fanin_count; + payload->fanin_spill_start = (spill_count > 0) ? fanin_builder.spill_start : 0; + payload->fanin_spill_pool = (spill_count > 0) ? 
fanin_builder.spill_pool : nullptr; + for (int i = 0; i < inline_count; i++) { + payload->fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + + auto &dep_pool = orch->rings[ring_id].dep_pool; + dep_pool.ensure_space(*sched, fc, ring_id, fanin_count); + + int32_t early_finished = 0; + bool fanout_ok = fanin_builder.for_each([&](PTO2TaskSlotState *producer_slot) { + PTO2TaskSlotState &producer_slot_state = *producer_slot; #if PTO2_ORCH_PROFILING - pto2_fanout_lock(producer_slot_state, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); + pto2_fanout_lock(producer_slot_state, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); #else - pto2_fanout_lock(producer_slot_state); + pto2_fanout_lock(producer_slot_state); #endif - producer_slot_state.fanout_count += 1; - int32_t prod_state = producer_slot_state.task_state.load(std::memory_order_acquire); - if (prod_state >= PTO2_TASK_COMPLETED) { - early_finished++; - } else { - producer_slot_state.fanout_head = dep_pool.prepend(producer_slot_state.fanout_head, &cur_slot_state); + producer_slot_state.fanout_count += 1; + int32_t prod_state = producer_slot_state.task_state.load(std::memory_order_acquire); + if (prod_state >= PTO2_TASK_COMPLETED) { + early_finished++; + } else { + producer_slot_state.fanout_head = + dep_pool.prepend(producer_slot_state.fanout_head, &cur_slot_state); + if (producer_slot_state.fanout_head == nullptr) { + pto2_fanout_unlock(producer_slot_state); + orch->fatal = true; + return false; + } + } + pto2_fanout_unlock(producer_slot_state); + return true; + }); + if (!fanout_ok) { + return result; } - pto2_fanout_unlock(producer_slot_state); - }); - // Combined release: merge early_finished batch with the +1 init release - // into a single atomic fetch_add (saves one acq_rel cache-line bounce per task). 
- int32_t initial_refcount = early_finished + 1; // +1 for the init release - int32_t new_rc = - cur_slot_state.fanin_refcount.fetch_add(initial_refcount, std::memory_order_acq_rel) + initial_refcount; - if (new_rc >= fanin_count + 1) { - PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); - sched->ready_queues[static_cast(shape)].push(&cur_slot_state); - } - // Record dep pool watermark in local slot state (used by tail reclamation) - cur_slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top; + cur_slot_state.fanin_count = fanin_count + 1; + if (early_finished > 0) { + cur_slot_state.fanin_refcount.fetch_add(early_finished, std::memory_order_acq_rel); + } + cur_slot_state.dep_pool_mark = dep_pool.top; #if PTO2_ORCH_PROFILING - // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics - // Lock atomics (loads + CAS) are counted inside pto2_fanout_lock - g_orch_fanin_atomic_count += fanin_count * 3; - if (early_finished > 0) { - g_orch_fanin_atomic_count += 1; // fanin_refcount.fetch_add - } + g_orch_fanin_atomic_count += fanin_count * 3; + if (early_finished > 0) { + g_orch_fanin_atomic_count += 1; // fanin_refcount.fetch_add + } +#endif + } else { + auto &dep_pool = orch->rings[ring_id].dep_pool; + int32_t fanin_count = fanin_builder.count; + int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); + int32_t spill_count = fanin_count - inline_count; + dep_pool.ensure_space(*sched, fc, ring_id, fanin_count + 1); + + int32_t early_finished = 0; + cur_slot_state.fanin_count = fanin_count + 1; // +1 redundance for not being ready too early + payload->fanin_actual_count = fanin_count; + payload->fanin_spill_start = (spill_count > 0) ? fanin_builder.spill_start : 0; + payload->fanin_spill_pool = (spill_count > 0) ? 
fanin_builder.spill_pool : nullptr; + for (int i = 0; i < inline_count; i++) { + payload->fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + pto2_for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot) { + PTO2TaskSlotState &producer_slot_state = *producer_slot; +#if PTO2_ORCH_PROFILING + pto2_fanout_lock(producer_slot_state, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); +#else + pto2_fanout_lock(producer_slot_state); #endif + // Normal path: prepend consumer to producer's fanout list + producer_slot_state.fanout_count += 1; + int32_t prod_state = producer_slot_state.task_state.load(std::memory_order_acquire); + if (prod_state >= PTO2_TASK_COMPLETED) { + // Early return optimization: if producer already completed, we can skip adding dependency and + // directly decrement fanin_count + early_finished++; + } else { + producer_slot_state.fanout_head = + dep_pool.prepend(producer_slot_state.fanout_head, &cur_slot_state); + } + pto2_fanout_unlock(producer_slot_state); + return true; + }); + int32_t initial_refcount = early_finished + 1; // +1 for the init release + int32_t new_rc = + cur_slot_state.fanin_refcount.fetch_add(initial_refcount, std::memory_order_acq_rel) + initial_refcount; + if (new_rc >= fanin_count + 1) { + PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); + sched->ready_queues[static_cast(shape)].push(&cur_slot_state); + } + // Record dep pool watermark in local slot state (used by tail reclamation) + cur_slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top; +#if PTO2_ORCH_PROFILING + // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics + // Lock atomics (loads + CAS) are counted inside pto2_fanout_lock + g_orch_fanin_atomic_count += fanin_count * 3; + if (early_finished > 0) { + g_orch_fanin_atomic_count += 1; // fanin_refcount.fetch_add + } +#endif + } } CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, task_id.raw); @@ -765,6 +986,9 
@@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke #endif g_orch_submit_idx++; #endif + if (submitted_task_id != nullptr) { + *submitted_task_id = task_id; + } return result; } @@ -853,6 +1077,144 @@ TaskOutputTensors pto2_alloc_tensors(PTO2OrchestratorState *orch, const Arg &arg return outputs; } +TaskOutputTensors +pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args) { + if (in_manual_scope(orch)) { + LOG_ERROR("PTO2_SCOPE(PTO2ScopeMode::MANUAL) requires pto2_rt_submit_*_manual task APIs"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return {}; + } + return pto2_submit_mixed_task_impl(orch, mixed_kernels, args); +} + +PTO2ManualSubmitResult +pto2_submit_mixed_task_manual(PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args) { + PTO2ManualSubmitResult result{}; + if (!in_manual_scope(orch)) { + LOG_ERROR("manual submit APIs require PTO2_SCOPE(PTO2ScopeMode::MANUAL)"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return result; + } + PTO2TaskId task_id = PTO2TaskId::invalid(); + TaskOutputTensors outputs = pto2_submit_mixed_task_impl(orch, mixed_kernels, args, &task_id); + if (orch->fatal || !task_id.is_valid()) { + return result; + } + result.task_id = task_id; + result.outputs = outputs; + return result; +} + +void pto2_add_dependency(PTO2OrchestratorState *orch, PTO2TaskId producer_id, PTO2TaskId consumer_id) { + if (orch->fatal) { + return; + } + + if (!in_manual_scope(orch)) { + LOG_ERROR("pto2_rt_add_dependency is only valid inside PTO2_SCOPE(PTO2ScopeMode::MANUAL)"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return; + } + if (producer_id == consumer_id) { + LOG_ERROR("add_dependency does not allow 
self-dependency"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return; + } + int32_t producer_idx = find_current_manual_scope_task_index(orch, producer_id); + int32_t consumer_idx = find_current_manual_scope_task_index(orch, consumer_id); + if (producer_idx < 0 || consumer_idx < 0) { + LOG_ERROR("add_dependency requires producer and consumer to belong to the current manual scope"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch->fatal = true; + return; + } + + PTO2TaskSlotState *producer_slot_state = orch->scope_tasks[current_manual_scope_begin(orch) + producer_idx]; + PTO2TaskSlotState *consumer_slot_state = orch->scope_tasks[current_manual_scope_begin(orch) + consumer_idx]; + PTO2TaskPayload *consumer_payload = consumer_slot_state->payload; + + bool duplicate = false; + pto2_for_each_fanin_slot_state(*consumer_payload, [&](PTO2TaskSlotState *fanin_slot_state) { + if (fanin_slot_state == producer_slot_state) { + duplicate = true; + return false; + } + return true; + }); + if (duplicate) { + return; + } + + if (consumer_payload->fanin_actual_count >= PTO2_MAX_INPUTS) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Dependency Overflow Detected!"); + LOG_ERROR("========================================"); + LOG_ERROR("Task requires more than PTO2_MAX_INPUTS unique fanin dependencies."); + LOG_ERROR(" consumer_id.raw: %" PRIu64, consumer_id.raw); + LOG_ERROR(" fanin_count: %d / %d", consumer_payload->fanin_actual_count + 1, PTO2_MAX_INPUTS); + LOG_ERROR(" reason: explicit add_dependency"); + LOG_ERROR("========================================"); + orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_DEPENDENCY_OVERFLOW, std::memory_order_release); + orch->fatal = true; + return; + } + + auto &dep_pool = orch->rings[producer_slot_state->ring_id].dep_pool; + auto &fc = 
orch->sm_handle->header->rings[producer_slot_state->ring_id].fc; + dep_pool.ensure_space(*orch->scheduler, fc, producer_slot_state->ring_id, 1); + + PTO2FaninBuilder fanin_builder; + fanin_builder.count = consumer_payload->fanin_actual_count; + fanin_builder.spill_start = consumer_payload->fanin_spill_start; + fanin_builder.spill_pool = + (consumer_payload->fanin_spill_pool != nullptr) ? consumer_payload->fanin_spill_pool + : &orch->rings[consumer_slot_state->ring_id].fanin_pool; + int32_t cached_inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP); + for (int32_t i = 0; i < cached_inline_count; i++) { + fanin_builder.inline_slots[i] = consumer_payload->fanin_inline_slot_states[i]; + } + + if (fanin_builder.count < PTO2_FANIN_INLINE_CAP) { + fanin_builder.inline_slots[fanin_builder.count++] = producer_slot_state; + } else { + PTO2FaninPool &fanin_pool = *fanin_builder.spill_pool; + fanin_pool.ensure_space(*orch->scheduler, fc, consumer_slot_state->ring_id, 1); + int32_t spill_idx = fanin_pool.top; + PTO2FaninSpillEntry *entry = fanin_pool.alloc(); + if (entry == nullptr) { + orch->fatal = true; + return; + } + if (fanin_builder.count == PTO2_FANIN_INLINE_CAP) { + fanin_builder.spill_start = spill_idx; + } + entry->slot_state = producer_slot_state; + fanin_builder.count++; + } + + producer_slot_state->fanout_count += 1; + producer_slot_state->fanout_head = dep_pool.prepend(producer_slot_state->fanout_head, consumer_slot_state); + if (producer_slot_state->fanout_head == nullptr) { + orch->fatal = true; + return; + } + + int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP); + int32_t spill_count = fanin_builder.count - inline_count; + consumer_payload->fanin_actual_count = fanin_builder.count; + consumer_payload->fanin_spill_start = (spill_count > 0) ? fanin_builder.spill_start : 0; + consumer_payload->fanin_spill_pool = (spill_count > 0) ? 
fanin_builder.spill_pool : nullptr; + for (int32_t i = 0; i < inline_count; i++) { + consumer_payload->fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + consumer_slot_state->fanin_count += 1; + consumer_slot_state->dep_pool_mark = dep_pool.top; +} + // ============================================================================= // Flow Control // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 9db96eaa1..e3297ada2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -63,8 +63,9 @@ struct PTO2OrchestratorState { int32_t scope_tasks_size; // Number of task IDs currently in the buffer int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks int32_t *scope_begins; // scope_begins[i] = start index of scope i in scope_tasks - int32_t scope_stack_top; // Current top of stack (-1 = no scope open) - uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) + int32_t scope_stack_top; // Current top of stack (-1 = no scope open) + uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) + bool manual_scope_active{false}; // === SCHEDULER REFERENCE === // Note: In simulated mode, orchestrator and scheduler share address space @@ -93,7 +94,6 @@ struct PTO2OrchestratorState { // The executor adds this count into its completed_tasks_ progress counter // after orchestration finishes so shutdown/profiling totals remain closed. 
int64_t inline_completed_tasks{0}; - // === STATISTICS === #if PTO2_PROFILING int64_t tasks_submitted; @@ -151,7 +151,7 @@ void pto2_orchestrator_set_scheduler(PTO2OrchestratorState *orch, PTO2SchedulerS * Tasks submitted while this scope is at the top of the stack are * owned by it and have their fanout_count initialized to 1. */ -void pto2_scope_begin(PTO2OrchestratorState *orch); +void pto2_scope_begin(PTO2OrchestratorState *orch, PTO2ScopeMode mode = PTO2ScopeMode::AUTO); /** * End current scope @@ -190,6 +190,10 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke * task id for scope lifetime and future creator-retention dependencies. */ TaskOutputTensors pto2_alloc_tensors(PTO2OrchestratorState *orch, const Arg &args); +PTO2ManualSubmitResult +pto2_submit_mixed_task_manual(PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args); + +void pto2_add_dependency(PTO2OrchestratorState *orch, PTO2TaskId producer, PTO2TaskId consumer); // ============================================================================= // Flow Control diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 8085ed63d..5c0f837e3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -45,7 +45,17 @@ static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const Arg &args) { return pto2_alloc_tensors(&rt->orchestrator, args); } -void pto2_rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator); } +PTO2ManualSubmitResult pto2_rt_submit_task_manual(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) { + return pto2_submit_mixed_task_manual(&rt->orchestrator, mixed_kernels, args); +} + +void pto2_rt_add_dependency(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer) { + 
pto2_add_dependency(&rt->orchestrator, producer, consumer); +} + +void pto2_rt_scope_begin(PTO2Runtime *rt, PTO2ScopeMode mode) { + pto2_scope_begin(&rt->orchestrator, mode); +} void pto2_rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); } @@ -53,6 +63,20 @@ void pto2_rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->o static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; } +static bool in_manual_scope_runtime(PTO2Runtime *rt) { + return rt->orchestrator.manual_scope_active; +} + +static void fail_manual_tensor_access(PTO2Runtime *rt, const char *caller) { + PTO2OrchestratorState &orch = rt->orchestrator; + orch.sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); + orch.fatal = true; + unified_log_error( + caller, + "blocking tensor data access is not supported inside PTO2_SCOPE(PTO2ScopeMode::MANUAL); exit the manual scope first" + ); +} + // Wait for all producers of this tensor to be safe for data access. // Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers). // For reads: wait until each producer COMPLETED (done writing). @@ -137,6 +161,10 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa MAYBE_UNINITIALIZED_END uint64_t pto2_get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) { + if (in_manual_scope_runtime(rt)) { + fail_manual_tensor_access(rt, __FUNCTION__); + return 0; + } if (tensor.buffer.addr == 0) { unified_log_error( __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). 
" @@ -160,6 +188,10 @@ uint64_t pto2_get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t nd void pto2_set_tensor_data( PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value ) { + if (in_manual_scope_runtime(rt)) { + fail_manual_tensor_access(rt, __FUNCTION__); + return; + } if (tensor.buffer.addr == 0) { unified_log_error( __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). " @@ -181,6 +213,8 @@ void pto2_set_tensor_data( static const PTO2RuntimeOps s_runtime_ops = { .submit_task = submit_task_impl, + .submit_task_manual = pto2_rt_submit_task_manual, + .add_dependency = pto2_rt_add_dependency, .scope_begin = pto2_rt_scope_begin, .scope_end = pto2_rt_scope_end, .orchestration_done = pto2_rt_orchestration_done, diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 779b75143..436ad091a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -67,7 +67,9 @@ typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures struct PTO2RuntimeOps { TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); - void (*scope_begin)(PTO2Runtime *rt); + PTO2ManualSubmitResult (*submit_task_manual)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); + void (*add_dependency)(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer); + void (*scope_begin)(PTO2Runtime *rt, PTO2ScopeMode mode); void (*scope_end)(PTO2Runtime *rt); void (*orchestration_done)(PTO2Runtime *rt); bool (*is_fatal)(PTO2Runtime *rt); @@ -176,7 +178,7 @@ void pto2_runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode); * bounded by the scope. When scope_end() is called, the scope * releases its reference to all enclosed tasks. 
*/ -void pto2_rt_scope_begin(PTO2Runtime *rt); +void pto2_rt_scope_begin(PTO2Runtime *rt, PTO2ScopeMode mode = PTO2ScopeMode::AUTO); /** * End current scope @@ -186,6 +188,10 @@ void pto2_rt_scope_begin(PTO2Runtime *rt); */ void pto2_rt_scope_end(PTO2Runtime *rt); +PTO2ManualSubmitResult pto2_rt_submit_task_manual(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); + +void pto2_rt_add_dependency(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer); + /** * Mark orchestration as complete * diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 247f09fed..4654656c1 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -80,6 +80,7 @@ #define PTO2_ERROR_DEP_POOL_OVERFLOW 4 #define PTO2_ERROR_INVALID_ARGS 5 // Arg construction error (invalid args) #define PTO2_ERROR_DEPENDENCY_OVERFLOW 6 // Too many unique fanin dependencies for one task +#define PTO2_ERROR_OUT_OF_MEMORY 7 // Runtime metadata buffer growth failed // Scheduler errors (100+): detected in scheduler threads #define PTO2_ERROR_SCHEDULER_TIMEOUT 100 @@ -135,6 +136,20 @@ constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL; * TaskId: defined in pto_task_id.h (included above). 
*/ +// ============================================================================= +// Manual Scope Types +// ============================================================================= + +enum class PTO2ScopeMode : uint8_t { + AUTO = 0, + MANUAL = 1, +}; + +struct PTO2ManualSubmitResult { + PTO2TaskId task_id; + TaskOutputTensors outputs; +}; + // ============================================================================= // Worker Types // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 0c3f5a0ff..bd451a221 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -653,6 +653,44 @@ struct PTO2SchedulerState { #endif } + void publish_manual_scope_tasks(PTO2TaskSlotState **task_slot_states, int32_t count) { + for (int32_t i = 0; i < count; i++) { + PTO2TaskSlotState &slot_state = *task_slot_states[i]; + int32_t new_rc = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; + if (new_rc >= slot_state.fanin_count) { + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); + ready_queues[static_cast(shape)].push(&slot_state); + } + } + } + + void publish_manual_scope_tasks_and_end_scope(PTO2TaskSlotState **task_slot_states, int32_t count) { +#if PTO2_ORCH_PROFILING + extern uint64_t g_orch_scope_end_atomic_count; +#endif + if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0); + for (int32_t i = 0; i < count; i++) { + if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0); + PTO2TaskSlotState &slot_state = *task_slot_states[i]; + int32_t new_rc = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; +#if PTO2_ORCH_PROFILING + g_orch_scope_end_atomic_count += 1; // fanin_refcount.fetch_add +#endif + if (new_rc >= 
slot_state.fanin_count) { + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); +#if PTO2_ORCH_PROFILING + g_orch_scope_end_atomic_count += 1; // ready queue push lock/CAS path +#endif + ready_queues[static_cast(shape)].push(&slot_state); + } +#if PTO2_ORCH_PROFILING + release_producer(slot_state, g_orch_scope_end_atomic_count); +#else + release_producer(slot_state); +#endif + } + } + /** * Subtask completion: atomic counter model. * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block. diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md index c6d0e3ebd..9f7d1f68c 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -204,9 +204,9 @@ Milestone command (device): ```bash python examples/scripts/run_example.py \ - -k tests/st/tensormap_and_ringbuffer/batch_paged_attention/kernels \ - -g tests/st/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ - -p a2a3 -d 9 + -k tests/st/a5/tensormap_and_ringbuffer/batch_paged_attention/kernels \ + -g tests/st/a5/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ + -p a5 -d 9 ``` Final validation: diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md b/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md index b56d9774d..b3dd362ce 100644 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md +++ b/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md @@ -58,14 +58,14 @@ Block n: QK -> SF -> PV --+ ```bash # Run on hardware (specify device ID) python examples/scripts/run_example.py \ - -k tests/st/aicpu_build_graph/paged_attention/kernels \ - -g tests/st/aicpu_build_graph/paged_attention/golden.py \ + -k tests/st/a2a3/aicpu_build_graph/paged_attention/kernels \ + -g tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py \ -p a2a3 -d 0 # Run 
multi-block test case PA_CASE=Case2 python examples/scripts/run_example.py \ - -k tests/st/aicpu_build_graph/paged_attention/kernels \ - -g tests/st/aicpu_build_graph/paged_attention/golden.py \ + -k tests/st/a2a3/aicpu_build_graph/paged_attention/kernels \ + -g tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py \ -p a2a3 -d 0 ``` diff --git a/tests/st/a2a3/host_build_graph/paged_attention/README.md b/tests/st/a2a3/host_build_graph/paged_attention/README.md index bb280c331..3524c7e75 100644 --- a/tests/st/a2a3/host_build_graph/paged_attention/README.md +++ b/tests/st/a2a3/host_build_graph/paged_attention/README.md @@ -63,14 +63,14 @@ Block n: QK -> SF -> PV --+ ```bash # Run on hardware (specify device ID) python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ + -k tests/st/a2a3/host_build_graph/paged_attention/kernels \ + -g tests/st/a2a3/host_build_graph/paged_attention/golden.py \ -p a2a3 -d 0 # Run multi-block test case PA_CASE=Case2 python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ + -k tests/st/a2a3/host_build_graph/paged_attention/kernels \ + -g tests/st/a2a3/host_build_graph/paged_attention/golden.py \ -p a2a3 -d 0 ``` diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/golden.py new file mode 100644 index 000000000..0f19662d7 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/golden.py @@ -0,0 +1,35 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- + +import ctypes + +import torch + + +ALL_CASES = { + "NestedManualScope": {"mode": 1}, + "ManualGetTensorData": {"mode": 2}, + "ManualSetTensorData": {"mode": 3}, + "ManualSelfDependency": {"mode": 4}, +} + +DEFAULT_CASE = "NestedManualScope" +__outputs__ = ["tensor"] + + +def generate_inputs(params: dict) -> list: + tensor = torch.arange(16, dtype=torch.float32) + return [ + ("tensor", tensor), + ("mode", ctypes.c_uint64(params["mode"])), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + del tensors, params diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/kernels/kernel_config.py new file mode 100644 index 000000000..358bf59fb --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/kernels/kernel_config.py @@ -0,0 +1,36 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- + +from pathlib import Path + +from task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_KERNELS_ROOT = Path(__file__).parent +_SCALAR_DATA_ROOT = _KERNELS_ROOT.parents[1] / "scalar_data_test" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "manual_scope_guard_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN], +} + +KERNELS = [ + { + "func_id": 0, + "source": str(_SCALAR_DATA_ROOT / "aiv" / "kernel_noop.cpp"), + "core_type": "aiv", + "signature": [], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 3, +} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/kernels/orchestration/manual_scope_guard_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/kernels/orchestration/manual_scope_guard_orch.cpp new file mode 100644 index 000000000..f9a37ddd7 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_guard_negative/kernels/orchestration/manual_scope_guard_orch.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_NOOP 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config( + const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 2, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry( + const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + (void)orch_thread_index; // NOLINT(readability/casting) + + Tensor tensor = from_tensor_arg(orch_args.tensor(0)); + uint64_t mode = orch_args.scalar(0); + uint32_t idx[1] = {0}; + + switch (mode) { + case 1: + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + PTO2_SCOPE(PTO2ScopeMode::MANUAL) {} + } + break; + case 2: + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + (void)get_tensor_data(tensor, 1, idx); // NOLINT(readability/casting) + } + break; + case 3: + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { set_tensor_data(tensor, 1, idx, 1.0f); } + break; + case 4: + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + PTO2TaskId invalid = PTO2TaskId::invalid(); + pto2_rt_add_dependency(invalid, invalid); + } + break; + default: + PTO2_SCOPE() {} + break; + } +} +} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/golden.py new file mode 100644 index 000000000..019102803 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/golden.py @@ -0,0 +1,46 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- + +import torch + + +__outputs__ = ["out", "result", "check"] + +RTOL = 1e-5 +ATOL = 1e-5 + + +def generate_inputs(params: dict) -> list: + del params + size = 128 * 128 + a = torch.full((size,), 1.0, dtype=torch.float32) + b = torch.full((size,), 2.0, dtype=torch.float32) + out = torch.zeros(size, dtype=torch.float32) + result = torch.zeros(size, dtype=torch.float32) + check = torch.zeros(4, dtype=torch.float32) + return [ + ("a", a), + ("b", b), + ("out", out), + ("result", result), + ("check", check), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + del params + out = torch.as_tensor(tensors["out"]) + result = torch.as_tensor(tensors["result"]) + check = torch.as_tensor(tensors["check"]) + + out.fill_(5.0) + result.fill_(7.0) + check[0] = 5.0 + check[1] = 7.0 + check[2] = 5.0 diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/kernels/kernel_config.py new file mode 100644 index 000000000..81bbb5465 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/kernels/kernel_config.py @@ -0,0 +1,36 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- + +from pathlib import Path + +from task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_KERNELS_ROOT = Path(__file__).parent +_SCALAR_DATA_ROOT = _KERNELS_ROOT.parents[1] / "scalar_data_test" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "manual_scope_outer_multiwrite_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.OUT, D.OUT], +} + +KERNELS = [ + { + "func_id": 0, + "source": str(_SCALAR_DATA_ROOT / "aiv" / "kernel_add.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 3, +} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/kernels/orchestration/manual_scope_outer_multiwrite_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/kernels/orchestration/manual_scope_outer_multiwrite_orch.cpp new file mode 100644 index 000000000..7d363f9b2 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/manual_scope_outer_multiwrite/kernels/orchestration/manual_scope_outer_multiwrite_orch.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_ADD 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config( + const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 5, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry( + const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + (void)orch_thread_index; // NOLINT(readability/casting) + + Tensor ext_a = from_tensor_arg(orch_args.tensor(0)); + Tensor ext_b = from_tensor_arg(orch_args.tensor(1)); + Tensor ext_out = from_tensor_arg(orch_args.tensor(2)); + Tensor ext_result = from_tensor_arg(orch_args.tensor(3)); + Tensor ext_check = from_tensor_arg(orch_args.tensor(4)); + + uint32_t size = orch_args.tensor(0).shapes[0]; + uint32_t inter_shapes[1] = {size}; + TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32); + + PTO2_SCOPE() { + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + Arg tmp0_args; + tmp0_args.add_input(ext_a); + tmp0_args.add_input(ext_a); + tmp0_args.add_output(inter_ci); + PTO2ManualSubmitResult tmp0 = pto2_rt_submit_aiv_task_manual(FUNC_ADD, tmp0_args); + + Arg write0_args; + write0_args.add_input(tmp0.outputs.get_ref(0)); + write0_args.add_input(ext_a); + write0_args.add_output(ext_out); + PTO2ManualSubmitResult write0 
= pto2_rt_submit_aiv_task_manual(FUNC_ADD, write0_args); + pto2_rt_add_dependency(tmp0.task_id, write0.task_id); + + Arg tmp1_args; + tmp1_args.add_input(ext_b); + tmp1_args.add_input(ext_b); + tmp1_args.add_output(inter_ci); + PTO2ManualSubmitResult tmp1 = pto2_rt_submit_aiv_task_manual(FUNC_ADD, tmp1_args); + + Arg write1_args; + write1_args.add_input(tmp1.outputs.get_ref(0)); + write1_args.add_input(ext_a); + write1_args.add_output(ext_out); + PTO2ManualSubmitResult write1 = pto2_rt_submit_aiv_task_manual(FUNC_ADD, write1_args); + pto2_rt_add_dependency(tmp1.task_id, write1.task_id); + pto2_rt_add_dependency(write0.task_id, write1.task_id); + } + + Arg consumer_args; + consumer_args.add_input(ext_out); + consumer_args.add_input(ext_b); + consumer_args.add_output(ext_result); + pto2_rt_submit_aiv_task(FUNC_ADD, consumer_args); + + uint32_t idx0[1] = {0}; + uint32_t idx100[1] = {100}; + + float out0 = get_tensor_data(ext_out, 1, idx0); + float result0 = get_tensor_data(ext_result, 1, idx0); + float out100 = get_tensor_data(ext_out, 1, idx100); + + idx0[0] = 0; + set_tensor_data(ext_check, 1, idx0, out0); + idx0[0] = 1; + set_tensor_data(ext_check, 1, idx0, result0); + idx0[0] = 2; + set_tensor_data(ext_check, 1, idx0, out100); + } +} +} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/golden.py new file mode 100644 index 000000000..89df96225 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/golden.py @@ -0,0 +1,7 @@ +from pathlib import Path +import sys + +_BASE = Path(__file__).resolve().parents[1] / "paged_attention" +sys.path.insert(0, str(_BASE)) + +from golden import ALL_CASES, ATOL, DEFAULT_CASE, RTOL, __outputs__, compute_golden, generate_inputs # noqa: E402,F401 diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/aic/aic_hub.cpp 
b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/aic/aic_hub.cpp new file mode 100644 index 000000000..0b3062f18 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/aic/aic_hub.cpp @@ -0,0 +1,14 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { (void)args; } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/aiv/aiv_hub.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/aiv/aiv_hub.cpp new file mode 100644 index 000000000..0b3062f18 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/aiv/aiv_hub.cpp @@ -0,0 +1,14 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { (void)args; } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/kernel_config.py new file mode 100644 index 000000000..80765faab --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/kernel_config.py @@ -0,0 +1,71 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from pathlib import Path + +from task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_ROOT = Path(__file__).parent +_PA_KERNELS = _ROOT.parent.parent / "paged_attention" / "kernels" + +ORCHESTRATION = { + "source": str(_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], +} + +KERNELS = [ + { + "func_id": 0, + "name": "QK", + "source": str(_PA_KERNELS / "aic" / "aic_qk_matmul.cpp"), + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": str(_PA_KERNELS / "aic" / "aic_pv_matmul.cpp"), + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 4, + "name": "AIC_HUB", + "source": str(_ROOT / "aic" / "aic_hub.cpp"), + "core_type": "aic", + "signature": [], + }, + { + "func_id": 1, + "name": "SF", + "source": str(_PA_KERNELS / "aiv" / "aiv_softmax_prepare.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": str(_PA_KERNELS / "aiv" / "aiv_online_update.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + { + "func_id": 5, + "name": "AIV_HUB", + "source": str(_ROOT / "aiv" / "aiv_hub.cpp"), + "core_type": "aiv", + "signature": [], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 24, +} diff --git 
a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 000000000..51b2bad7c --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_partial_manual/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + (void)orch_thread_index; // NOLINT(readability/casting) + + uint64_t batch = orch_args.tensor(0).shapes[0]; + uint64_t num_heads = orch_args.tensor(0).shapes[1]; + uint64_t head_dim = orch_args.tensor(0).shapes[2]; + DataType data_type = orch_args.tensor(0).dtype; + uint64_t block_size = orch_args.tensor(1).shapes[1]; + uint64_t block_num = orch_args.tensor(3).shapes[1]; + uint64_t scale_value = orch_args.scalar(0); + + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + + void *query_ptr = orch_args.tensor(0).data_as(); + void *kc_ptr = orch_args.tensor(1).data_as(); + void *vc_ptr = orch_args.tensor(2).data_as(); + void *out_ptr = orch_args.tensor(5).data_as(); + + uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t value_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + 
uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, true); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, true); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, true); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32, true); + + int *host_block_table = orch_args.tensor(3).data_as(); + int *host_context_lens = orch_args.tensor(4).data_as(); + + uint32_t tile2d_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t scalar_shapes[1] = {static_cast(q_tile)}; + uint32_t sij_shapes[2] = {static_cast(q_tile), static_cast(block_size)}; + TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32); + TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); + TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type); + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint64_t cur_seq = host_context_lens[b_idx]; + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE() { + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; + uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; + Tensor qi = query.view(tile2d_shapes, qi_offsets, true); + Tensor out_view = out.view(tile2d_shapes, out_view_offsets, true); + + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + Arg params_inplace; + params_inplace.add_output(tile2d_ci); + params_inplace.add_output(scalar_ci); + params_inplace.add_output(scalar_ci); + PTO2ManualSubmitResult hub_outs = pto2_rt_submit_aiv_task_manual(FUNC_AIV_HUB, params_inplace); + const Tensor &oi = hub_outs.outputs.get_ref(0); + const Tensor &li_update = hub_outs.outputs.get_ref(1); + const Tensor &mi_update = 
hub_outs.outputs.get_ref(2); + PTO2TaskId prev_update_task = hub_outs.task_id; + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + + uint32_t kv_shapes[2] = { + static_cast(block_size), static_cast(head_dim) + }; + uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets, true); + Tensor vj = value_cache.view(kv_shapes, kv_offsets, true); + + Arg params_qk; + params_qk.add_input(qi); + params_qk.add_input(kj); + params_qk.add_output(sij_ci); + PTO2ManualSubmitResult qk_outs = pto2_rt_submit_aic_task_manual(FUNC_QK_MATMUL, params_qk); + const Tensor &sij = qk_outs.outputs.get_ref(0); + + uint32_t sij_valid_shapes[2] = { + static_cast(q_tile), static_cast(valid_len) + }; + uint32_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + + Arg params_sf; + params_sf.add_input(sij_valid); + params_sf.add_output(pij_f16_ci); + params_sf.add_output(scalar_ci); + params_sf.add_output(scalar_ci); + params_sf.add_scalar(scale_value); + PTO2ManualSubmitResult sf_outs = + pto2_rt_submit_aiv_task_manual(FUNC_SOFTMAX_PREPARE, params_sf); + const Tensor &pij_f16 = sf_outs.outputs.get_ref(0); + const Tensor &mi = sf_outs.outputs.get_ref(1); + const Tensor &li = sf_outs.outputs.get_ref(2); + + Arg params_pv; + params_pv.add_input(pij_f16); + params_pv.add_input(vj); + params_pv.add_output(tile2d_ci); + PTO2ManualSubmitResult pv_outs = pto2_rt_submit_aic_task_manual(FUNC_PV_MATMUL, params_pv); + const Tensor &oi_tmp = pv_outs.outputs.get_ref(0); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + + Arg params_up; + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_tmp); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_inout(out_view); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + PTO2ManualSubmitResult up_outs = + pto2_rt_submit_aiv_task_manual(FUNC_ONLINE_UPDATE, params_up); + + pto2_rt_add_dependency(qk_outs.task_id, sf_outs.task_id); + pto2_rt_add_dependency(sf_outs.task_id, pv_outs.task_id); + pto2_rt_add_dependency(sf_outs.task_id, up_outs.task_id); + pto2_rt_add_dependency(pv_outs.task_id, up_outs.task_id); + pto2_rt_add_dependency(prev_update_task, up_outs.task_id); + prev_update_task = up_outs.task_id; + } + } + } + } + } +} + +} // extern "C" diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/golden.py new file mode 100644 index 000000000..d3f3c1ac1 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/golden.py @@ -0,0 +1,8 @@ +from pathlib import Path +import sys + +_BASE = Path(__file__).resolve().parents[1] / "paged_attention_unroll" +sys.path.insert(0, str(_BASE)) + +from golden import ALL_CASES, ATOL, DEFAULT_CASE, RTOL, __outputs__, generate_inputs # noqa: E402,F401 +from paged_attention_golden import compute_golden, run_golden_test # noqa: E402,F401 diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/aic/aic_hub.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/aic/aic_hub.cpp new file mode 100644 index 000000000..0b3062f18 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/aic/aic_hub.cpp @@ -0,0 +1,14 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ 
+#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { (void)args; } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/aiv/aiv_hub.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/aiv/aiv_hub.cpp new file mode 100644 index 000000000..0b3062f18 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/aiv/aiv_hub.cpp @@ -0,0 +1,14 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { (void)args; } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/kernel_config.py new file mode 100644 index 000000000..52b624312 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/kernel_config.py @@ -0,0 +1,72 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +from pathlib import Path + +from task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_ROOT = Path(__file__).parent +_PA_KERNELS = _ROOT.parent.parent / "paged_attention_unroll" / "kernels" + +ORCHESTRATION = { + "source": str(_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], +} + +KERNELS = [ + { + "func_id": 0, + "name": "QK", + "source": str(_PA_KERNELS / "aic" / "aic_qk_matmul.cpp"), + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": str(_PA_KERNELS / "aic" / "aic_pv_matmul.cpp"), + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 4, + "name": "AIC_HUB", + "source": str(_ROOT / "aic" / "aic_hub.cpp"), + "core_type": "aic", + "signature": [], + }, + { + "func_id": 1, + "name": "SF", + "source": str(_PA_KERNELS / "aiv" / "aiv_softmax_prepare.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": str(_PA_KERNELS / "aiv" / "aiv_online_update.cpp"), + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + { + "func_id": 5, + "name": "AIV_HUB", + "source": str(_ROOT / "aiv" / "aiv_hub.cpp"), + "core_type": "aiv", + "signature": [], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 000000000..9cc65b575 --- /dev/null +++ 
b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll_partial_manual/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define N_UNROLL 64 + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + (void)orch_thread_index; // NOLINT(readability/casting) + + uint64_t batch = orch_args.tensor(0).shapes[0]; + uint64_t num_heads = orch_args.tensor(0).shapes[1]; + uint64_t head_dim = orch_args.tensor(0).shapes[2]; + DataType data_type = orch_args.tensor(0).dtype; + uint64_t block_size = orch_args.tensor(1).shapes[1]; + uint64_t 
block_num = orch_args.tensor(3).shapes[1]; + uint64_t scale_value = orch_args.scalar(0); + + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + + void *query_ptr = orch_args.tensor(0).data_as(); + void *kc_ptr = orch_args.tensor(1).data_as(); + void *vc_ptr = orch_args.tensor(2).data_as(); + void *out_ptr = orch_args.tensor(5).data_as(); + + uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t value_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); + + uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; + Tensor block_table = + make_tensor_external(orch_args.tensor(3).data_as(), bt_shapes, 2, DataType::INT32, false); + uint32_t cl_shapes[1] = {static_cast(batch)}; + Tensor context_lens = + make_tensor_external(orch_args.tensor(4).data_as(), cl_shapes, 1, DataType::INT32, false); + + uint32_t oi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t li_shapes[1] = {static_cast(q_tile)}; + TensorCreateInfo tile2d_ci(oi_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_noinit_ci(li_shapes, 1, DataType::FLOAT32, false); + TensorCreateInfo scalar_ci(li_shapes, 1, DataType::FLOAT32); + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint32_t cl_idx[1] = 
{static_cast<uint32_t>(b_idx)}; + uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data(context_lens, 1, cl_idx)); + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE() { + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint32_t qi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)}; + uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0}; + Tensor qi = query.view(qi_shapes, qi_offsets); + uint32_t out_view_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)}; + uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0}; + Tensor out_view = out.view(out_view_shapes, out_view_offsets, true); + + Arg params_inplace; + params_inplace.add_output(tile2d_ci); + params_inplace.add_output(scalar_noinit_ci); + params_inplace.add_output(scalar_noinit_ci); + TaskOutputTensors hub_outs = pto2_rt_submit_aiv_task(FUNC_AIV_HUB, params_inplace); + const Tensor &oi = hub_outs.get_ref(0); + const Tensor &li_update = hub_outs.get_ref(1); + const Tensor &mi_update = hub_outs.get_ref(2); + + Arg params_qk; + Arg params_sf; + Arg params_pv; + Arg params_up; + + for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) { + uint64_t n_blocks = std::min(static_cast<uint64_t>(N_UNROLL), bn_this_batch - bn); + uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size; + uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start); + + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + uint32_t sij_buf_shapes[2] = { + static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size) + }; + TensorCreateInfo sij_buf_ci(sij_buf_shapes, 2, DataType::FLOAT32); + + params_qk.reset(); + params_qk.add_input(qi); + params_qk.add_input(key_cache); + params_qk.add_input(block_table); + params_qk.add_output(sij_buf_ci); + params_qk.add_scalar(n_blocks); + params_qk.add_scalar(b_idx * block_num + bn); + PTO2ManualSubmitResult qk_outs = pto2_rt_submit_aic_task_manual(FUNC_QK_MATMUL, params_qk); + + uint32_t 
pij_buf_shapes[2] = { + static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size) + }; + TensorCreateInfo pij_buf_ci(pij_buf_shapes, 2, data_type); + + params_sf.reset(); + params_sf.add_input(qk_outs.outputs.get_ref(0)); + params_sf.add_output(pij_buf_ci); + params_sf.add_output(scalar_ci); + params_sf.add_output(scalar_ci); + params_sf.add_scalar(scale_value); + params_sf.add_scalar(n_blocks); + params_sf.add_scalar(valid_len_last); + PTO2ManualSubmitResult sf_outs = + pto2_rt_submit_aiv_task_manual(FUNC_SOFTMAX_PREPARE, params_sf); + + params_pv.reset(); + params_pv.add_input(sf_outs.outputs.get_ref(0)); + params_pv.add_input(value_cache); + params_pv.add_input(block_table); + params_pv.add_output(tile2d_ci); + params_pv.add_scalar(n_blocks); + params_pv.add_scalar(b_idx * block_num + bn); + PTO2ManualSubmitResult pv_outs = pto2_rt_submit_aic_task_manual(FUNC_PV_MATMUL, params_pv); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn + n_blocks >= bn_this_batch) ? 1 : 0; + + params_up.reset(); + params_up.add_input(sf_outs.outputs.get_ref(1)); + params_up.add_input(sf_outs.outputs.get_ref(2)); + params_up.add_input(pv_outs.outputs.get_ref(0)); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_inout(out_view); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + PTO2ManualSubmitResult up_outs = pto2_rt_submit_aiv_task_manual(FUNC_ONLINE_UPDATE, params_up); + + pto2_rt_add_dependency(qk_outs.task_id, sf_outs.task_id); + pto2_rt_add_dependency(sf_outs.task_id, pv_outs.task_id); + pto2_rt_add_dependency(sf_outs.task_id, up_outs.task_id); + pto2_rt_add_dependency(pv_outs.task_id, up_outs.task_id); + } + } + } + } + } +} + +} // extern "C" diff --git a/tests/st/a5/host_build_graph/paged_attention/README.md b/tests/st/a5/host_build_graph/paged_attention/README.md index bb280c331..a9edfb51e 100644 --- a/tests/st/a5/host_build_graph/paged_attention/README.md +++ 
b/tests/st/a5/host_build_graph/paged_attention/README.md @@ -63,15 +63,15 @@ Block n: QK -> SF -> PV --+ ```bash # Run on hardware (specify device ID) python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 + -k tests/st/a5/host_build_graph/paged_attention/kernels \ + -g tests/st/a5/host_build_graph/paged_attention/golden.py \ + -p a5 -d 0 # Run multi-block test case PA_CASE=Case2 python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 + -k tests/st/a5/host_build_graph/paged_attention/kernels \ + -g tests/st/a5/host_build_graph/paged_attention/golden.py \ + -p a5 -d 0 ``` ## Directory Structure diff --git a/tests/ut/hardware_test_utils.py b/tests/ut/hardware_test_utils.py new file mode 100644 index 000000000..82d49b91e --- /dev/null +++ b/tests/ut/hardware_test_utils.py @@ -0,0 +1,34 @@ +import os +import re +import subprocess + + +def get_test_device_id(default: str = "0") -> str: + """Pick a hardware test device. + + Respect PTO_TEST_DEVICE_ID when explicitly provided. Otherwise prefer the + lowest-ID NPU that reports no running processes in `npu-smi info`, which is + more stable than blindly defaulting to device 0 on shared machines. 
+ """ + + configured = os.environ.get("PTO_TEST_DEVICE_ID") + if configured: + return configured + + try: + result = subprocess.run( + ["npu-smi", "info"], + capture_output=True, + text=True, + check=False, + ) + except FileNotFoundError: + return default + + if result.returncode != 0: + return default + + free_devices = sorted({int(match) for match in re.findall(r"No running processes found in NPU (\d+)", result.stdout)}) + if free_devices: + return str(free_devices[0]) + return default diff --git a/tests/ut/test_manual_scope_boundary.py b/tests/ut/test_manual_scope_boundary.py new file mode 100644 index 000000000..aa043b313 --- /dev/null +++ b/tests/ut/test_manual_scope_boundary.py @@ -0,0 +1,38 @@ +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +from hardware_test_utils import get_test_device_id + + +PROJECT_ROOT = Path(__file__).parent.parent.parent +RUN_EXAMPLE = PROJECT_ROOT / "examples" / "scripts" / "run_example.py" +KERNELS_DIR = ( + PROJECT_ROOT / "tests" / "st" / "a2a3" / "tensormap_and_ringbuffer" / "manual_scope_outer_multiwrite" / "kernels" +) +GOLDEN = PROJECT_ROOT / "tests" / "st" / "a2a3" / "tensormap_and_ringbuffer" / "manual_scope_outer_multiwrite" / "golden.py" +PTO_ISA_COMMIT = "6622890" + + +@pytest.mark.requires_hardware +@pytest.mark.skipif(not os.getenv("ASCEND_HOME_PATH"), reason="ASCEND_HOME_PATH not set; Ascend toolkit required") +def test_manual_scope_outer_multiwrite_boundary(): + device_id = get_test_device_id() + command = ( + f"source {os.environ['ASCEND_HOME_PATH']}/bin/setenv.bash >/dev/null 2>&1 && " + f"{sys.executable} {RUN_EXAMPLE} --build --silent " + f"-k {KERNELS_DIR} -g {GOLDEN} -p a2a3 -d {device_id} " + f"--clone-protocol https -c {PTO_ISA_COMMIT}" + ) + result = subprocess.run( + ["bash", "-lc", command], + cwd=PROJECT_ROOT, + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 0, result.stdout + result.stderr diff --git 
a/tests/ut/test_manual_scope_guards.py b/tests/ut/test_manual_scope_guards.py new file mode 100644 index 000000000..82f5632df --- /dev/null +++ b/tests/ut/test_manual_scope_guards.py @@ -0,0 +1,83 @@ +import os +import subprocess +import sys +import time +from pathlib import Path + +import pytest + +from hardware_test_utils import get_test_device_id + + +PROJECT_ROOT = Path(__file__).parent.parent.parent +RUN_EXAMPLE = PROJECT_ROOT / "examples" / "scripts" / "run_example.py" +KERNELS_DIR = PROJECT_ROOT / "tests" / "st" / "a2a3" / "tensormap_and_ringbuffer" / "manual_scope_guard_negative" / "kernels" +GOLDEN = PROJECT_ROOT / "tests" / "st" / "a2a3" / "tensormap_and_ringbuffer" / "manual_scope_guard_negative" / "golden.py" +PTO_ISA_COMMIT = "6622890" + + +@pytest.mark.requires_hardware +@pytest.mark.skipif(not os.getenv("ASCEND_HOME_PATH"), reason="ASCEND_HOME_PATH not set; Ascend toolkit required") +@pytest.mark.parametrize( + ("case_name", "expected_message"), + [ + ( + "NestedManualScope", + "manual scope inside manual scope is not supported", + ), + ( + "ManualGetTensorData", + "blocking tensor data access is not supported inside PTO2_SCOPE(PTO2ScopeMode::MANUAL); exit the manual scope first", + ), + ( + "ManualSetTensorData", + "blocking tensor data access is not supported inside PTO2_SCOPE(PTO2ScopeMode::MANUAL); exit the manual scope first", + ), + ( + "ManualSelfDependency", + "add_dependency does not allow self-dependency", + ), + ], +) +def test_manual_scope_guard_failures(case_name, expected_message): + device_id = get_test_device_id() + log_dir = Path.home() / "ascend" / "log" / "debug" / f"device-{device_id}" + if os.getenv("ASCEND_WORK_PATH"): + work_log_dir = Path(os.environ["ASCEND_WORK_PATH"]).expanduser() / "log" / "debug" / f"device-{device_id}" + if work_log_dir.exists(): + log_dir = work_log_dir + before_logs = set(log_dir.glob("*.log")) if log_dir.exists() else set() + command = ( + f"source {os.environ['ASCEND_HOME_PATH']}/bin/setenv.bash 
>/dev/null 2>&1 && " + f"{sys.executable} {RUN_EXAMPLE} --build --silent " + f"-k {KERNELS_DIR} -g {GOLDEN} -p a2a3 -d {device_id} " + f"--case {case_name} --clone-protocol https -c {PTO_ISA_COMMIT}" + ) + result = subprocess.run( + ["bash", "-lc", command], + cwd=PROJECT_ROOT, + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode != 0 + combined_output = result.stdout + result.stderr + + new_log = None + deadline = time.monotonic() + 20 + while time.monotonic() < deadline: + current_logs = set(log_dir.glob("*.log")) if log_dir.exists() else set() + created = current_logs - before_logs + if created: + new_log = max(created, key=lambda path: path.stat().st_mtime) + break + time.sleep(0.5) + + if new_log is None: + logs = list(log_dir.glob("*.log")) if log_dir.exists() else [] + assert logs, "expected a device log for the failed manual-scope case" + new_log = max(logs, key=lambda path: path.stat().st_mtime) + + log_text = new_log.read_text(encoding="utf-8", errors="ignore") + assert expected_message in combined_output or expected_message in log_text diff --git a/tools/README.md b/tools/README.md index 2d807e53c..6d691fbbd 100644 --- a/tools/README.md +++ b/tools/README.md @@ -34,7 +34,7 @@ python3 tools/swimlane_converter.py outputs/perf_swimlane_20260210_143526.json - # 从 kernel_config.py 加载函数名映射 python3 tools/swimlane_converter.py outputs/perf_swimlane_20260210_143526.json \ - -k examples/host_build_graph/paged_attention/kernels/kernel_config.py + -k examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py # 使用指定 device id 自动选择 device log(device-) python3 tools/swimlane_converter.py outputs/perf_swimlane_20260210_143526.json -d 0 @@ -102,8 +102,8 @@ log root 解析顺序: ```bash # 运行测试并启用性能分析 - 测试通过后自动生成 merged_swimlane.json python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k 
examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ --enable-profiling ``` @@ -190,7 +190,7 @@ python3 tools/perf_to_mermaid.py outputs/perf_swimlane_20260210_143526.json -o d # 从 kernel_config.py 加载函数名映射 python3 tools/perf_to_mermaid.py outputs/perf_swimlane_20260210_143526.json \ - -k examples/host_build_graph/paged_attention/kernels/kernel_config.py + -k examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py # 使用紧凑样式(仅显示任务ID和函数名) python3 tools/perf_to_mermaid.py outputs/perf_swimlane_20260210_143526.json --style compact @@ -270,7 +270,7 @@ flowchart TD ### 功能概述 -`benchmark_rounds.sh` 遍历 `EXAMPLES` 数组中配置的测试用例(位于 `tests/st/tensormap_and_ringbuffer/` 下),依次调用 `run_example.py` 运行每个 example,然后从生成的 device log 中提取 `orch_start` / `orch_end` / `sched_end` 时间戳计算每轮 elapsed 时间。 +`benchmark_rounds.sh` 遍历脚本顶部为对应 runtime 配置的测试用例(位于 `tests/st/{arch}/{runtime}/` 下),依次调用 `run_example.py` 运行每个 example,然后从生成的 device log 中提取 `orch_start` / `orch_end` / `sched_end` 时间戳计算每轮 elapsed 时间。 当前预配置的 examples: - `alternating_matmul_add` diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 64b283e81..a3949a2ee 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -21,26 +21,34 @@ RUN_EXAMPLE="$PROJECT_ROOT/examples/scripts/run_example.py" # --- tensormap_and_ringbuffer --- declare -A TMR_EXAMPLE_CASES=( - [alternating_matmul_add]="" - [benchmark_bgemm]="" + [paged_attention]="Case1,Case2" [paged_attention_unroll]="Case1,Case2" - [batch_paged_attention]="" ) TMR_EXAMPLE_ORDER=( - alternating_matmul_add - benchmark_bgemm + paged_attention paged_attention_unroll - batch_paged_attention ) # --- aicpu_build_graph --- declare -A ABG_EXAMPLE_CASES=( + [paged_attention]="Case1,Case2" [paged_attention_unroll]="Case1,Case2" ) ABG_EXAMPLE_ORDER=( + paged_attention paged_attention_unroll ) +# --- tensormap_and_ringbuffer_partial_manual --- +declare -A 
TMR_PARTIAL_MANUAL_EXAMPLE_CASES=( + [paged_attention_partial_manual]="Case1,Case2" + [paged_attention_unroll_partial_manual]="Case1,Case2" +) +TMR_PARTIAL_MANUAL_EXAMPLE_ORDER=( + paged_attention_partial_manual + paged_attention_unroll_partial_manual +) + # --------------------------------------------------------------------------- # Parse arguments # --------------------------------------------------------------------------- @@ -49,6 +57,7 @@ ROUNDS=100 PLATFORM=a2a3 RUNTIME=tensormap_and_ringbuffer VERBOSE=0 +EXAMPLE_FILTER="" EXTRA_ARGS=() while [[ $# -gt 0 ]]; do @@ -69,6 +78,10 @@ while [[ $# -gt 0 ]]; do RUNTIME="$2" shift 2 ;; + -e|--examples) + EXAMPLE_FILTER="$2" + shift 2 + ;; -v|--verbose) VERBOSE=1 shift @@ -78,13 +91,16 @@ while [[ $# -gt 0 ]]; do benchmark_rounds.sh — run all examples and report per-round timing from device logs Usage: - ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-v] + ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-e ] [-v] Options: -p, --platform Platform to run on (default: a2a3) -d, --device Device ID (default: 0) -n, --rounds Override number of rounds for each example (default: 100) - -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default), aicpu_build_graph + -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default), + tensormap_and_ringbuffer_partial_manual, + aicpu_build_graph + -e, --examples Comma-separated example names to run (default: runtime-specific full list) -v, --verbose Save detailed run_example.py output to a timestamped log file -h, --help Show this help @@ -124,7 +140,7 @@ vlog() { # --------------------------------------------------------------------------- # Derive arch from platform and set examples directory # --------------------------------------------------------------------------- -EXAMPLES_DIR="$PROJECT_ROOT/tests/st/${PLATFORM}/${RUNTIME}" +TESTS_RUNTIME_DIR="$RUNTIME" # Clock frequency (MHz) for converting cycle counts to microseconds case "$PLATFORM" in @@ 
-139,16 +155,37 @@ case "$RUNTIME" in declare -n EXAMPLE_CASES=TMR_EXAMPLE_CASES EXAMPLE_ORDER=("${TMR_EXAMPLE_ORDER[@]}") ;; + tensormap_and_ringbuffer_partial_manual) + TESTS_RUNTIME_DIR="tensormap_and_ringbuffer" + declare -n EXAMPLE_CASES=TMR_PARTIAL_MANUAL_EXAMPLE_CASES + EXAMPLE_ORDER=("${TMR_PARTIAL_MANUAL_EXAMPLE_ORDER[@]}") + ;; aicpu_build_graph) declare -n EXAMPLE_CASES=ABG_EXAMPLE_CASES EXAMPLE_ORDER=("${ABG_EXAMPLE_ORDER[@]}") ;; *) - echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer or aicpu_build_graph." + echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer, tensormap_and_ringbuffer_partial_manual, or aicpu_build_graph." exit 1 ;; esac +EXAMPLES_DIR="$PROJECT_ROOT/tests/st/${PLATFORM}/${TESTS_RUNTIME_DIR}" + +if [[ -n "$EXAMPLE_FILTER" ]]; then + IFS=',' read -ra REQUESTED_EXAMPLES <<< "$EXAMPLE_FILTER" + FILTERED_ORDER=() + for requested in "${REQUESTED_EXAMPLES[@]}"; do + if [[ -n "${EXAMPLE_CASES[$requested]+x}" ]]; then + FILTERED_ORDER+=("$requested") + else + echo "ERROR: example '$requested' is not available for runtime '$RUNTIME'." 
+ exit 1 + fi + done + EXAMPLE_ORDER=("${FILTERED_ORDER[@]}") +fi + # --------------------------------------------------------------------------- # Resolve device log directory (mirrors run_example.py / device_log_resolver.py) # --------------------------------------------------------------------------- diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index f47ae9b47..e17ffe4a7 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -1043,7 +1043,7 @@ def _build_parser(): %(prog)s # Use latest .json in outputs/, output to outputs/ %(prog)s perf_swimlane_20260210_143526.json # Output: outputs/merged_swimlane_20260210_143526.json %(prog)s perf_swimlane_20260210_143526.json -o custom_output.json - %(prog)s perf_swimlane_20260210_143526.json -k examples/host_build_graph/paged_attention/kernels/kernel_config.py + %(prog)s perf_swimlane_20260210_143526.json -k examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py %(prog)s perf_swimlane_20260210_143526.json -d 0 %(prog)s perf_swimlane_20260210_143526.json -v """,