Merged

Changes from all commits
41 commits
f2881a0
fix sink error for asm fmha (#1652)
LJ-underdog Dec 17, 2025
f2d1627
add guard in case pynccl init failed (#1671)
valarLip Dec 17, 2025
999ebcd
One shot pa (#1670)
fsx950223 Dec 18, 2025
3d84d01
fix(pa_ps): fix pa_ps_asm .co for gfx950 (#1669)
dbyoung18 Dec 18, 2025
78994e0
modify test_bf16gemm_test (#1678)
amd-ruitang3 Dec 18, 2025
290e659
Fix Ruff command in pre-checks (#1675)
Boss2002n Dec 18, 2025
235cfa6
fix mha bwd golden perf issue (#1666)
JaxChen29 Dec 18, 2025
b1278bd
topk uplift v1 (#1662)
steamedMantou Dec 19, 2025
d3a4a0f
fix missing return in mha_bwd (#1688)
yuguo68 Dec 19, 2025
97e760c
Remove the input parameter "out" in gemm_a4w4 (#1679)
junhaha666 Dec 19, 2025
6f20772
fwd v3 hd192 optimize inst alignment for causal mode (#1663)
shay-li77 Dec 19, 2025
7df7b36
fix swa case mismatch (#1694)
JaxChen29 Dec 19, 2025
1e7630f
fixing the fp4 gemm tune script Exception caused by tile_m name incon…
hongxiayang Dec 19, 2025
126d28f
CI: Migrate Triton tests to aiter-1gpu-runner (#1690)
gyohuangxin Dec 19, 2025
1127ab4
add ntile 128 for a8 blkQ moe 1 stage (#1695)
zufayu Dec 19, 2025
9032211
Optimize RoPE in the cases that hdim is small. (#1698)
ruanjm Dec 19, 2025
7f8ed6a
rm garbage from whl (#1696)
amd-ruitang3 Dec 19, 2025
70562e8
enhance prebuild logic (#1672)
zufayu Dec 19, 2025
bf02586
LLfp4 qr cap for atom (#1673)
amirumoAMD Dec 19, 2025
3ad1c76
[MLA] MLA conditions rewrite (#1665)
Zzz9990 Dec 20, 2025
84b2b2f
fix dp causal (#1677)
Zzz9990 Dec 20, 2025
723467d
add two fp4 tune shapes and tuned config (#1687)
hongxiayang Dec 20, 2025
48ee8cc
Dev/a8w4 and a8w8splitk (#1667)
yadaish Dec 20, 2025
c6965e6
bf16_gemm_clean_in_kl (#1700)
amd-ruitang3 Dec 20, 2025
a9d356f
fix tuner (#1701)
valarLip Dec 21, 2025
ac6142e
add gen_fake for 4 gemm operators (#1456)
mqhc2020 Dec 21, 2025
420f5da
fix llvm issue (#1703)
valarLip Dec 21, 2025
f174268
feat: Adaptive topk algorithm selection based on input characteristic…
ClementLinCF Dec 22, 2025
463e8e7
fix mha bwd build error (#1705)
JaxChen29 Dec 22, 2025
94c5b98
fix moe bug when pipever=v1 and nblk=64 (#1707)
lalala-sh Dec 22, 2025
2d71438
fix (#1710)
valarLip Dec 22, 2025
14d92a0
[PA] Optimize PA Decode Gluon Performance for BF16/FP16 with KV_BLOCK…
yanguahe Dec 23, 2025
707b9fc
Fix argument parsing logic when AITER_JIT_DIR is set (#1715)
omoisis-dn Dec 24, 2025
c17b074
fix topk deocde bug in logit value is same (#1716)
steamedMantou Dec 24, 2025
5c11567
add fp32 input (#1706)
zufayu Dec 24, 2025
455bc5d
add sampling aot (#1711)
fsx950223 Dec 24, 2025
f9ac657
Merge branch 'main' into wjx/ck_tile_moe
Zzz9990 Dec 25, 2025
73673b6
update
Zzz9990 Dec 25, 2025
c5aa662
bugfix
Zzz9990 Dec 25, 2025
2210c48
update
lalala-sh Dec 25, 2025
dcdf9ef
update
Zzz9990 Dec 25, 2025
13 changes: 11 additions & 2 deletions .github/workflows/pre-checks.yaml
@@ -35,7 +35,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
       - name: Set up Python environment
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: "3.12"
       - name: Install dependencies
@@ -46,7 +46,16 @@
       env:
         REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       run: |
-        ruff check . -e | reviewdog -efm="%f:%l:%c: %m" -diff="git diff FETCH_HEAD" -reporter=github-pr-check -tee
+        ruff check . \
+          --output-format=rdjson \
+          --exit-zero \
+          --no-fix \
+          | reviewdog \
+          -f=rdjson \
+          -name="ruff" \
+          -reporter=github-pr-review \
+          -filter-mode=diff_context \
+          -fail-on-error=true

   upload-success-artifact:
     name: Upload Success Signal
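The rewritten lint step has ruff emit rdjson, which reviewdog parses natively, replacing the old errorformat pipeline; --exit-zero makes ruff always exit 0 so that pass/fail is decided by reviewdog's -fail-on-error=true. To inspect the same diagnostics locally without reviewdog, a minimal sketch (the helper script is hypothetical; it assumes ruff's rdjson output follows the Reviewdog Diagnostic JSON schema):

# local_ruff_check.py (hypothetical): mirrors the CI lint step without reviewdog.
import json
import subprocess

result = subprocess.run(
    ["ruff", "check", ".", "--output-format=rdjson", "--exit-zero", "--no-fix"],
    capture_output=True,
    text=True,
    check=True,  # --exit-zero: ruff exits 0 even when it finds violations
)
for diag in json.loads(result.stdout).get("diagnostics", []):
    start = diag["location"]["range"]["start"]
    print(f"{diag['location']['path']}:{start['line']}:{start['column']}: {diag['message']}")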
2 changes: 1 addition & 1 deletion .github/workflows/triton-test.yaml
@@ -29,7 +29,7 @@ jobs:
       GITHUB_SHA: ${{ github.sha }}

   triton:
-    runs-on: aiter-mi300-1gpu
+    runs-on: aiter-1gpu-runner
     needs: [check-signal]
     env:
       DOCKER_IMAGE: "rocm/pytorch:latest"
2 changes: 1 addition & 1 deletion 3rdparty/composable_kernel
Submodule composable_kernel updated 400 files
6 changes: 5 additions & 1 deletion MANIFEST.in
@@ -1,2 +1,6 @@
 graft aiter
-graft aiter_meta
+graft aiter_meta
+
+# exclude cache and compiled files .pyc / .pyo / .pyd
+global-exclude *.py[cod]
+prune aiter/jit/build
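These MANIFEST.in rules pair with the "rm garbage from whl (#1696)" commit above: global-exclude *.py[cod] drops compiled bytecode (.pyc/.pyo/.pyd) from source distributions, and prune aiter/jit/build drops JIT build artifacts. A minimal spot-check sketch, assuming a placeholder archive path for a freshly built sdist:

# check_sdist.py (hypothetical): verify no bytecode or JIT build junk ships.
import tarfile

SDIST = "dist/aiter-0.0.0.tar.gz"  # placeholder; point at your dist/ output

with tarfile.open(SDIST, "r:gz") as sdist:
    leftovers = [
        name
        for name in sdist.getnames()
        if name.endswith((".pyc", ".pyo", ".pyd")) or "/jit/build/" in name
    ]
print("leftover junk:", leftovers or "none")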
89 changes: 89 additions & 0 deletions aiter/aot/sampling.py
@@ -0,0 +1,89 @@
from collections import namedtuple
import os
import concurrent.futures
from csrc.cpp_itfs.sampling.top_k_renorm_probs import (
compile as top_k_renorm_probs_compile,
)
from csrc.cpp_itfs.sampling.top_p_sampling_from_probs import (
compile as top_p_sampling_from_probs_compile,
)
from csrc.cpp_itfs.sampling.top_k_top_p_sampling_from_probs import (
compile as top_k_top_p_sampling_from_probs_compile,
)

TopKRenormConfig = namedtuple(
"TopKRenormConfig",
["vec_size", "func_name"],
)

TopPSamplingConfig = namedtuple(
"TopPSamplingConfig",
["vec_size", "deterministic", "func_name"],
)

TopKTopPSamplingConfig = namedtuple(
"TopKTopPSamplingConfig",
["vec_size", "deterministic", "func_name"],
)


def process_top_k_renorm_config(config):
return top_k_renorm_probs_compile(config.vec_size)


def process_top_p_sampling_config(config):
return top_p_sampling_from_probs_compile(config.vec_size, config.deterministic)


def process_top_k_top_p_sampling_config(config):
return top_k_top_p_sampling_from_probs_compile(
config.vec_size, config.deterministic
)


def main():
# Generate configs for top_k_renorm_probs
top_k_renorm_configs = []
for vec_size in range(1, 5):
top_k_renorm_configs.append(
TopKRenormConfig(
vec_size=vec_size,
func_name="top_k_renorm_probs",
)
)

# Generate configs for top_p_sampling_from_probs
top_p_sampling_configs = []
for vec_size in range(1, 5):
for deterministic in [False, True]:
top_p_sampling_configs.append(
TopPSamplingConfig(
vec_size=vec_size,
deterministic=deterministic,
func_name="top_p_sampling_from_probs",
)
)

# Generate configs for top_k_top_p_sampling_from_probs
top_k_top_p_sampling_configs = []
for vec_size in range(1, 5):
for deterministic in [False, True]:
top_k_top_p_sampling_configs.append(
TopKTopPSamplingConfig(
vec_size=vec_size,
deterministic=deterministic,
func_name="top_k_top_p_sampling_from_probs",
)
)

max_jobs = int(os.environ.get("MAX_JOBS", os.cpu_count() or 16))

# Process all configs in parallel
with concurrent.futures.ProcessPoolExecutor(max_workers=max_jobs) as executor:
executor.map(process_top_k_renorm_config, top_k_renorm_configs)
executor.map(process_top_p_sampling_config, top_p_sampling_configs)
executor.map(process_top_k_top_p_sampling_config, top_k_top_p_sampling_configs)


if __name__ == "__main__":
main()
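One caveat about the new AOT script: ProcessPoolExecutor.map schedules its work eagerly, but an exception raised inside a worker is re-raised only when the returned iterator is consumed, so as written a failed compile passes silently. A sketch (not part of the PR) of the final block of main() that surfaces such failures:

    # Consuming each map iterator re-raises any worker exception here
    # instead of leaving it unread inside a discarded future.
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_jobs) as executor:
        for results in (
            executor.map(process_top_k_renorm_config, top_k_renorm_configs),
            executor.map(process_top_p_sampling_config, top_p_sampling_configs),
            executor.map(
                process_top_k_top_p_sampling_config, top_k_top_p_sampling_configs
            ),
        ):
            list(results)  # raises if any compile worker failed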
2 changes: 2 additions & 0 deletions aiter/configs/a4w4_blockscale_tuned_gemm.csv
@@ -921,3 +921,5 @@ cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio
 256,8,3072,1536,42,0,5.4682,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,13.81,441.57,0.0
 256,8,7168,2048,29,0,5.836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.25,1278.77,0.0
 256,8,512,7168,29,0,9.6677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.07,193.62,0.0
+256,32768,2112,7168,48,0,293.0219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3385.88,898.98,0.0
+256,65536,2112,7168,48,0,575.6528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3447.0,902.06,0.0
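As a sanity check, the tflops column in these rows is consistent with 2*M*N*K floating-point operations over the measured kernel time; a short sketch using the first added row (only the formula is illustrative):

# Values copied from the 32768x2112x7168 row added above.
M, N, K, us = 32768, 2112, 7168, 293.0219

flops = 2 * M * N * K               # one multiply-accumulate = 2 FLOPs
tflops = flops / (us * 1e-6) / 1e12
print(f"{tflops:.2f} TFLOPS")       # ~3385.88, matching the CSV entry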
2 changes: 2 additions & 0 deletions aiter/configs/a4w4_blockscale_untuned_gemm.csv
@@ -193,3 +193,5 @@ M,N,K
 3000, 7168, 2048
 3000, 512, 7168
 60000, 4096, 512
+32768, 2112, 7168
+65536, 2112, 7168