From b8e87b39d8200e4a190198d5e4b96a9f4b15938f Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Fri, 11 Jul 2025 02:49:52 +0000 Subject: [PATCH 1/4] fix compile bug --- csrc/cpp_itfs/utils.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/csrc/cpp_itfs/utils.py b/csrc/cpp_itfs/utils.py index e7be81c4a3..a7a6115315 100644 --- a/csrc/cpp_itfs/utils.py +++ b/csrc/cpp_itfs/utils.py @@ -85,13 +85,6 @@ def validate_and_update_archs(): return archs -def init_build_dir(dir): - if not os.path.exists(dir): - os.makedirs(dir) - else: - subprocess.run(f"rm -rf {dir}/*", shell=True) - - def compile_lib(src_file, folder, includes=None, sources=None, cxxflags=None): if includes is None: includes = [] @@ -100,7 +93,6 @@ def compile_lib(src_file, folder, includes=None, sources=None, cxxflags=None): if cxxflags is None: cxxflags = [] sub_build_dir = os.path.join(BUILD_DIR, folder) - init_build_dir(sub_build_dir) include_dir = f"{sub_build_dir}/include" os.makedirs(include_dir, exist_ok=True) for include in includes + [f"{CK_DIR}/include"]: @@ -139,19 +131,19 @@ def compile_lib(src_file, folder, includes=None, sources=None, cxxflags=None): # Imitate https://github.com/ROCm/composable_kernel/blob/c8b6b64240e840a7decf76dfaa13c37da5294c4a/CMakeLists.txt#L190-L214 hip_version = get_hip_version() + if hip_version > Version("5.5.00000"): + cxxflags += ["-mllvm --lsr-drop-solution=1"] if hip_version > Version("5.7.23302"): cxxflags += ["-fno-offload-uniform-block"] if hip_version > Version("6.1.40090"): - cxxflags += ["-mllvm", "-enable-post-misched=0"] + cxxflags += ["-mllvm -enable-post-misched=0"] if hip_version > Version("6.2.41132"): cxxflags += [ - "-mllvm", - "-amdgpu-early-inline-all=true", - "-mllvm", - "-amdgpu-function-calls=false", + "-mllvm -amdgpu-early-inline-all=true", + "-mllvm -amdgpu-function-calls=false", ] - if hip_version > Version("6.2.41133") and hip_version < Version("6.3.00000"): - cxxflags += ["-mllvm", "-amdgpu-coerce-illegal-types=1"] + if hip_version > Version("6.2.41133"): + cxxflags += ["-mllvm -amdgpu-coerce-illegal-types=1"] archs = validate_and_update_archs() cxxflags += [f"--offload-arch={arch}" for arch in archs] makefile_file = makefile_template.render( From b67a470e48d8b5ad0b78f01523b9676d23da970a Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 7 Jul 2025 05:42:42 +0000 Subject: [PATCH 2/4] add aiter debug --- csrc/cpp_itfs/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/csrc/cpp_itfs/utils.py b/csrc/cpp_itfs/utils.py index a7a6115315..b6a0831467 100644 --- a/csrc/cpp_itfs/utils.py +++ b/csrc/cpp_itfs/utils.py @@ -26,6 +26,7 @@ AITER_ROOT_DIR = os.environ.get("AITER_ROOT_DIR", f"{HOME_PATH}/.aiter") BUILD_DIR = os.path.abspath(os.path.join(AITER_ROOT_DIR, "build")) AITER_LOG_MORE = int(os.getenv("AITER_LOG_MORE", 0)) +AITER_DEBUG = int(os.getenv("AITER_DEBUG", 0)) if AITER_REBUILD >= 1: subprocess.run(f"rm -rf {BUILD_DIR}/*", shell=True) @@ -121,7 +122,6 @@ def compile_lib(src_file, folder, includes=None, sources=None, cxxflags=None): "-U__HIP_NO_HALF_OPERATORS__", "-mllvm", "--amdgpu-kernarg-preload-count=16", - # "-v", "--save-temps", "-Wno-unused-result", "-Wno-switch-bool", "-Wno-vla-cxx-extension", @@ -129,6 +129,9 @@ def compile_lib(src_file, folder, includes=None, sources=None, cxxflags=None): "-fgpu-flush-denormals-to-zero", ] + if AITER_DEBUG: + cxxflags += ["-g", "-fverbose-asm", "--save-temps", "-Wno-gnu-line-marker"] + # Imitate https://github.com/ROCm/composable_kernel/blob/c8b6b64240e840a7decf76dfaa13c37da5294c4a/CMakeLists.txt#L190-L214 hip_version = get_hip_version() if hip_version > Version("5.5.00000"): From 88b60ac7cd719eb3282145646e06d8889a3ce5ae Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Fri, 11 Jul 2025 08:00:47 +0000 Subject: [PATCH 3/4] add mp lock to compile process --- csrc/cpp_itfs/utils.py | 175 ++++++++++++++++++++++++++--------------- 1 file changed, 110 insertions(+), 65 deletions(-) diff --git a/csrc/cpp_itfs/utils.py b/csrc/cpp_itfs/utils.py index b6a0831467..d0c39bdfd6 100644 --- a/csrc/cpp_itfs/utils.py +++ b/csrc/cpp_itfs/utils.py @@ -5,10 +5,15 @@ import ctypes from packaging.version import parse, Version from collections import OrderedDict -from functools import lru_cache +from functools import lru_cache, partial import binascii import hashlib +from aiter.jit.utils.file_baton import FileBaton +import logging +import time + +logger = logging.getLogger("aiter") this_dir = os.path.dirname(os.path.abspath(__file__)) AITER_CORE_DIR = os.path.abspath(f"{this_dir}/../../") DEFAULT_GPU_ARCH = ( @@ -54,6 +59,31 @@ ) +def mp_lock( + lock_path: str, + main_func: callable, + final_func: callable = None, + wait_func: callable = None, +): + """ + Using FileBaton for multiprocessing. + """ + baton = FileBaton(lock_path) + if baton.try_acquire(): + try: + ret = main_func() + finally: + if final_func is not None: + final_func() + baton.release() + else: + baton.wait() + if wait_func is not None: + ret = wait_func() + ret = None + return ret + + def get_hip_version(): version = subprocess.run( "/opt/rocm/bin/hipconfig --version", shell=True, capture_output=True, text=True @@ -87,76 +117,91 @@ def validate_and_update_archs(): def compile_lib(src_file, folder, includes=None, sources=None, cxxflags=None): - if includes is None: - includes = [] - if sources is None: - sources = [] - if cxxflags is None: - cxxflags = [] sub_build_dir = os.path.join(BUILD_DIR, folder) include_dir = f"{sub_build_dir}/include" os.makedirs(include_dir, exist_ok=True) - for include in includes + [f"{CK_DIR}/include"]: - if os.path.isdir(include): - shutil.copytree(include, include_dir, dirs_exist_ok=True) - else: - shutil.copy(include, include_dir) - for source in sources: - if os.path.isdir(source): - shutil.copytree(source, sub_build_dir, dirs_exist_ok=True) - else: - shutil.copy(source, sub_build_dir) - with open(f"{sub_build_dir}/{folder}.cpp", "w") as f: - f.write(src_file) - sources += [f"{folder}.cpp"] - cxxflags += [ - "-DUSE_ROCM", - "-DENABLE_FP8", - "-O3", - "-std=c++17", - "-DLEGACY_HIPBLAS_DIRECT", - "-DUSE_PROF_API=1", - "-D__HIP_PLATFORM_HCC__=1", - "-D__HIP_PLATFORM_AMD__=1", - "-U__HIP_NO_HALF_CONVERSIONS__", - "-U__HIP_NO_HALF_OPERATORS__", - "-mllvm", - "--amdgpu-kernarg-preload-count=16", - "-Wno-unused-result", - "-Wno-switch-bool", - "-Wno-vla-cxx-extension", - "-Wno-undefined-func-template", - "-fgpu-flush-denormals-to-zero", - ] + lock_path = f"{sub_build_dir}/lock" + start_ts = time.perf_counter() - if AITER_DEBUG: - cxxflags += ["-g", "-fverbose-asm", "--save-temps", "-Wno-gnu-line-marker"] - - # Imitate https://github.com/ROCm/composable_kernel/blob/c8b6b64240e840a7decf76dfaa13c37da5294c4a/CMakeLists.txt#L190-L214 - hip_version = get_hip_version() - if hip_version > Version("5.5.00000"): - cxxflags += ["-mllvm --lsr-drop-solution=1"] - if hip_version > Version("5.7.23302"): - cxxflags += ["-fno-offload-uniform-block"] - if hip_version > Version("6.1.40090"): - cxxflags += ["-mllvm -enable-post-misched=0"] - if hip_version > Version("6.2.41132"): + def main_func(includes=None, sources=None, cxxflags=None): + logger.info(f"start build {sub_build_dir}") + if includes is None: + includes = [] + if sources is None: + sources = [] + if cxxflags is None: + cxxflags = [] + + for include in includes + [f"{CK_DIR}/include"]: + if os.path.isdir(include): + shutil.copytree(include, include_dir, dirs_exist_ok=True) + else: + shutil.copy(include, include_dir) + for source in sources: + if os.path.isdir(source): + shutil.copytree(source, sub_build_dir, dirs_exist_ok=True) + else: + shutil.copy(source, sub_build_dir) + with open(f"{sub_build_dir}/{folder}.cpp", "w") as f: + f.write(src_file) + sources += [f"{folder}.cpp"] cxxflags += [ - "-mllvm -amdgpu-early-inline-all=true", - "-mllvm -amdgpu-function-calls=false", + "-DUSE_ROCM", + "-DENABLE_FP8", + "-O3", + "-std=c++17", + "-DLEGACY_HIPBLAS_DIRECT", + "-DUSE_PROF_API=1", + "-D__HIP_PLATFORM_HCC__=1", + "-D__HIP_PLATFORM_AMD__=1", + "-U__HIP_NO_HALF_CONVERSIONS__", + "-U__HIP_NO_HALF_OPERATORS__", + "-mllvm", + "--amdgpu-kernarg-preload-count=16", + "-Wno-unused-result", + "-Wno-switch-bool", + "-Wno-vla-cxx-extension", + "-Wno-undefined-func-template", + "-fgpu-flush-denormals-to-zero", ] - if hip_version > Version("6.2.41133"): - cxxflags += ["-mllvm -amdgpu-coerce-illegal-types=1"] - archs = validate_and_update_archs() - cxxflags += [f"--offload-arch={arch}" for arch in archs] - makefile_file = makefile_template.render( - includes=[f"-I{include_dir}"], sources=sources, cxxflags=cxxflags - ) - with open(f"{sub_build_dir}/Makefile", "w") as f: - f.write(makefile_file) - subprocess.run( - f"cd {sub_build_dir} && make build -j{len(sources)}", shell=True, check=True - ) + + if AITER_DEBUG: + cxxflags += ["-g", "-fverbose-asm", "--save-temps", "-Wno-gnu-line-marker"] + + # Imitate https://github.com/ROCm/composable_kernel/blob/c8b6b64240e840a7decf76dfaa13c37da5294c4a/CMakeLists.txt#L190-L214 + hip_version = get_hip_version() + if hip_version > Version("5.5.00000"): + cxxflags += ["-mllvm --lsr-drop-solution=1"] + if hip_version > Version("5.7.23302"): + cxxflags += ["-fno-offload-uniform-block"] + if hip_version > Version("6.1.40090"): + cxxflags += ["-mllvm -enable-post-misched=0"] + if hip_version > Version("6.2.41132"): + cxxflags += [ + "-mllvm -amdgpu-early-inline-all=true", + "-mllvm -amdgpu-function-calls=false", + ] + if hip_version > Version("6.2.41133"): + cxxflags += ["-mllvm -amdgpu-coerce-illegal-types=1"] + archs = validate_and_update_archs() + cxxflags += [f"--offload-arch={arch}" for arch in archs] + makefile_file = makefile_template.render( + includes=[f"-I{include_dir}"], sources=sources, cxxflags=cxxflags + ) + with open(f"{sub_build_dir}/Makefile", "w") as f: + f.write(makefile_file) + subprocess.run( + f"cd {sub_build_dir} && make build -j{len(sources)}", shell=True, check=True + ) + + def final_func(): + logger.info( + f"finish build {sub_build_dir}, cost {time.perf_counter()-start_ts:.8f}s" + ) + + main_func = partial(main_func, includes=includes, sources=sources, cxxflags=cxxflags) + + mp_lock(lock_path=lock_path, main_func=main_func, final_func=final_func) @lru_cache(maxsize=AITER_MAX_CACHE_SIZE) From 8a9777aaa08a9e15f878f3765da50a02b8163149 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Fri, 11 Jul 2025 08:28:57 +0000 Subject: [PATCH 4/4] format code --- csrc/cpp_itfs/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/cpp_itfs/utils.py b/csrc/cpp_itfs/utils.py index d0c39bdfd6..99876ac4b6 100644 --- a/csrc/cpp_itfs/utils.py +++ b/csrc/cpp_itfs/utils.py @@ -131,7 +131,7 @@ def main_func(includes=None, sources=None, cxxflags=None): sources = [] if cxxflags is None: cxxflags = [] - + for include in includes + [f"{CK_DIR}/include"]: if os.path.isdir(include): shutil.copytree(include, include_dir, dirs_exist_ok=True) @@ -144,6 +144,7 @@ def main_func(includes=None, sources=None, cxxflags=None): shutil.copy(source, sub_build_dir) with open(f"{sub_build_dir}/{folder}.cpp", "w") as f: f.write(src_file) + sources += [f"{folder}.cpp"] cxxflags += [ "-DUSE_ROCM", @@ -193,13 +194,15 @@ def main_func(includes=None, sources=None, cxxflags=None): subprocess.run( f"cd {sub_build_dir} && make build -j{len(sources)}", shell=True, check=True ) - + def final_func(): logger.info( f"finish build {sub_build_dir}, cost {time.perf_counter()-start_ts:.8f}s" ) - - main_func = partial(main_func, includes=includes, sources=sources, cxxflags=cxxflags) + + main_func = partial( + main_func, includes=includes, sources=sources, cxxflags=cxxflags + ) mp_lock(lock_path=lock_path, main_func=main_func, final_func=final_func)