3 changes: 3 additions & 0 deletions compiler/rustc_abi/src/lib.rs
@@ -1719,6 +1719,9 @@ pub struct AddressSpace(pub u32);
impl AddressSpace {
/// LLVM's `0` address space.
pub const ZERO: Self = AddressSpace(0);
/// The address space for work-group shared memory on nvptx and amdgpu.
/// See e.g. the `gpu_dynamic_groupshared_mem` intrinsic for details.
pub const GPU_SHARED: Self = AddressSpace(3);
}

/// The way we represent values to the backend
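Both GPU backends in LLVM number the work-group-shared region as address space 3 (CUDA calls it "shared" memory, AMDGPU calls it "LDS"), which is what the new constant records. A minimal standalone sketch of the invariant, assuming only the rustc_abi crate as a dependency:

use rustc_abi::AddressSpace;

fn main() {
    // nvptx64 and amdgcn both place work-group-shared globals in addrspace(3).
    assert_eq!(AddressSpace::GPU_SHARED.0, 3);
}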
23 changes: 23 additions & 0 deletions compiler/rustc_codegen_llvm/src/declare.rs
@@ -14,6 +14,7 @@
use std::borrow::Borrow;

use itertools::Itertools;
use rustc_abi::AddressSpace;
use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods;
use rustc_data_structures::fx::FxIndexSet;
use rustc_middle::ty::{Instance, Ty};
@@ -97,6 +98,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
)
}
}

/// Declare a global value in a specific address space.
///
/// If there’s a value with the same name already declared, the function will
/// return its Value instead.
pub(crate) fn declare_global_in_addrspace(
&self,
name: &str,
ty: &'ll Type,
addr_space: AddressSpace,
) -> &'ll Value {
debug!("declare_global(name={name:?}, addrspace={addr_space:?})");
unsafe {
llvm::LLVMRustGetOrInsertGlobalInAddrspace(
(**self).borrow().llmod,
name.as_c_char_ptr(),
name.len(),
ty,
addr_space.0,
)
}
}
}

impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
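The intended call pattern for the new helper, mirroring its use in the intrinsic lowering below (a sketch only, not a complete function; `cx` stands in for a codegen context):

// Sketch: declare an unnamed extern global of type [0 x i8] in the GPU shared
// address space. An empty name creates a fresh unnamed global per call;
// a non-empty name would be looked up first and reused if already declared.
let global = cx.declare_global_in_addrspace(
    "",
    cx.type_array(cx.type_i8(), 0),
    AddressSpace::GPU_SHARED,
);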
29 changes: 28 additions & 1 deletion compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -1,7 +1,9 @@
use std::assert_matches::assert_matches;
use std::cmp::Ordering;

use rustc_abi::{Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size};
use rustc_abi::{
AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size,
};
use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
use rustc_codegen_ssa::codegen_attrs::autodiff_attrs;
use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
@@ -553,6 +555,31 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
return Ok(());
}

sym::gpu_dynamic_groupshared_mem => {
// The name of the global variable is not relevant; the important properties are:
// 1. The global is in the shared address space.
// 2. It is an extern global.
// All instances of extern addrspace(shared) globals are merged in the LLVM backend.
// Generate an unnamed global per intrinsic call, so that different kernels can have
// different minimum alignments.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
let global = self.declare_global_in_addrspace(
"",
self.type_array(self.type_i8(), 0),
AddressSpace::GPU_SHARED,
);
let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
// The alignment of the global specifies the *minimum* alignment that must be
// obeyed by the GPU runtime.
// When multiple of these global variables are merged, the maximum alignment is taken.
// See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
let alignment = self.align_of(*inner_ty).bytes() as u32;
unsafe {
llvm::LLVMSetAlignment(global, alignment);
}
self.cx().const_pointercast(global, self.type_ptr())
}

_ if name.as_str().starts_with("simd_") => {
// Unpack non-power-of-2 #[repr(packed, simd)] arguments.
// This gives them the expected layout of a regular #[repr(simd)] vector.
7 changes: 7 additions & 0 deletions compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -2017,6 +2017,13 @@ unsafe extern "C" {
NameLen: size_t,
T: &'a Type,
) -> &'a Value;
pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
M: &'a Module,
Name: *const c_char,
NameLen: size_t,
T: &'a Type,
AddressSpace: c_uint,
) -> &'a Value;
pub(crate) fn LLVMRustGetNamedValue(
M: &Module,
Name: *const c_char,
1 change: 1 addition & 0 deletions compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
@@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
sym::abort
| sym::unreachable
| sym::cold_path
| sym::gpu_dynamic_groupshared_mem
| sym::breakpoint
| sym::assert_zero_valid
| sym::assert_mem_uninitialized_valid
2 changes: 2 additions & 0 deletions compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -132,6 +132,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
| sym::forget
| sym::frem_algebraic
| sym::fsub_algebraic
| sym::gpu_dynamic_groupshared_mem
| sym::is_val_statically_known
| sym::log2f16
| sym::log2f32
@@ -293,6 +294,7 @@ pub(crate) fn check_intrinsic_type(
sym::offset_of => (1, 0, vec![tcx.types.u32, tcx.types.u32], tcx.types.usize),
sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
sym::gpu_dynamic_groupshared_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
(1, 0, vec![], tcx.types.unit)
}
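For readers unfamiliar with this table: the new entry declares one generic type parameter, takes no value arguments, and returns `Ty::new_mut_ptr(tcx, param(0))`, so the surface signature being type-checked is the one declared in library/core below:

pub fn gpu_dynamic_groupshared_mem<T>() -> *mut T;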
21 changes: 16 additions & 5 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
@@ -261,10 +261,10 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
.getCallee());
}

extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
const char *Name,
size_t NameLen,
LLVMTypeRef Ty) {
extern "C" LLVMValueRef
LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
size_t NameLen, LLVMTypeRef Ty,
unsigned AddressSpace) {
Module *Mod = unwrap(M);
auto NameRef = StringRef(Name, NameLen);

@@ -275,10 +275,21 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
if (!GV)
GV = new GlobalVariable(*Mod, unwrap(Ty), false,
GlobalValue::ExternalLinkage, nullptr, NameRef);
GlobalValue::ExternalLinkage, nullptr, NameRef,
nullptr, GlobalValue::NotThreadLocal, AddressSpace);
return wrap(GV);
}

extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
const char *Name,
size_t NameLen,
LLVMTypeRef Ty) {
Module *Mod = unwrap(M);
unsigned AddressSpace = Mod->getDataLayout().getDefaultGlobalsAddressSpace();
return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty,
AddressSpace);
}

// Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
enum class LLVMRustAttributeKind {
AlwaysInline = 0,
1 change: 1 addition & 0 deletions compiler/rustc_span/src/symbol.rs
@@ -1152,6 +1152,7 @@ symbols! {
global_asm,
global_registration,
globs,
gpu_dynamic_groupshared_mem,
gt,
guard_patterns,
half_open_range_patterns,
37 changes: 37 additions & 0 deletions library/core/src/intrinsics/mod.rs
@@ -3436,6 +3436,43 @@ pub(crate) const fn miri_promise_symbolic_alignment(ptr: *const (), align: usize
)
}

/// Returns the pointer to dynamic group-shared memory on GPUs.
///
/// Group-shared memory is a memory region that is shared between all threads in
/// the same work-group. It is faster to access than other memory, but pointers to it
/// do not work outside the work-group where they were obtained.
/// Dynamic group-shared memory lives in the group-shared memory region, but its
/// allocated size is specified late, after compilation, when launching a gpu-kernel.
/// The size can differ between launches of a gpu-kernel, which is why it is called dynamic.
/// However, the alignment is fixed by the kernel itself (at compile-time).
///
/// The returned pointer is the start of the dynamic group-shared memory region.
/// All calls to `gpu_dynamic_groupshared_mem` in a work-group, independent of the
/// generic type, return the same address and therefore alias the same memory.
/// The returned pointer is aligned to at least the alignment of `T`.
[Review comment from a Member]
Is there some prior discussion of the design decision to determine the alignment by giving a type parameter? It could also be a const generic parameter, for instance. I don't have an opinion on the matter since I am an outsider to the GPU world, but as a compiler team member it'd be good to know if this is something you thought about for 5 minutes or whether there's some sort of larger design by a team that has a vision of how all these things will fit together.

[Reply from the Contributor/Author]
There is some discussion in #135516. I don't mind either way; I thought (for 5 minutes ;)) that specifying the type of the returned pointer makes sense. I'm not much of a GPU programmer, but I think in most cases you would store an array in dynamic shared memory, or maybe a struct followed by a dynamically sized array (or maybe two/n arrays of different types).

For just a struct, static shared memory would make more sense, though we don't support that yet (there's some discussion in the tracking issue, but I think that's more complicated to design and implement).

///
/// # Safety
///
/// The pointer is safe to dereference from the start (the returned pointer) up to the
/// size of dynamic group-shared memory that was specified when launching the current
/// gpu-kernel.
///
/// The user must take care of synchronizing access to group-shared memory between
/// threads in a work-group. The usual data race requirements apply.
///
/// # Other APIs
///
/// CUDA and HIP call this shared memory, shared between threads in a block.
/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
/// GLSL calls this shared memory, shared between invocations in a work group.
/// DirectX calls this groupshared memory, shared between threads in a thread-group.
#[must_use = "returns a pointer that does nothing unless used"]
#[rustc_intrinsic]
#[rustc_nounwind]
#[unstable(feature = "gpu_dynamic_groupshared_mem", issue = "135513")]
#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
pub fn gpu_dynamic_groupshared_mem<T>() -> *mut T;

/// Copies the current location of arglist `src` to the arglist `dst`.
///
/// # Safety
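A usage sketch (not part of this diff, and it only compiles for the amdgpu/nvptx64 targets the intrinsic is gated to): treat the dynamic region as an f32 scratch buffer shared by the work-group. `workgroup_barrier` is a hypothetical placeholder for a target-specific barrier intrinsic, and the launch is assumed to have requested at least `n * size_of::<f32>()` bytes of dynamic group-shared memory (e.g. via the third `<<<...>>>` launch parameter in CUDA):

#![feature(core_intrinsics, gpu_dynamic_groupshared_mem)]
use core::intrinsics::gpu_dynamic_groupshared_mem;

// Hypothetical no-op stand-in; a real kernel must call a target-specific
// work-group barrier here.
unsafe fn workgroup_barrier() {}

/// Stage one element per thread into group-shared memory, then sum.
/// `tid` is this thread's index within the work-group.
unsafe fn stage_and_sum(tid: usize, n: usize, input: *const f32) -> f32 {
    // Every call in the work-group returns the same base pointer,
    // aligned to at least align_of::<f32>().
    let shared: *mut f32 = gpu_dynamic_groupshared_mem::<f32>();

    // Writing is only in bounds because the launch requested >= n * 4 bytes.
    if tid < n {
        shared.add(tid).write(input.add(tid).read());
    }
    workgroup_barrier(); // every thread must reach this point before reading

    // After the barrier, each thread can read all staged values.
    let mut sum = 0.0;
    for i in 0..n {
        sum += shared.add(i).read();
    }
    sum
}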
28 changes: 28 additions & 0 deletions tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs
@@ -0,0 +1,28 @@
// Checks that the GPU dynamic group-shared memory intrinsic works.

//@ revisions: amdgpu nvptx
//@ compile-flags: --crate-type=rlib
//
//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
//@ [amdgpu] needs-llvm-components: amdgpu
//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda
//@ [nvptx] needs-llvm-components: nvptx
//@ add-minicore
#![feature(intrinsics, no_core, rustc_attrs)]
#![no_core]

extern crate minicore;

#[rustc_intrinsic]
#[rustc_nounwind]
fn gpu_dynamic_groupshared_mem<T>() -> *mut T;

// CHECK-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4
// CHECK-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
// CHECK: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }
#[unsafe(no_mangle)]
pub fn fun() -> (*mut i32, *mut f64) {
let small = gpu_dynamic_groupshared_mem::<i32>();
let big = gpu_dynamic_groupshared_mem::<f64>(); // Increase alignment to 8
(small, big)
}