From 98573f87cd7ed0436250f6332e6838e24a73fa1f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Keck Date: Thu, 20 Jan 2022 14:57:26 +0100 Subject: [PATCH 1/2] Add opengl/interop when cuda-python is present --- examples/cuda/helpers.h | 6 +- examples/dynamic_geometry.py | 3 +- examples/sutil/cuda_output_buffer.py | 120 +++++++++++++++++++++++++-- examples/sutil/gl_display.py | 1 - setup.py | 1 - 5 files changed, 115 insertions(+), 16 deletions(-) diff --git a/examples/cuda/helpers.h b/examples/cuda/helpers.h index 046cb47..af0995c 100644 --- a/examples/cuda/helpers.h +++ b/examples/cuda/helpers.h @@ -42,11 +42,6 @@ __forceinline__ __device__ float3 toSRGB( const float3& c ) c.z < 0.0031308f ? 12.92f * c.z : 1.055f * powed.z - 0.055f ); } -//__forceinline__ __device__ float dequantizeUnsigned8Bits( const unsigned char i ) -//{ -// enum { N = (1 << 8) - 1 }; -// return min((float)i / (float)N), 1.f) -//} __forceinline__ __device__ unsigned char quantizeUnsigned8Bits( float x ) { x = clamp( x, 0.0f, 1.0f ); @@ -60,6 +55,7 @@ __forceinline__ __device__ uchar4 make_color( const float3& c ) float3 srgb = toSRGB( clamp( c, 0.0f, 1.0f ) ); return make_uchar4( quantizeUnsigned8Bits( srgb.x ), quantizeUnsigned8Bits( srgb.y ), quantizeUnsigned8Bits( srgb.z ), 255u ); } + __forceinline__ __device__ uchar4 make_color( const float4& c ) { return make_color( make_float3( c.x, c.y, c.z ) ); diff --git a/examples/dynamic_geometry.py b/examples/dynamic_geometry.py index 49f94df..ed8296b 100644 --- a/examples/dynamic_geometry.py +++ b/examples/dynamic_geometry.py @@ -341,7 +341,6 @@ def create_module(state): else: exception_flags=ox.ExceptionFlags.NONE - print("Triangle value", ox.PrimitiveTypeFlags.TRIANGLE.value) pipeline_opts = ox.PipelineCompileOptions( uses_motion_blur=False, uses_primitive_type_flags =ox.PrimitiveTypeFlags.TRIANGLE, @@ -414,7 +413,7 @@ def create_sbt(state): animation_time = 1.0 buffer_format = BufferImageFormat.UCHAR4 - output_buffer_type = CudaOutputBufferType.CUDA_DEVICE + output_buffer_type = CudaOutputBufferType.enable_gl_interop() init_camera_state(state) create_context(state) diff --git a/examples/sutil/cuda_output_buffer.py b/examples/sutil/cuda_output_buffer.py index 059fa6d..d09353f 100644 --- a/examples/sutil/cuda_output_buffer.py +++ b/examples/sutil/cuda_output_buffer.py @@ -1,4 +1,5 @@ -import enum +import sys, os, enum +from packaging import version import numpy as np import cupy as cp @@ -7,6 +8,49 @@ from .vecmath import vtype_to_dtype +try: + import cuda as _cuda + from cuda import cudart + has_cudart = True + has_gl_interop = version.parse(_cuda.__version__) >= version.parse("11.6.0") +except ImportError: + cudart = None + has_cudart = False + has_gl_interop = False + +_cuda_opengl_interop_msg = ( + "Cuda Python low level bindings v11.6.0 or later are required to enable " + f"Cuda/OpenGL interoperability.{os.linesep}You can install the missing package with:" + f"{os.linesep} {sys.executable} -m pip install --upgrade --user cuda-python" +) + +if has_cudart: + def format_cudart_err(err): + return ( + f"{cudart.cudaGetErrorName(err)[1].decode('utf-8')}({int(err)}): " + f"{cudart.cudaGetErrorString(err)[1].decode('utf-8')}" + ) + + + def check_cudart_err(args): + if isinstance(args, tuple): + assert len(args) >= 1 + err = args[0] + if len(args) == 1: + ret = None + elif len(args) == 2: + ret = args[1] + else: + ret = args[1:] + else: + ret = None + + assert isinstance(err, cudart.cudaError_t), type(err) + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(format_cudart_err(err)) + + return ret + class BufferImageFormat(enum.Enum): UCHAR4=0 @@ -35,11 +79,22 @@ class CudaOutputBufferType(enum.Enum): ZERO_COPY = 2, # general case, preferred for multi-gpu if not fully nvlink connected CUDA_P2P = 3, # fully connected only, preferred for fully nvlink connected + @classmethod + def enable_gl_interop(cls, fallback=True): + if has_gl_interop: + return cls.GL_INTEROP + elif fallback: + msg = _cuda_opengl_interop_msg + f"{os.linesep}Falling back to slower CUDA_DEVICE output buffer." + print(msg) + return cls.CUDA_DEVICE + else: + raise RuntimeError(_cuda_opengl_interop_msg) + class CudaOutputBuffer: __slots__ = ['_pixel_format', '_buffer_type', '_width', '_height', '_device', '_device_idx', '_device', '_stream', - '_host_buffer', '_device_buffer', '_pbo'] + '_host_buffer', '_device_buffer', '_cuda_gfx_ressource', '_pbo'] def __init__(self, buffer_type, pixel_format, width, height, device_idx=0): for attr in self.__slots__: @@ -50,6 +105,16 @@ def __init__(self, buffer_type, pixel_format, width, height, device_idx=0): self.buffer_type = buffer_type self.resize(width, height) self.stream = None + + if buffer_type is CudaOutputBufferType.GL_INTEROP: + if not has_gl_interop: + raise RuntimeError(_cuda_opengl_interop_msg) + device_count, device_ids = check_cudart_err( cudart.cudaGLGetDevices(1, cudart.cudaGLDeviceList.cudaGLDeviceListAll) ) + if device_count <= 0: + raise RuntimeError("No OpenGL device found, cannot enable GL_INTEROP.") + elif device_ids[0] != device_idx: + raise RuntimeError(f"OpenGL device id {device_ids[0]} does not match requested " + f"device index {device_idx} for Cuda/OpenGL interop.") self._reallocate_buffers() @@ -69,13 +134,29 @@ def map(self): self._make_current() if (self._host_buffer is None) or (self._device_buffer is None): self._reallocate_buffers() - return self._device_buffer.data.ptr + if self.buffer_type is CudaOutputBufferType.CUDA_DEVICE: + return self._device_buffer.data.ptr + elif self.buffer_type is CudaOutputBufferType.GL_INTEROP: + check_cudart_err( + cudart.cudaGraphicsMapResources(1, self._cuda_gfx_ressource, self._stream.ptr) + ) + ptr, size = check_cudart_err( + cudart.cudaGraphicsResourceGetMappedPointer(self._cuda_gfx_ressource) + ) + return ptr + else: + msg = f'Buffer type {self.buffer_type} has not been implemented yet.' + raise NotImplementedError(msg) def unmap(self): self._make_current() buffer_type = self.buffer_type if buffer_type is CudaOutputBufferType.CUDA_DEVICE: self._stream.synchronize() + elif buffer_type is CudaOutputBufferType.GL_INTEROP: + check_cudart_err( + cudart.cudaGraphicsUnmapResources(1, self._cuda_gfx_ressource, self._stream.ptr) + ) else: msg = f'Buffer type {buffer_type} has not been implemented yet.' raise NotImplementedError(msg) @@ -85,12 +166,13 @@ def get_pbo(self): self._make_current() - if self._pbo is None: - self._pbo = gl.glGenBuffers(1) - if buffer_type is CudaOutputBufferType.CUDA_DEVICE: + if self._pbo is None: + self._pbo = gl.glGenBuffers(1) self.copy_device_to_host() self.copy_host_to_pbo() + elif buffer_type is CudaOutputBufferType.GL_INTEROP: + assert self._pbo is not None else: msg = f'Buffer type {buffer_type} has not been implemented yet.' raise NotImplementedError(msg) @@ -121,14 +203,26 @@ def _reallocate_buffers(self): dtype = self.pixel_format shape = (self.height, self.width) + + self._host_buffer = np.empty(shape=shape, dtype=dtype) if buffer_type is CudaOutputBufferType.CUDA_DEVICE: - self._host_buffer = np.empty(shape=shape, dtype=dtype) self._device_buffer = cp.empty(shape=shape, dtype=dtype) if self._pbo is not None: gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo) gl.glBufferData(gl.GL_ARRAY_BUFFER, self._host_buffer, gl.GL_STREAM_DRAW) gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0) + elif buffer_type is CudaOutputBufferType.GL_INTEROP: + self._pbo = gl.glGenBuffers(1) if self._pbo is None else self._pbo + + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo) + gl.glBufferData(gl.GL_ARRAY_BUFFER, self.width*self.height*dtype.itemsize, None, gl.GL_STREAM_DRAW) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0) + + self.cuda_gfx_ressource = check_cudart_err( + cudart.cudaGraphicsGLRegisterBuffer(self._pbo, + cudart.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsWriteDiscard) + ) else: msg = f'Buffer type {buffer_type} has not been implemented yet.' raise NotImplementedError(msg) @@ -215,3 +309,15 @@ def _set_stream(self, value): assert isinstance(value, cp.cuda.Stream), type(value) self._stream = value stream = property(_get_stream, _set_stream) + + def _get_cuda_gfx_ressource(self): + assert self._cuda_gfx_ressource is not None + return self._cuda_gfx_ressource + def _set_cuda_gfx_ressource(self, value): + if (self._cuda_gfx_ressource is not None) and (self._cuda_gfx_ressource != value): + check_cudart_err( + cudart.cudaGraphicsUnregisterResource(self._cuda_gfx_ressource) + ) + self._cuda_gfx_ressource = value + + cuda_gfx_ressource = property(_get_cuda_gfx_ressource, _set_cuda_gfx_ressource) diff --git a/examples/sutil/gl_display.py b/examples/sutil/gl_display.py index bee2939..280cf1e 100644 --- a/examples/sutil/gl_display.py +++ b/examples/sutil/gl_display.py @@ -49,7 +49,6 @@ class GLDisplay: '_quad_vertex_buffer', '_image_format'] def __init__(self, image_format): - print(image_format, type(image_format), isinstance(BufferImageFormat.UCHAR4, BufferImageFormat)) assert isinstance(image_format, BufferImageFormat) vertex_array = gl.glGenVertexArrays(1) diff --git a/setup.py b/setup.py index 2658567..4457906 100644 --- a/setup.py +++ b/setup.py @@ -75,7 +75,6 @@ def import_module_from_path(path): ext_modules=extensions, install_requires=[ 'numpy', - 'cupy>=9.0' ], license="MIT", classifiers=[ From e6fe401d49bce0bc4068cbdbece35094f81d8b2d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Keck Date: Fri, 21 Jan 2022 16:24:57 +0100 Subject: [PATCH 2/2] Fix examples cuda source relative path --- examples/compile_with_tasks.py | 2 +- examples/dynamic_geometry.py | 22 +++++++++++++--------- examples/hello.py | 9 ++++++--- examples/spheres.py | 10 +++++++--- examples/triangle.py | 6 +++++- setup.py | 1 + 6 files changed, 33 insertions(+), 17 deletions(-) diff --git a/examples/compile_with_tasks.py b/examples/compile_with_tasks.py index 9e967ff..f3b2a35 100644 --- a/examples/compile_with_tasks.py +++ b/examples/compile_with_tasks.py @@ -72,4 +72,4 @@ tic = time.time() for i in range(args.num_iters): module = ox.Module(ctx, ptx, module_compile_options=compile_opts, pipeline_compile_options=pipeline_options) - print("Overall run time without tasks", time.time()-tic) \ No newline at end of file + print("Overall run time without tasks", time.time()-tic) diff --git a/examples/dynamic_geometry.py b/examples/dynamic_geometry.py index ed8296b..a16acf9 100644 --- a/examples/dynamic_geometry.py +++ b/examples/dynamic_geometry.py @@ -18,6 +18,16 @@ DEBUG=False +if DEBUG: + exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW, + debug_level = ox.CompileDebugLevel.FULL + opt_level = ox.CompileOptimizationLevel.LEVEL_0 +else: + exception_flags=ox.ExceptionFlags.NONE + debug_level = ox.CompileDebugLevel.MINIMAL + opt_level = ox.CompileOptimizationLevel.LEVEL_3 + + #------------------------------------------------------------------------------ # Local types #------------------------------------------------------------------------------ @@ -336,14 +346,9 @@ def build_mesh_accel(state): def create_module(state): - if DEBUG: - exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW, - else: - exception_flags=ox.ExceptionFlags.NONE - pipeline_opts = ox.PipelineCompileOptions( uses_motion_blur=False, - uses_primitive_type_flags =ox.PrimitiveTypeFlags.TRIANGLE, + uses_primitive_type_flags=ox.PrimitiveTypeFlags.TRIANGLE, traversable_graph_flags=ox.TraversableGraphFlags.ALLOW_SINGLE_LEVEL_INSTANCING, exception_flags=exception_flags, num_payload_values=3, @@ -352,8 +357,7 @@ def create_module(state): compile_opts = ox.ModuleCompileOptions( max_register_count=ox.ModuleCompileOptions.DEFAULT_MAX_REGISTER_COUNT, - opt_level=ox.CompileOptimizationLevel.DEFAULT, - debug_level=ox.CompileDebugLevel.MODERATE) + opt_level=opt_level, debug_level=debug_level) cuda_source = os.path.join(script_dir, 'cuda', 'dynamic_geometry.cu') state.module = ox.Module(state.ctx, cuda_source, compile_opts, pipeline_opts) @@ -370,7 +374,7 @@ def create_pipeline(state): program_grps = [state.raygen_grp, state.miss_grp, state.hit_grp] link_opts = ox.PipelineLinkOptions(max_trace_depth=1, - debug_level=ox.CompileDebugLevel.MODERATE) + debug_level=debug_level) pipeline = ox.Pipeline(state.ctx, compile_options=state.pipeline_opts, diff --git a/examples/hello.py b/examples/hello.py index e79fdaa..57ea583 100644 --- a/examples/hello.py +++ b/examples/hello.py @@ -1,15 +1,18 @@ +import os, sys, logging import optix as ox import cupy as cp import numpy as np from PIL import Image, ImageOps -import logging -import sys + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) log = logging.getLogger() +script_dir = os.path.dirname(__file__) +cuda_src = os.path.join(script_dir, "cuda", "hello.cu") + def create_module(ctx, pipeline_opts): compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0) - module = ox.Module(ctx, 'cuda/hello.cu', compile_opts, pipeline_opts) + module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts) return module diff --git a/examples/spheres.py b/examples/spheres.py index a7776e9..c2b55d3 100644 --- a/examples/spheres.py +++ b/examples/spheres.py @@ -1,11 +1,15 @@ +import os, sys, logging import optix as ox import cupy as cp import numpy as np from PIL import Image, ImageOps -import logging -import sys + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) log = logging.getLogger() + +script_dir = os.path.dirname(__file__) +cuda_src = os.path.join(script_dir, "cuda", "spheres.cu") + img_size = (1024, 768) def compute_spheres_bbox(centers, radii): @@ -23,7 +27,7 @@ def create_acceleration_structure(ctx, bboxes): def create_module(ctx, pipeline_opts): compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0) - module = ox.Module(ctx, 'cuda/spheres.cu', compile_opts, pipeline_opts) + module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts) return module diff --git a/examples/triangle.py b/examples/triangle.py index b9d7bb3..2b319fa 100644 --- a/examples/triangle.py +++ b/examples/triangle.py @@ -1,8 +1,12 @@ +import os import optix as ox import cupy as cp import numpy as np from PIL import Image, ImageOps +script_dir = os.path.dirname(__file__) +cuda_src = os.path.join(script_dir, "cuda", "triangle.cu") + img_size = (1024, 768) # use a regular function for logging @@ -19,7 +23,7 @@ def create_acceleration_structure(ctx, vertices): def create_module(ctx, pipeline_opts): compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0) - module = ox.Module(ctx, 'cuda/triangle.cu', compile_opts, pipeline_opts) + module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts) return module diff --git a/setup.py b/setup.py index 4457906..2658567 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ def import_module_from_path(path): ext_modules=extensions, install_requires=[ 'numpy', + 'cupy>=9.0' ], license="MIT", classifiers=[