From 98573f87cd7ed0436250f6332e6838e24a73fa1f Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Keck <jean-baptiste.keck@univ-grenoble-alpes.fr>
Date: Thu, 20 Jan 2022 14:57:26 +0100
Subject: [PATCH 1/2] Add opengl/interop when cuda-python is present

---
 examples/cuda/helpers.h              |   6 +-
 examples/dynamic_geometry.py         |   3 +-
 examples/sutil/cuda_output_buffer.py | 120 +++++++++++++++++++++++++--
 examples/sutil/gl_display.py         |   1 -
 setup.py                             |   1 -
 5 files changed, 115 insertions(+), 16 deletions(-)

diff --git a/examples/cuda/helpers.h b/examples/cuda/helpers.h
index 046cb47..af0995c 100644
--- a/examples/cuda/helpers.h
+++ b/examples/cuda/helpers.h
@@ -42,11 +42,6 @@ __forceinline__ __device__ float3 toSRGB( const float3& c )
         c.z < 0.0031308f ? 12.92f * c.z : 1.055f * powed.z - 0.055f );
 }
 
-//__forceinline__ __device__ float dequantizeUnsigned8Bits( const unsigned char i )
-//{
-//    enum { N = (1 << 8) - 1 };
-//    return min((float)i / (float)N), 1.f)
-//}
 __forceinline__ __device__ unsigned char quantizeUnsigned8Bits( float x )
 {
     x = clamp( x, 0.0f, 1.0f );
@@ -60,6 +55,7 @@ __forceinline__ __device__ uchar4 make_color( const float3& c )
     float3 srgb = toSRGB( clamp( c, 0.0f, 1.0f ) );
     return make_uchar4( quantizeUnsigned8Bits( srgb.x ), quantizeUnsigned8Bits( srgb.y ), quantizeUnsigned8Bits( srgb.z ), 255u );
 }
+
 __forceinline__ __device__ uchar4 make_color( const float4& c )
 {
     return make_color( make_float3( c.x, c.y, c.z ) );
diff --git a/examples/dynamic_geometry.py b/examples/dynamic_geometry.py
index 49f94df..ed8296b 100644
--- a/examples/dynamic_geometry.py
+++ b/examples/dynamic_geometry.py
@@ -341,7 +341,6 @@ def create_module(state):
     else:
         exception_flags=ox.ExceptionFlags.NONE
 
-    print("Triangle value", ox.PrimitiveTypeFlags.TRIANGLE.value)
     pipeline_opts = ox.PipelineCompileOptions(
             uses_motion_blur=False,
             uses_primitive_type_flags =ox.PrimitiveTypeFlags.TRIANGLE,
@@ -414,7 +413,7 @@ def create_sbt(state):
     animation_time = 1.0
 
     buffer_format = BufferImageFormat.UCHAR4
-    output_buffer_type = CudaOutputBufferType.CUDA_DEVICE
+    output_buffer_type = CudaOutputBufferType.enable_gl_interop()
 
     init_camera_state(state)
     create_context(state)
diff --git a/examples/sutil/cuda_output_buffer.py b/examples/sutil/cuda_output_buffer.py
index 059fa6d..d09353f 100644
--- a/examples/sutil/cuda_output_buffer.py
+++ b/examples/sutil/cuda_output_buffer.py
@@ -1,4 +1,5 @@
-import enum
+import sys, os, enum
+from packaging import version
 
 import numpy as np
 import cupy as cp
@@ -7,6 +8,49 @@
 
 from .vecmath import vtype_to_dtype
 
+try:
+    import cuda as _cuda
+    from cuda import cudart
+    has_cudart = True
+    has_gl_interop = version.parse(_cuda.__version__) >= version.parse("11.6.0")
+except ImportError:
+    cudart = None
+    has_cudart = False
+    has_gl_interop = False
+
+_cuda_opengl_interop_msg = (
+    "Cuda Python low level bindings v11.6.0 or later are required to enable "
+   f"Cuda/OpenGL interoperability.{os.linesep}You can install the missing package with:"
+   f"{os.linesep}  {sys.executable} -m pip install --upgrade --user cuda-python"
+)
+
+if has_cudart:
+    def format_cudart_err(err):
+        return (
+            f"{cudart.cudaGetErrorName(err)[1].decode('utf-8')}({int(err)}): "
+            f"{cudart.cudaGetErrorString(err)[1].decode('utf-8')}"
+        )
+
+
+    def check_cudart_err(args):
+        if isinstance(args, tuple):
+            assert len(args) >= 1
+            err = args[0]
+            if len(args) == 1:
+                ret = None
+            elif len(args) == 2:
+                ret = args[1]
+            else:
+                ret = args[1:]
+        else:
+            ret = None
+
+        assert isinstance(err, cudart.cudaError_t), type(err)
+        if err != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError(format_cudart_err(err))
+
+        return ret
+
 
 class BufferImageFormat(enum.Enum):
     UCHAR4=0
@@ -35,11 +79,22 @@ class CudaOutputBufferType(enum.Enum):
     ZERO_COPY   = 2,  # general case, preferred for multi-gpu if not fully nvlink connected
     CUDA_P2P    = 3,  # fully connected only, preferred for fully nvlink connected
 
+    @classmethod
+    def enable_gl_interop(cls, fallback=True):
+        if has_gl_interop:
+            return cls.GL_INTEROP
+        elif fallback:
+            msg = _cuda_opengl_interop_msg + f"{os.linesep}Falling back to slower CUDA_DEVICE output buffer."
+            print(msg)
+            return cls.CUDA_DEVICE
+        else:
+            raise RuntimeError(_cuda_opengl_interop_msg)
+
 
 class CudaOutputBuffer:
     __slots__ = ['_pixel_format', '_buffer_type', '_width', '_height',
             '_device', '_device_idx', '_device', '_stream',
-            '_host_buffer', '_device_buffer', '_pbo']
+            '_host_buffer', '_device_buffer', '_cuda_gfx_ressource', '_pbo']
 
     def __init__(self, buffer_type, pixel_format, width, height, device_idx=0):
         for attr in self.__slots__:
@@ -50,6 +105,16 @@ def __init__(self, buffer_type, pixel_format, width, height, device_idx=0):
         self.buffer_type = buffer_type
         self.resize(width, height)
         self.stream = None
+        
+        if buffer_type is CudaOutputBufferType.GL_INTEROP:
+            if not has_gl_interop:
+                raise RuntimeError(_cuda_opengl_interop_msg)
+            device_count, device_ids = check_cudart_err( cudart.cudaGLGetDevices(1, cudart.cudaGLDeviceList.cudaGLDeviceListAll) )
+            if device_count <= 0:
+                raise RuntimeError("No OpenGL device found, cannot enable GL_INTEROP.")
+            elif device_ids[0] != device_idx:
+                raise RuntimeError(f"OpenGL device id {device_ids[0]} does not match requested "
+                                   f"device index {device_idx} for Cuda/OpenGL interop.")
 
         self._reallocate_buffers()
 
@@ -69,13 +134,29 @@ def map(self):
         self._make_current()
         if (self._host_buffer is None) or (self._device_buffer is None):
             self._reallocate_buffers()
-        return self._device_buffer.data.ptr
+        if self.buffer_type is CudaOutputBufferType.CUDA_DEVICE:
+            return self._device_buffer.data.ptr
+        elif self.buffer_type is CudaOutputBufferType.GL_INTEROP:
+            check_cudart_err(
+                cudart.cudaGraphicsMapResources(1, self._cuda_gfx_ressource, self._stream.ptr)
+            )
+            ptr, size = check_cudart_err(
+                cudart.cudaGraphicsResourceGetMappedPointer(self._cuda_gfx_ressource)
+            )
+            return ptr
+        else:
+            msg = f'Buffer type {self.buffer_type} has not been implemented yet.'
+            raise NotImplementedError(msg)
 
     def unmap(self):
         self._make_current()
         buffer_type = self.buffer_type
         if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
             self._stream.synchronize()
+        elif buffer_type is CudaOutputBufferType.GL_INTEROP:
+            check_cudart_err(
+                cudart.cudaGraphicsUnmapResources(1, self._cuda_gfx_ressource, self._stream.ptr)
+            )
         else:
             msg = f'Buffer type {buffer_type} has not been implemented yet.'
             raise NotImplementedError(msg)
@@ -85,12 +166,13 @@ def get_pbo(self):
 
         self._make_current()
 
-        if self._pbo is None:
-            self._pbo = gl.glGenBuffers(1)
-
         if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
+            if self._pbo is None:
+                self._pbo = gl.glGenBuffers(1)
             self.copy_device_to_host()
             self.copy_host_to_pbo()
+        elif buffer_type is CudaOutputBufferType.GL_INTEROP:
+            assert self._pbo is not None
         else:
             msg = f'Buffer type {buffer_type} has not been implemented yet.'
             raise NotImplementedError(msg)
@@ -121,14 +203,26 @@ def _reallocate_buffers(self):
 
         dtype = self.pixel_format
         shape = (self.height, self.width)
+        
+        self._host_buffer = np.empty(shape=shape, dtype=dtype)
 
         if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
-            self._host_buffer = np.empty(shape=shape, dtype=dtype)
             self._device_buffer = cp.empty(shape=shape, dtype=dtype)
             if self._pbo is not None:
                 gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo)
                 gl.glBufferData(gl.GL_ARRAY_BUFFER, self._host_buffer, gl.GL_STREAM_DRAW)
                 gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+        elif buffer_type is CudaOutputBufferType.GL_INTEROP:
+            self._pbo = gl.glGenBuffers(1) if self._pbo is None else self._pbo
+            
+            gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo)
+            gl.glBufferData(gl.GL_ARRAY_BUFFER, self.width*self.height*dtype.itemsize, None, gl.GL_STREAM_DRAW)
+            gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+        
+            self.cuda_gfx_ressource = check_cudart_err(
+                cudart.cudaGraphicsGLRegisterBuffer(self._pbo,
+                    cudart.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsWriteDiscard)
+            )
         else:
             msg = f'Buffer type {buffer_type} has not been implemented yet.'
             raise NotImplementedError(msg)
@@ -215,3 +309,15 @@ def _set_stream(self, value):
         assert isinstance(value, cp.cuda.Stream), type(value)
         self._stream = value
     stream = property(_get_stream, _set_stream)
+    
+    def _get_cuda_gfx_ressource(self):
+        assert self._cuda_gfx_ressource is not None
+        return self._cuda_gfx_ressource
+    def _set_cuda_gfx_ressource(self, value):
+        if (self._cuda_gfx_ressource is not None) and (self._cuda_gfx_ressource != value):
+            check_cudart_err(
+                cudart.cudaGraphicsUnregisterResource(self._cuda_gfx_ressource)
+            )
+        self._cuda_gfx_ressource = value
+
+    cuda_gfx_ressource = property(_get_cuda_gfx_ressource, _set_cuda_gfx_ressource)
diff --git a/examples/sutil/gl_display.py b/examples/sutil/gl_display.py
index bee2939..280cf1e 100644
--- a/examples/sutil/gl_display.py
+++ b/examples/sutil/gl_display.py
@@ -49,7 +49,6 @@ class GLDisplay:
             '_quad_vertex_buffer', '_image_format']
 
     def __init__(self, image_format):
-        print(image_format, type(image_format), isinstance(BufferImageFormat.UCHAR4, BufferImageFormat))
         assert isinstance(image_format, BufferImageFormat)
 
         vertex_array = gl.glGenVertexArrays(1)
diff --git a/setup.py b/setup.py
index 2658567..4457906 100644
--- a/setup.py
+++ b/setup.py
@@ -75,7 +75,6 @@ def import_module_from_path(path):
     ext_modules=extensions,
     install_requires=[
         'numpy',
-        'cupy>=9.0'
     ],
     license="MIT",
     classifiers=[

From e6fe401d49bce0bc4068cbdbece35094f81d8b2d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Keck <jean-baptiste.keck@univ-grenoble-alpes.fr>
Date: Fri, 21 Jan 2022 16:24:57 +0100
Subject: [PATCH 2/2] Fix examples cuda source relative path

---
 examples/compile_with_tasks.py |  2 +-
 examples/dynamic_geometry.py   | 22 +++++++++++++---------
 examples/hello.py              |  9 ++++++---
 examples/spheres.py            | 10 +++++++---
 examples/triangle.py           |  6 +++++-
 setup.py                       |  1 +
 6 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/examples/compile_with_tasks.py b/examples/compile_with_tasks.py
index 9e967ff..f3b2a35 100644
--- a/examples/compile_with_tasks.py
+++ b/examples/compile_with_tasks.py
@@ -72,4 +72,4 @@
         tic = time.time()
         for i in range(args.num_iters):
             module = ox.Module(ctx, ptx, module_compile_options=compile_opts, pipeline_compile_options=pipeline_options)
-        print("Overall run time without tasks", time.time()-tic)
\ No newline at end of file
+        print("Overall run time without tasks", time.time()-tic)
diff --git a/examples/dynamic_geometry.py b/examples/dynamic_geometry.py
index ed8296b..a16acf9 100644
--- a/examples/dynamic_geometry.py
+++ b/examples/dynamic_geometry.py
@@ -18,6 +18,16 @@
 
 DEBUG=False
 
+if DEBUG:
+    exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW,
+    debug_level = ox.CompileDebugLevel.FULL
+    opt_level = ox.CompileOptimizationLevel.LEVEL_0
+else:
+    exception_flags=ox.ExceptionFlags.NONE
+    debug_level = ox.CompileDebugLevel.MINIMAL
+    opt_level = ox.CompileOptimizationLevel.LEVEL_3
+
+
 #------------------------------------------------------------------------------
 # Local types
 #------------------------------------------------------------------------------
@@ -336,14 +346,9 @@ def build_mesh_accel(state):
 
 
 def create_module(state):
-    if DEBUG:
-        exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW,
-    else:
-        exception_flags=ox.ExceptionFlags.NONE
-
     pipeline_opts = ox.PipelineCompileOptions(
             uses_motion_blur=False,
-            uses_primitive_type_flags =ox.PrimitiveTypeFlags.TRIANGLE,
+            uses_primitive_type_flags=ox.PrimitiveTypeFlags.TRIANGLE,
             traversable_graph_flags=ox.TraversableGraphFlags.ALLOW_SINGLE_LEVEL_INSTANCING,
             exception_flags=exception_flags,
             num_payload_values=3,
@@ -352,8 +357,7 @@ def create_module(state):
 
     compile_opts = ox.ModuleCompileOptions(
             max_register_count=ox.ModuleCompileOptions.DEFAULT_MAX_REGISTER_COUNT,
-            opt_level=ox.CompileOptimizationLevel.DEFAULT,
-            debug_level=ox.CompileDebugLevel.MODERATE)
+            opt_level=opt_level, debug_level=debug_level)
 
     cuda_source = os.path.join(script_dir, 'cuda', 'dynamic_geometry.cu')
     state.module = ox.Module(state.ctx, cuda_source, compile_opts, pipeline_opts)
@@ -370,7 +374,7 @@ def create_pipeline(state):
     program_grps = [state.raygen_grp, state.miss_grp, state.hit_grp]
 
     link_opts = ox.PipelineLinkOptions(max_trace_depth=1,
-                                       debug_level=ox.CompileDebugLevel.MODERATE)
+                                       debug_level=debug_level)
 
     pipeline = ox.Pipeline(state.ctx,
                            compile_options=state.pipeline_opts,
diff --git a/examples/hello.py b/examples/hello.py
index e79fdaa..57ea583 100644
--- a/examples/hello.py
+++ b/examples/hello.py
@@ -1,15 +1,18 @@
+import os, sys, logging
 import optix as ox
 import cupy as cp
 import numpy as np
 from PIL import Image, ImageOps
-import logging
-import sys
+
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 log = logging.getLogger()
 
+script_dir = os.path.dirname(__file__)
+cuda_src = os.path.join(script_dir, "cuda", "hello.cu")
+
 def create_module(ctx, pipeline_opts):
     compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
-    module = ox.Module(ctx, 'cuda/hello.cu', compile_opts, pipeline_opts)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
     return module
 
 
diff --git a/examples/spheres.py b/examples/spheres.py
index a7776e9..c2b55d3 100644
--- a/examples/spheres.py
+++ b/examples/spheres.py
@@ -1,11 +1,15 @@
+import os, sys, logging
 import optix as ox
 import cupy as cp
 import numpy as np
 from PIL import Image, ImageOps
-import logging
-import sys
+
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 log = logging.getLogger()
+
+script_dir = os.path.dirname(__file__)
+cuda_src = os.path.join(script_dir, "cuda", "spheres.cu")
+
 img_size = (1024, 768)
 
 def compute_spheres_bbox(centers, radii):
@@ -23,7 +27,7 @@ def create_acceleration_structure(ctx, bboxes):
 
 def create_module(ctx, pipeline_opts):
     compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
-    module = ox.Module(ctx, 'cuda/spheres.cu', compile_opts, pipeline_opts)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
     return module
 
 
diff --git a/examples/triangle.py b/examples/triangle.py
index b9d7bb3..2b319fa 100644
--- a/examples/triangle.py
+++ b/examples/triangle.py
@@ -1,8 +1,12 @@
+import os
 import optix as ox
 import cupy as cp
 import numpy as np
 from PIL import Image, ImageOps
 
+script_dir = os.path.dirname(__file__)
+cuda_src = os.path.join(script_dir, "cuda", "triangle.cu")
+
 img_size = (1024, 768)
 
 # use a regular function for logging
@@ -19,7 +23,7 @@ def create_acceleration_structure(ctx, vertices):
 
 def create_module(ctx, pipeline_opts):
     compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
-    module = ox.Module(ctx, 'cuda/triangle.cu', compile_opts, pipeline_opts)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
     return module
 
 
diff --git a/setup.py b/setup.py
index 4457906..2658567 100644
--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,7 @@ def import_module_from_path(path):
     ext_modules=extensions,
     install_requires=[
         'numpy',
+        'cupy>=9.0'
     ],
     license="MIT",
     classifiers=[