Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/compile_with_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,4 @@
tic = time.time()
for i in range(args.num_iters):
module = ox.Module(ctx, ptx, module_compile_options=compile_opts, pipeline_compile_options=pipeline_options)
print("Overall run time without tasks", time.time()-tic)
print("Overall run time without tasks", time.time()-tic)
6 changes: 1 addition & 5 deletions examples/cuda/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ __forceinline__ __device__ float3 toSRGB( const float3& c )
c.z < 0.0031308f ? 12.92f * c.z : 1.055f * powed.z - 0.055f );
}

//__forceinline__ __device__ float dequantizeUnsigned8Bits( const unsigned char i )
//{
// enum { N = (1 << 8) - 1 };
// return min((float)i / (float)N), 1.f)
//}
__forceinline__ __device__ unsigned char quantizeUnsigned8Bits( float x )
{
x = clamp( x, 0.0f, 1.0f );
Expand All @@ -60,6 +55,7 @@ __forceinline__ __device__ uchar4 make_color( const float3& c )
float3 srgb = toSRGB( clamp( c, 0.0f, 1.0f ) );
return make_uchar4( quantizeUnsigned8Bits( srgb.x ), quantizeUnsigned8Bits( srgb.y ), quantizeUnsigned8Bits( srgb.z ), 255u );
}

__forceinline__ __device__ uchar4 make_color( const float4& c )
{
return make_color( make_float3( c.x, c.y, c.z ) );
Expand Down
25 changes: 14 additions & 11 deletions examples/dynamic_geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@

DEBUG=False

if DEBUG:
exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW,
debug_level = ox.CompileDebugLevel.FULL
opt_level = ox.CompileOptimizationLevel.LEVEL_0
else:
exception_flags=ox.ExceptionFlags.NONE
debug_level = ox.CompileDebugLevel.MINIMAL
opt_level = ox.CompileOptimizationLevel.LEVEL_3


#------------------------------------------------------------------------------
# Local types
#------------------------------------------------------------------------------
Expand Down Expand Up @@ -336,15 +346,9 @@ def build_mesh_accel(state):


def create_module(state):
if DEBUG:
exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW,
else:
exception_flags=ox.ExceptionFlags.NONE

print("Triangle value", ox.PrimitiveTypeFlags.TRIANGLE.value)
pipeline_opts = ox.PipelineCompileOptions(
uses_motion_blur=False,
uses_primitive_type_flags =ox.PrimitiveTypeFlags.TRIANGLE,
uses_primitive_type_flags=ox.PrimitiveTypeFlags.TRIANGLE,
traversable_graph_flags=ox.TraversableGraphFlags.ALLOW_SINGLE_LEVEL_INSTANCING,
exception_flags=exception_flags,
num_payload_values=3,
Expand All @@ -353,8 +357,7 @@ def create_module(state):

compile_opts = ox.ModuleCompileOptions(
max_register_count=ox.ModuleCompileOptions.DEFAULT_MAX_REGISTER_COUNT,
opt_level=ox.CompileOptimizationLevel.DEFAULT,
debug_level=ox.CompileDebugLevel.MODERATE)
opt_level=opt_level, debug_level=debug_level)

cuda_source = os.path.join(script_dir, 'cuda', 'dynamic_geometry.cu')
state.module = ox.Module(state.ctx, cuda_source, compile_opts, pipeline_opts)
Expand All @@ -371,7 +374,7 @@ def create_pipeline(state):
program_grps = [state.raygen_grp, state.miss_grp, state.hit_grp]

link_opts = ox.PipelineLinkOptions(max_trace_depth=1,
debug_level=ox.CompileDebugLevel.MODERATE)
debug_level=debug_level)

pipeline = ox.Pipeline(state.ctx,
compile_options=state.pipeline_opts,
Expand Down Expand Up @@ -414,7 +417,7 @@ def create_sbt(state):
animation_time = 1.0

buffer_format = BufferImageFormat.UCHAR4
output_buffer_type = CudaOutputBufferType.CUDA_DEVICE
output_buffer_type = CudaOutputBufferType.enable_gl_interop()

init_camera_state(state)
create_context(state)
Expand Down
9 changes: 6 additions & 3 deletions examples/hello.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import os, sys, logging
import optix as ox
import cupy as cp
import numpy as np
from PIL import Image, ImageOps
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
log = logging.getLogger()

script_dir = os.path.dirname(__file__)
cuda_src = os.path.join(script_dir, "cuda", "hello.cu")

def create_module(ctx, pipeline_opts):
compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
module = ox.Module(ctx, 'cuda/hello.cu', compile_opts, pipeline_opts)
module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
return module


Expand Down
10 changes: 7 additions & 3 deletions examples/spheres.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import os, sys, logging
import optix as ox
import cupy as cp
import numpy as np
from PIL import Image, ImageOps
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
log = logging.getLogger()

script_dir = os.path.dirname(__file__)
cuda_src = os.path.join(script_dir, "cuda", "spheres.cu")

img_size = (1024, 768)

def compute_spheres_bbox(centers, radii):
Expand All @@ -23,7 +27,7 @@ def create_acceleration_structure(ctx, bboxes):

def create_module(ctx, pipeline_opts):
compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
module = ox.Module(ctx, 'cuda/spheres.cu', compile_opts, pipeline_opts)
module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
return module


Expand Down
120 changes: 113 additions & 7 deletions examples/sutil/cuda_output_buffer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import enum
import sys, os, enum
from packaging import version

import numpy as np
import cupy as cp
Expand All @@ -7,6 +8,49 @@

from .vecmath import vtype_to_dtype

try:
import cuda as _cuda
from cuda import cudart
has_cudart = True
has_gl_interop = version.parse(_cuda.__version__) >= version.parse("11.6.0")
except ImportError:
cudart = None
has_cudart = False
has_gl_interop = False

_cuda_opengl_interop_msg = (
"Cuda Python low level bindings v11.6.0 or later are required to enable "
f"Cuda/OpenGL interoperability.{os.linesep}You can install the missing package with:"
f"{os.linesep} {sys.executable} -m pip install --upgrade --user cuda-python"
)

if has_cudart:
def format_cudart_err(err):
return (
f"{cudart.cudaGetErrorName(err)[1].decode('utf-8')}({int(err)}): "
f"{cudart.cudaGetErrorString(err)[1].decode('utf-8')}"
)


def check_cudart_err(args):
if isinstance(args, tuple):
assert len(args) >= 1
err = args[0]
if len(args) == 1:
ret = None
elif len(args) == 2:
ret = args[1]
else:
ret = args[1:]
else:
ret = None

assert isinstance(err, cudart.cudaError_t), type(err)
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError(format_cudart_err(err))

return ret


class BufferImageFormat(enum.Enum):
UCHAR4=0
Expand Down Expand Up @@ -35,11 +79,22 @@ class CudaOutputBufferType(enum.Enum):
ZERO_COPY = 2, # general case, preferred for multi-gpu if not fully nvlink connected
CUDA_P2P = 3, # fully connected only, preferred for fully nvlink connected

@classmethod
def enable_gl_interop(cls, fallback=True):
if has_gl_interop:
return cls.GL_INTEROP
elif fallback:
msg = _cuda_opengl_interop_msg + f"{os.linesep}Falling back to slower CUDA_DEVICE output buffer."
print(msg)
return cls.CUDA_DEVICE
else:
raise RuntimeError(_cuda_opengl_interop_msg)


class CudaOutputBuffer:
__slots__ = ['_pixel_format', '_buffer_type', '_width', '_height',
'_device', '_device_idx', '_device', '_stream',
'_host_buffer', '_device_buffer', '_pbo']
'_host_buffer', '_device_buffer', '_cuda_gfx_ressource', '_pbo']

def __init__(self, buffer_type, pixel_format, width, height, device_idx=0):
for attr in self.__slots__:
Expand All @@ -50,6 +105,16 @@ def __init__(self, buffer_type, pixel_format, width, height, device_idx=0):
self.buffer_type = buffer_type
self.resize(width, height)
self.stream = None

if buffer_type is CudaOutputBufferType.GL_INTEROP:
if not has_gl_interop:
raise RuntimeError(_cuda_opengl_interop_msg)
device_count, device_ids = check_cudart_err( cudart.cudaGLGetDevices(1, cudart.cudaGLDeviceList.cudaGLDeviceListAll) )
if device_count <= 0:
raise RuntimeError("No OpenGL device found, cannot enable GL_INTEROP.")
elif device_ids[0] != device_idx:
raise RuntimeError(f"OpenGL device id {device_ids[0]} does not match requested "
f"device index {device_idx} for Cuda/OpenGL interop.")

self._reallocate_buffers()

Expand All @@ -69,13 +134,29 @@ def map(self):
self._make_current()
if (self._host_buffer is None) or (self._device_buffer is None):
self._reallocate_buffers()
return self._device_buffer.data.ptr
if self.buffer_type is CudaOutputBufferType.CUDA_DEVICE:
return self._device_buffer.data.ptr
elif self.buffer_type is CudaOutputBufferType.GL_INTEROP:
check_cudart_err(
cudart.cudaGraphicsMapResources(1, self._cuda_gfx_ressource, self._stream.ptr)
)
ptr, size = check_cudart_err(
cudart.cudaGraphicsResourceGetMappedPointer(self._cuda_gfx_ressource)
)
return ptr
else:
msg = f'Buffer type {self.buffer_type} has not been implemented yet.'
raise NotImplementedError(msg)

def unmap(self):
self._make_current()
buffer_type = self.buffer_type
if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
self._stream.synchronize()
elif buffer_type is CudaOutputBufferType.GL_INTEROP:
check_cudart_err(
cudart.cudaGraphicsUnmapResources(1, self._cuda_gfx_ressource, self._stream.ptr)
)
else:
msg = f'Buffer type {buffer_type} has not been implemented yet.'
raise NotImplementedError(msg)
Expand All @@ -85,12 +166,13 @@ def get_pbo(self):

self._make_current()

if self._pbo is None:
self._pbo = gl.glGenBuffers(1)

if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
if self._pbo is None:
self._pbo = gl.glGenBuffers(1)
self.copy_device_to_host()
self.copy_host_to_pbo()
elif buffer_type is CudaOutputBufferType.GL_INTEROP:
assert self._pbo is not None
else:
msg = f'Buffer type {buffer_type} has not been implemented yet.'
raise NotImplementedError(msg)
Expand Down Expand Up @@ -121,14 +203,26 @@ def _reallocate_buffers(self):

dtype = self.pixel_format
shape = (self.height, self.width)

self._host_buffer = np.empty(shape=shape, dtype=dtype)

if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
self._host_buffer = np.empty(shape=shape, dtype=dtype)
self._device_buffer = cp.empty(shape=shape, dtype=dtype)
if self._pbo is not None:
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo)
gl.glBufferData(gl.GL_ARRAY_BUFFER, self._host_buffer, gl.GL_STREAM_DRAW)
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
elif buffer_type is CudaOutputBufferType.GL_INTEROP:
self._pbo = gl.glGenBuffers(1) if self._pbo is None else self._pbo

gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo)
gl.glBufferData(gl.GL_ARRAY_BUFFER, self.width*self.height*dtype.itemsize, None, gl.GL_STREAM_DRAW)
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)

self.cuda_gfx_ressource = check_cudart_err(
cudart.cudaGraphicsGLRegisterBuffer(self._pbo,
cudart.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsWriteDiscard)
)
else:
msg = f'Buffer type {buffer_type} has not been implemented yet.'
raise NotImplementedError(msg)
Expand Down Expand Up @@ -215,3 +309,15 @@ def _set_stream(self, value):
assert isinstance(value, cp.cuda.Stream), type(value)
self._stream = value
stream = property(_get_stream, _set_stream)

def _get_cuda_gfx_ressource(self):
assert self._cuda_gfx_ressource is not None
return self._cuda_gfx_ressource
def _set_cuda_gfx_ressource(self, value):
if (self._cuda_gfx_ressource is not None) and (self._cuda_gfx_ressource != value):
check_cudart_err(
cudart.cudaGraphicsUnregisterResource(self._cuda_gfx_ressource)
)
self._cuda_gfx_ressource = value

cuda_gfx_ressource = property(_get_cuda_gfx_ressource, _set_cuda_gfx_ressource)
1 change: 0 additions & 1 deletion examples/sutil/gl_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ class GLDisplay:
'_quad_vertex_buffer', '_image_format']

def __init__(self, image_format):
print(image_format, type(image_format), isinstance(BufferImageFormat.UCHAR4, BufferImageFormat))
assert isinstance(image_format, BufferImageFormat)

vertex_array = gl.glGenVertexArrays(1)
Expand Down
6 changes: 5 additions & 1 deletion examples/triangle.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import os
import optix as ox
import cupy as cp
import numpy as np
from PIL import Image, ImageOps

script_dir = os.path.dirname(__file__)
cuda_src = os.path.join(script_dir, "cuda", "triangle.cu")

img_size = (1024, 768)

# use a regular function for logging
Expand All @@ -19,7 +23,7 @@ def create_acceleration_structure(ctx, vertices):

def create_module(ctx, pipeline_opts):
compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
module = ox.Module(ctx, 'cuda/triangle.cu', compile_opts, pipeline_opts)
module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
return module


Expand Down