2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,9 @@
"""CUDA Driver
- Driver API binding
- NVVM API binding
- Device array implementation
"""
from numba.core import config
assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator'

View File

@@ -0,0 +1,904 @@
"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object. If it exists and evaluates to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
"""
import math
import functools
import operator
import copy
from ctypes import c_void_p
import numpy as np
import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices, dummyarray
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.np.numpy_support import numpy_version
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn
try:
lru_cache = getattr(functools, 'lru_cache')(None)
except AttributeError:
# Python 3.1 or lower
def lru_cache(func):
return func
def is_cuda_ndarray(obj):
"Check if an object is a CUDA ndarray"
return getattr(obj, '__cuda_ndarray__', False)
def verify_cuda_ndarray_interface(obj):
"Verify the CUDA ndarray interface for an obj"
require_cuda_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_cuda_ndarray(obj):
"Raises ValueError is is_cuda_ndarray(obj) evaluates False"
if not is_cuda_ndarray(obj):
raise ValueError('require a CUDA ndarray object')
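# Example: a minimal sketch of an object that satisfies the duck-typing
# checks above. The class and its attribute values are hypothetical; any
# object exposing these attributes passes verify_cuda_ndarray_interface().
def _example_duck_typed_cuda_ndarray():
    class FakeCudaArray:
        __cuda_memory__ = True
        __cuda_ndarray__ = True
        shape = (2, 3)
        strides = (24, 8)   # row-major strides for 8-byte items
        dtype = np.dtype(np.float64)
        size = 6
    obj = FakeCudaArray()
    assert is_cuda_ndarray(obj)
    verify_cuda_ndarray_interface(obj)  # raises if any attribute is wrong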
class DeviceNDArrayBase(_devicearray.DeviceArray):
"""A on GPU NDArray representation
"""
__cuda_memory__ = True
__cuda_ndarray__ = True # There must be a gpu_data attribute
def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as np.dtype coercible object.
stream
cuda stream.
gpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
dtype = np.dtype(dtype)
self.ndim = len(shape)
if len(strides) != self.ndim:
raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = dtype
self.size = int(functools.reduce(operator.mul, self.shape, 1))
# prepare gpu memory
if self.size > 0:
if gpu_data is None:
self.alloc_size = _driver.memory_size_from_info(
self.shape, self.strides, self.dtype.itemsize)
gpu_data = devices.get_context().memalloc(self.alloc_size)
else:
self.alloc_size = _driver.device_memory_size(gpu_data)
else:
# Make NULL pointer for empty allocation
if _driver.USE_NV_BINDING:
null = _driver.binding.CUdeviceptr(0)
else:
null = c_void_p(0)
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
pointer=null, size=0)
self.alloc_size = 0
self.gpu_data = gpu_data
self.stream = stream
@property
def __cuda_array_interface__(self):
if _driver.USE_NV_BINDING:
if self.device_ctypes_pointer is not None:
ptr = int(self.device_ctypes_pointer)
else:
ptr = 0
else:
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0
return {
'shape': tuple(self.shape),
'strides': None if is_contiguous(self) else tuple(self.strides),
'data': (ptr, False),
'typestr': self.dtype.str,
'stream': int(self.stream) if self.stream != 0 else None,
'version': 3,
}
def bind(self, stream=0):
"""Bind a CUDA stream to this object so that all subsequent operation
on this array defaults to the given stream.
"""
clone = copy.copy(self)
clone.stream = stream
return clone
@property
def T(self):
return self.transpose()
def transpose(self, axes=None):
if axes and tuple(axes) == tuple(range(self.ndim)):
return self
elif self.ndim != 2:
msg = "transposing a non-2D DeviceNDArray isn't supported"
raise NotImplementedError(msg)
elif axes is not None and set(axes) != set(range(self.ndim)):
raise ValueError("invalid axes list %r" % (axes,))
else:
from numba.cuda.kernels.transpose import transpose
return transpose(self)
def _default_stream(self, stream):
return self.stream if not stream else stream
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
# Typing considerations:
#
# 1. The preference is to use 'C' or 'F' layout since this enables
# hardcoding stride values into compiled kernels, which is more
# efficient than storing a passed-in value in a register.
#
# 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
# the more likely / common case.
#
# 3. If an array is broadcast then it must be typed as 'A' - using 'C'
# or 'F' does not apply for broadcast arrays, because the strides, some
# of which will be 0, will not match those hardcoded in for 'C' or 'F'
# layouts.
broadcast = 0 in self.strides
if self.flags['C_CONTIGUOUS'] and not broadcast:
layout = 'C'
elif self.flags['F_CONTIGUOUS'] and not broadcast:
layout = 'F'
else:
layout = 'A'
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, layout)
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.gpu_data is None:
if _driver.USE_NV_BINDING:
return _driver.binding.CUdeviceptr(0)
else:
return c_void_p(0)
else:
return self.gpu_data.device_ctypes_pointer
@devices.require_context
def copy_to_device(self, ary, stream=0):
"""Copy `ary` to `self`.
If `ary` is CUDA device memory, perform a device-to-device transfer.
Otherwise, perform a host-to-device transfer.
"""
if ary.size == 0:
# Nothing to do
return
sentry_contiguous(self)
stream = self._default_stream(stream)
self_core, ary_core = array_core(self), array_core(ary)
if _driver.is_device_memory(ary):
sentry_contiguous(ary)
check_array_compatibility(self_core, ary_core)
_driver.device_to_device(self, ary, self.alloc_size, stream=stream)
else:
# Ensure same contiguity. Only makes a host-side copy if necessary
# (i.e., in order to materialize a writable strided view)
ary_core = np.array(
ary_core,
order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
subok=True,
copy=(not ary_core.flags['WRITEABLE'])
if numpy_version < (2, 0) else None)
check_array_compatibility(self_core, ary_core)
_driver.host_to_device(self, ary_core, self.alloc_size,
stream=stream)
@devices.require_context
def copy_to_host(self, ary=None, stream=0):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
If a CUDA ``stream`` is given, then the transfer will be made
asynchronously as part of the given stream. Otherwise, the transfer is
synchronous: the function returns after the copy is finished.
Always returns the host array.
Example::
import numpy as np
from numba import cuda
arr = np.arange(1000)
d_arr = cuda.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if any(s < 0 for s in self.strides):
msg = 'D->H copy not implemented for negative strides: {}'
raise NotImplementedError(msg.format(self.strides))
assert self.alloc_size >= 0, "Negative memory size"
stream = self._default_stream(stream)
if ary is None:
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else:
check_array_compatibility(self, ary)
hostary = ary
if self.alloc_size != 0:
_driver.device_to_host(hostary, self, self.alloc_size,
stream=stream)
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
return hostary
def split(self, section, stream=0):
"""Split the array into equal partition of the `section` size.
If the array cannot be equally divided, the last section will be
smaller.
"""
stream = self._default_stream(stream)
if self.ndim != 1:
raise ValueError("only support 1d array")
if self.strides[0] != self.dtype.itemsize:
raise ValueError("only support unit stride")
nsect = int(math.ceil(float(self.size) / section))
strides = self.strides
itemsize = self.dtype.itemsize
for i in range(nsect):
begin = i * section
end = min(begin + section, self.size)
shape = (end - begin,)
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
gpu_data=gpu_data)
def as_cuda_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.gpu_data
def get_ipc_handle(self):
"""
Returns an *IpcArrayHandle* object that is safe to serialize and transfer
to another process to share the local allocation.
Note: this feature is only available on Linux.
"""
ipch = devices.get_context().get_ipc_handle(self.gpu_data)
desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
def squeeze(self, axis=None, stream=0):
"""
Remove axes of size one from the array shape.
Parameters
----------
axis : None or int or tuple of ints, optional
Subset of dimensions to remove. A `ValueError` is raised if an axis
with size greater than one is selected. If `None`, all axes with
size one are removed.
stream : cuda stream or 0, optional
Default stream for the returned view of the array.
Returns
-------
DeviceNDArray
Squeezed view into the array.
"""
new_dummy, _ = self._dummy.squeeze(axis=axis)
return DeviceNDArray(
shape=new_dummy.shape,
strides=new_dummy.strides,
dtype=self.dtype,
stream=self._default_stream(stream),
gpu_data=self.gpu_data,
)
def view(self, dtype):
"""Returns a new object by reinterpretting the dtype without making a
copy of the data.
"""
dtype = np.dtype(dtype)
shape = list(self.shape)
strides = list(self.strides)
if self.dtype.itemsize != dtype.itemsize:
if not self.is_c_contiguous():
raise ValueError(
"To change to a dtype of a different size,"
" the array must be C-contiguous"
)
shape[-1], rem = divmod(
shape[-1] * self.dtype.itemsize,
dtype.itemsize
)
if rem != 0:
raise ValueError(
"When changing to a larger dtype,"
" its size must be a divisor of the total size in bytes"
" of the last axis of the array."
)
strides[-1] = dtype.itemsize
return DeviceNDArray(
shape=shape,
strides=strides,
dtype=dtype,
stream=self.stream,
gpu_data=self.gpu_data,
)
@property
def nbytes(self):
# Note: not using `alloc_size`. `alloc_size` reports memory
# consumption of the allocation, not the size of the array
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
return self.dtype.itemsize * self.size
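# Usage sketch for DeviceNDArrayBase.split(): partition a 1-D device array
# into views over the same allocation, without copying. Requires a CUDA
# device at call time; sizes and contents below are illustrative only.
def _example_split_device_array():
    from numba import cuda
    d_arr = cuda.to_device(np.arange(10))
    sections = list(d_arr.split(4))
    # ceil(10 / 4) == 3 sections; the last one is smaller.
    assert [s.size for s in sections] == [4, 4, 2]
    assert (sections[-1].copy_to_host() == np.array([8, 9])).all()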
class DeviceRecord(DeviceNDArrayBase):
'''
An on-GPU record type
'''
def __init__(self, dtype, stream=0, gpu_data=None):
shape = ()
strides = ()
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
gpu_data)
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
return numpy_support.from_dtype(self.dtype)
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
typ, offset = self.dtype.fields[item]
newdata = self.gpu_data.view(offset)
if typ.shape == ():
if typ.names is not None:
return DeviceRecord(dtype=typ, stream=stream,
gpu_data=newdata)
else:
hostary = np.empty(1, dtype=typ)
_driver.device_to_host(dst=hostary, src=newdata,
size=typ.itemsize,
stream=stream)
return hostary[0]
else:
shape, strides, dtype = \
prepare_shape_strides_dtype(typ.shape,
None,
typ.subdtype[0], 'C')
return DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=newdata,
stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the record didn't have a default stream, and the user didn't
# provide a stream, then we will use the default stream for the
# assignment kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
typ, offset = self.dtype.fields[key]
newdata = self.gpu_data.view(offset)
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
# (2) prepare RHS
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
# (3) do the copy
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
if synchronous:
stream.synchronize()
@lru_cache
def _assign_kernel(ndim):
"""
A separate function so we don't need to compile code on every assignment (!).
:param ndim: We need static array sizes for cuda.local.array, so the
number of dimensions is baked into the kernel.
"""
from numba import cuda # circular!
if ndim == 0:
# the (2, ndim) allocation below is not yet supported, so avoid it
@cuda.jit
def kernel(lhs, rhs):
lhs[()] = rhs[()]
return kernel
@cuda.jit
def kernel(lhs, rhs):
location = cuda.grid(1)
n_elements = 1
for i in range(lhs.ndim):
n_elements *= lhs.shape[i]
if location >= n_elements:
# bake n_elements into the kernel, better than passing it in
# as another argument.
return
# [0, :] is the to-index (into `lhs`)
# [1, :] is the from-index (into `rhs`)
idx = cuda.local.array(
shape=(2, ndim),
dtype=types.int64)
for i in range(ndim - 1, -1, -1):
idx[0, i] = location % lhs.shape[i]
idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
location //= lhs.shape[i]
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
return kernel
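# Host-side sketch of the index arithmetic used by the assignment kernel
# above: unravel the flat `location` into an N-D index for `lhs`, zeroing
# each coordinate for broadcast (size-1) axes of `rhs`.
def _example_assign_indexing(location, lhs_shape, rhs_shape):
    ndim = len(lhs_shape)
    lhs_idx = [0] * ndim
    rhs_idx = [0] * ndim
    for i in range(ndim - 1, -1, -1):
        lhs_idx[i] = location % lhs_shape[i]
        # (rhs_shape[i] > 1) is 0 or 1, collapsing broadcast axes to 0
        rhs_idx[i] = (location % lhs_shape[i]) * (rhs_shape[i] > 1)
        location //= lhs_shape[i]
    return tuple(lhs_idx), tuple(rhs_idx)
# e.g. _example_assign_indexing(5, (2, 3), (1, 3)) == ((1, 2), (0, 2))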
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-GPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def __array__(self, dtype=None):
"""
:return: a `numpy.ndarray`, obtained by copying to the host.
"""
if dtype:
return self.copy_to_host().__array__(dtype)
else:
return self.copy_to_host().__array__()
def __len__(self):
return self.shape[0]
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C', stream=0):
'''
Flattens a contiguous array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
exception.
'''
stream = self._default_stream(stream)
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data,
stream=stream)
else:
raise NotImplementedError("operation requires copying")
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
arr = self._dummy.__getitem__(item)
extents = list(arr.iter_contiguous_extent())
cls = type(self)
if len(extents) == 1:
newdata = self.gpu_data.view(*extents[0])
if not arr.is_array:
# Check for structured array type (record)
if self.dtype.names is not None:
return DeviceRecord(dtype=self.dtype, stream=stream,
gpu_data=newdata)
else:
# Element indexing
hostary = np.empty(1, dtype=self.dtype)
_driver.device_to_host(dst=hostary, src=newdata,
size=self._dummy.itemsize,
stream=stream)
return hostary[0]
else:
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
else:
newdata = self.gpu_data.view(*arr.extent)
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the array didn't have a default stream, and the user didn't provide
# a stream, then we will use the default stream for the assignment
# kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
arr = self._dummy.__getitem__(key)
newdata = self.gpu_data.view(*arr.extent)
if isinstance(arr, dummyarray.Element):
# convert to a 0d array
shape = ()
strides = ()
else:
shape = arr.shape
strides = arr.strides
lhs = type(self)(
shape=shape,
strides=strides,
dtype=self.dtype,
gpu_data=newdata,
stream=stream)
# (2) prepare RHS
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
if rhs.ndim > lhs.ndim:
raise ValueError("Can't assign %s-D array to %s-D self" % (
rhs.ndim,
lhs.ndim))
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
# negative indices would not work if rhs.ndim == 0
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
rhs = rhs.reshape(*rhs_shape)
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
if r != 1 and l != r:
raise ValueError("Can't copy sequence with size %d to array "
"axis %d with dimension %d" % ( r, i, l))
# (3) do the copy
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
if synchronous:
stream.synchronize()
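# Usage sketch for DeviceNDArray.__setitem__(): scalar and broadcast
# assignment run the generated assignment kernel on the device. Shapes
# and values are illustrative; requires a CUDA device at call time.
def _example_device_setitem():
    from numba import cuda
    d_arr = cuda.to_device(np.zeros((3, 4)))
    d_arr[1, 2] = 7.0                             # scalar broadcast
    d_arr[:, 0] = np.arange(3, dtype=np.float64)  # 1-D host array to column
    host = d_arr.copy_to_host()
    assert host[1, 2] == 7.0
    assert (host[:, 0] == np.arange(3)).all()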
class IpcArrayHandle(object):
"""
An IPC array handle that can be serialized and transferred to another
process on the same machine to share a GPU allocation.
On the destination process, use the *.open()* method to create a new
*DeviceNDArray* object that shares the allocation from the original process.
To release the resources, call the *.close()* method. After that, the
destination can no longer use the shared array object. (Note: the
underlying weakref to the resource is now dead.)
This object implements the context-manager interface that calls the
*.open()* and *.close()* method automatically::
with the_ipc_array_handle as ipc_array:
# use ipc_array here as a normal gpu array object
some_code(ipc_array)
# ipc_array is dead at this point
"""
def __init__(self, ipc_handle, array_desc):
self._array_desc = array_desc
self._ipc_handle = ipc_handle
def open(self):
"""
Returns a new *DeviceNDArray* that shares the allocation from the
original process. Must not be used on the original process.
"""
dptr = self._ipc_handle.open(devices.get_context())
return DeviceNDArray(gpu_data=dptr, **self._array_desc)
def close(self):
"""
Closes the IPC handle to the array.
"""
self._ipc_handle.close()
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
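# Usage sketch for IpcArrayHandle on the consuming side: the handle itself
# is created in the producing process (via get_ipc_handle()) and sent over,
# e.g. through multiprocessing. Linux only; the function name is illustrative.
def _example_consume_ipc_array(ipc_array_handle):
    with ipc_array_handle as ipc_array:
        # ipc_array is a DeviceNDArray sharing the producer's allocation
        result = ipc_array.copy_to_host()
    # the shared mapping is closed on exit from the with-block
    return result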
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA mapped memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA managed memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
def from_array_like(ary, stream=0, gpu_data=None):
"Create a DeviceNDArray object that is like ary."
return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
gpu_data=gpu_data)
def from_record_like(rec, stream=0, gpu_data=None):
"Create a DeviceRecord object that is like rec."
return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
def array_core(ary):
"""
Extract the repeated core of a broadcast array.
Broadcast arrays are by definition non-contiguous due to repeated
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
contiguity and copy the underlying data from such arrays, we must create
a view without the repeated dimensions.
"""
if not ary.strides or not ary.size:
return ary
core_index = []
for stride in ary.strides:
core_index.append(0 if stride == 0 else slice(None))
return ary[tuple(core_index)]
def is_contiguous(ary):
"""
Returns True iff `ary` is C-style contiguous while ignoring
broadcasted and 1-sized dimensions.
As opposed to array_core(), it does not call require_context(),
which can be quite expensive.
"""
size = ary.dtype.itemsize
for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
if shape > 1 and stride != 0:
if size != stride:
return False
size *= shape
return True
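# Host-side sketch of what is_contiguous() accepts: stride-0 (broadcast)
# and size-1 dimensions are skipped when checking C-style contiguity.
def _example_is_contiguous():
    a = np.arange(6).reshape(2, 3)     # C-contiguous
    b = np.broadcast_to(a, (4, 2, 3))  # adds a leading stride-0 dimension
    c = a[:, ::2]                      # genuinely strided
    assert is_contiguous(a)
    assert is_contiguous(b)            # broadcast dimension is ignored
    assert not is_contiguous(c)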
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def sentry_contiguous(ary):
core = array_core(ary)
if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, stream=0, copy=True, user_explicit=False):
"""
Create a DeviceRecord or DeviceNDArray like obj and optionally copy data from
host to device. If obj already represents device memory, it is returned and
no copy is made.
"""
if _driver.is_device_memory(obj):
return obj, False
elif hasattr(obj, '__cuda_array_interface__'):
return numba.cuda.as_cuda_array(obj), False
else:
if isinstance(obj, np.void):
devobj = from_record_like(obj, stream=stream)
else:
# This allows you to pass non-array objects like constants and
# objects implementing the array interface
# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
# into this function (with no copy overhead for `obj`s that are already
# `ndarray`s).
obj = np.array(
obj,
copy=False if numpy_version < (2, 0) else None,
subok=True)
sentry_contiguous(obj)
devobj = from_array_like(obj, stream=stream)
if copy:
if config.CUDA_WARN_ON_IMPLICIT_COPY:
if (
not user_explicit and
(not isinstance(obj, DeviceNDArray)
and isinstance(obj, np.ndarray))
):
msg = ("Host array used in CUDA kernel will incur "
"copy overhead to/from device.")
warn(NumbaPerformanceWarning(msg))
devobj.copy_to_device(obj, stream=stream)
return devobj, True
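# Usage sketch for auto_device(): host arrays are copied into a fresh
# device array, while objects already representing device memory are
# passed through unchanged. Requires a CUDA device at call time.
def _example_auto_device():
    host = np.arange(4)
    devobj, converted = auto_device(host)
    assert converted and devobj.shape == (4,)
    same, converted = auto_device(devobj)   # already on the device
    assert same is devobj and not converted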
def check_array_compatibility(ary1, ary2):
ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
if ary1.dtype != ary2.dtype:
raise TypeError('incompatible dtype: %s vs. %s' %
(ary1.dtype, ary2.dtype))
if ary1sq.shape != ary2sq.shape:
raise ValueError('incompatible shape: %s vs. %s' %
(ary1.shape, ary2.shape))
# We check strides only if the size is nonzero, because strides are
# irrelevant (and can differ) for zero-length copies.
if ary1.size and ary1sq.strides != ary2sq.strides:
raise ValueError('incompatible strides: %s vs. %s' %
(ary1.strides, ary2.strides))

View File

@@ -0,0 +1,248 @@
"""
Expose each GPU device directly.
This module implements an API, similar to the "CUDA runtime", for managing
the CUDA context stack and cleanup. It relies on thread-local globals to
keep each thread's context stack management separate. Contexts are also
shareable among threads. Only the main thread can destroy Contexts.
Note:
- This module must be imported by the main thread.
"""
import functools
import threading
from contextlib import contextmanager
from .driver import driver, USE_NV_BINDING
class _DeviceList(object):
def __getattr__(self, attr):
# First time looking at "lst" attribute.
if attr == "lst":
# Device list is not initialized.
# Query all CUDA devices.
numdev = driver.get_device_count()
gpus = [_DeviceContextManager(driver.get_device(devid))
for devid in range(numdev)]
# Define "lst" to avoid re-initialization
self.lst = gpus
return gpus
# Other attributes
return super(_DeviceList, self).__getattr__(attr)
def __getitem__(self, devnum):
'''
Returns the context manager for device *devnum*.
'''
return self.lst[devnum]
def __str__(self):
return ', '.join([str(d) for d in self.lst])
def __iter__(self):
return iter(self.lst)
def __len__(self):
return len(self.lst)
@property
def current(self):
"""Returns the active device or None if there's no active device
"""
with driver.get_active_context() as ac:
devnum = ac.devnum
if devnum is not None:
return self[devnum]
class _DeviceContextManager(object):
"""
Provides a context manager for executing in the context of the chosen
device. The normal use of instances of this type is from
``numba.cuda.gpus``. For example, to execute on device 2::
with numba.cuda.gpus[2]:
d_a = numba.cuda.to_device(a)
to copy the array *a* onto device 2, referred to by *d_a*.
"""
def __init__(self, device):
self._device = device
def __getattr__(self, item):
return getattr(self._device, item)
def __enter__(self):
_runtime.get_or_create_context(self._device.id)
def __exit__(self, exc_type, exc_val, exc_tb):
# this will verify that we are popping the right device context.
self._device.get_primary_context().pop()
def __str__(self):
return "<Managed Device {self.id}>".format(self=self)
class _Runtime(object):
"""Emulate the CUDA runtime context management.
It owns all Devices and Contexts.
Keeps at most one Context per Device.
"""
def __init__(self):
self.gpus = _DeviceList()
# For caching the attached CUDA Context
self._tls = threading.local()
# Remember the main thread
# Only the main thread can *actually* destroy
self._mainthread = threading.current_thread()
# Avoid mutation of runtime state in multithreaded programs
self._lock = threading.RLock()
@contextmanager
def ensure_context(self):
"""Ensure a CUDA context is available inside the context.
On entrance, queries the CUDA driver for an active CUDA context and
attaches it in TLS for subsequent calls so they do not need to query
the CUDA driver again. On exit, detach the CUDA context from the TLS.
This will allow us to pickup thirdparty activated CUDA context in
any top-level Numba CUDA API.
"""
with driver.get_active_context():
oldctx = self._get_attached_context()
newctx = self.get_or_create_context(None)
self._set_attached_context(newctx)
try:
yield
finally:
self._set_attached_context(oldctx)
def get_or_create_context(self, devnum):
"""Returns the primary context and push+create it if needed
for *devnum*. If *devnum* is None, use the active CUDA context (must
be primary) or create a new one with ``devnum=0``.
"""
if devnum is None:
attached_ctx = self._get_attached_context()
if attached_ctx is None:
return self._get_or_create_context_uncached(devnum)
else:
return attached_ctx
else:
if USE_NV_BINDING:
devnum = int(devnum)
return self._activate_context_for(devnum)
def _get_or_create_context_uncached(self, devnum):
"""See also ``get_or_create_context(devnum)``.
This version does not read the cache.
"""
with self._lock:
# Try to get the active context in the CUDA stack or
# activate GPU-0 with the primary context
with driver.get_active_context() as ac:
if not ac:
return self._activate_context_for(0)
else:
# Get primary context for the active device
ctx = self.gpus[ac.devnum].get_primary_context()
# Is active context the primary context?
if USE_NV_BINDING:
ctx_handle = int(ctx.handle)
ac_ctx_handle = int(ac.context_handle)
else:
ctx_handle = ctx.handle.value
ac_ctx_handle = ac.context_handle.value
if ctx_handle != ac_ctx_handle:
msg = ('Numba cannot operate on non-primary'
' CUDA context {:x}')
raise RuntimeError(msg.format(ac_ctx_handle))
# Ensure the context is ready
ctx.prepare_for_use()
return ctx
def _activate_context_for(self, devnum):
with self._lock:
gpu = self.gpus[devnum]
newctx = gpu.get_primary_context()
# Detect unexpected context switch
cached_ctx = self._get_attached_context()
if cached_ctx is not None and cached_ctx is not newctx:
raise RuntimeError('Cannot switch CUDA-context.')
newctx.push()
return newctx
def _get_attached_context(self):
return getattr(self._tls, 'attached_context', None)
def _set_attached_context(self, ctx):
self._tls.attached_context = ctx
def reset(self):
"""Clear all contexts in the thread. Destroy the context if and only
if we are in the main thread.
"""
# Pop all active contexts.
while driver.pop_active_context() is not None:
pass
# If it is the main thread
if threading.current_thread() == self._mainthread:
self._destroy_all_contexts()
def _destroy_all_contexts(self):
# Reset all devices
for gpu in self.gpus:
gpu.reset()
_runtime = _Runtime()
# ================================ PUBLIC API ================================
gpus = _runtime.gpus
def get_context(devnum=None):
"""Get the current device or use a device by device number, and
return the CUDA context.
"""
return _runtime.get_or_create_context(devnum)
def require_context(fn):
"""
A decorator that ensures a CUDA context is available when *fn* is executed.
Note: The function *fn* cannot switch the CUDA context.
"""
@functools.wraps(fn)
def _require_cuda_context(*args, **kws):
with _runtime.ensure_context():
return fn(*args, **kws)
return _require_cuda_context
def reset():
"""Reset the CUDA subsystem for the current thread.
In the main thread:
This removes all CUDA contexts. Only use this at shutdown or for
cleaning up between tests.
In non-main threads:
This clears the CUDA context stack only.
"""
_runtime.reset()
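# Usage sketch for the public API above: wrapping a function with
# require_context ensures a CUDA context exists when it runs, whether
# created by Numba or adopted from third-party code. The function name is
# illustrative; get_memory_info() is assumed from the driver Context API.
@require_context
def _example_query_free_memory():
    free, total = get_context().get_memory_info()
    return free, total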

File diff suppressed because it is too large

View File

@@ -0,0 +1,394 @@
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)
from numba.cuda.cudadrv import _extras
cu_device = c_int
cu_device_attribute = c_int # enum
cu_context = c_void_p # an opaque handle
cu_module = c_void_p # an opaque handle
cu_jit_option = c_int # enum
cu_jit_input_type = c_int # enum
cu_function = c_void_p # an opaque handle
cu_device_ptr = c_size_t # defined as unsigned long long
cu_stream = c_void_p # an opaque handle
cu_event = c_void_p
cu_link_state = c_void_p
cu_function_attribute = c_int
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide
cu_uuid = (c_byte * 16) # Device UUID
cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)
cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)
# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
CU_STREAM_DEFAULT = 0
CU_STREAM_LEGACY = 1
CU_STREAM_PER_THREAD = 2
API_PROTOTYPES = {
# CUresult cuInit(unsigned int Flags);
'cuInit' : (c_int, c_uint),
# CUresult cuDriverGetVersion (int* driverVersion )
'cuDriverGetVersion': (c_int, POINTER(c_int)),
# CUresult cuDeviceGetCount(int *count);
'cuDeviceGetCount': (c_int, POINTER(c_int)),
# CUresult cuDeviceGet(CUdevice *device, int ordinal);
'cuDeviceGet': (c_int, POINTER(cu_device), c_int),
# CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),
# CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
# CUdevice dev);
'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
cu_device),
# CUresult cuDeviceComputeCapability(int *major, int *minor,
# CUdevice dev);
'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
cu_device),
# CUresult cuDevicePrimaryCtxGetState(
# CUdevice dev,
# unsigned int* flags,
# int* active)
'cuDevicePrimaryCtxGetState': (c_int,
cu_device, POINTER(c_uint), POINTER(c_int)),
# CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
'cuDevicePrimaryCtxRelease': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
'cuDevicePrimaryCtxReset': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),
# CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),
# CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
# CUdevice dev);
'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),
# CUresult cuCtxGetDevice ( CUdevice * device )
'cuCtxGetDevice': (c_int, POINTER(cu_device)),
# CUresult cuCtxGetCurrent (CUcontext *pctx);
'cuCtxGetCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxPushCurrent (CUcontext pctx);
'cuCtxPushCurrent': (c_int, cu_context),
# CUresult cuCtxPopCurrent (CUcontext *pctx);
'cuCtxPopCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxDestroy(CUcontext pctx);
'cuCtxDestroy': (c_int, cu_context),
# CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
# unsigned int numOptions,
# CUjit_option *options,
# void **optionValues);
'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult cuModuleUnload(CUmodule hmod);
'cuModuleUnload': (c_int, cu_module),
# CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
# const char *name);
'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),
# CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
# hmod, const char* name )
'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
cu_module, c_char_p),
# CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
# CUfunc_cache config);
'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),
# CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),
# CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
# unsigned int flags);
'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),
# CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
# size_t N, CUstream hStream);
'cuMemsetD8Async': (c_int,
cu_device_ptr, c_uint8, c_size_t, cu_stream),
# CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount);
'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),
# CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount, CUstream hStream);
'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount);
'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount);
'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemFree(CUdeviceptr dptr);
'cuMemFree': (c_int, cu_device_ptr),
# CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),
# CUresult cuStreamDestroy(CUstream hStream);
'cuStreamDestroy': (c_int, cu_stream),
# CUresult cuStreamSynchronize(CUstream hStream);
'cuStreamSynchronize': (c_int, cu_stream),
# CUresult cuStreamAddCallback(
# CUstream hStream,
# CUstreamCallback callback,
# void* userData,
# unsigned int flags)
'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
py_object, c_uint),
# CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams,
# void ** extra)
'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p), POINTER(c_void_p)),
# CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams)
'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p)),
# CUresult cuMemHostAlloc ( void ** pp,
# size_t bytesize,
# unsigned int Flags
# )
'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemFreeHost ( void * p )
'cuMemFreeHost': (c_int, c_void_p),
# CUresult cuMemHostRegister(void * p,
# size_t bytesize,
# unsigned int Flags)
'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemHostUnregister(void * p)
'cuMemHostUnregister': (c_int, c_void_p),
# CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
# void * p,
# unsigned int Flags)
'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
c_void_p, c_uint),
# CUresult cuMemGetInfo(size_t * free, size_t * total)
'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)),
# CUresult cuEventCreate ( CUevent * phEvent,
# unsigned int Flags )
'cuEventCreate': (c_int, POINTER(cu_event), c_uint),
# CUresult cuEventDestroy ( CUevent hEvent )
'cuEventDestroy': (c_int, cu_event),
# CUresult cuEventElapsedTime ( float * pMilliseconds,
# CUevent hStart,
# CUevent hEnd )
'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),
# CUresult cuEventQuery ( CUevent hEvent )
'cuEventQuery': (c_int, cu_event),
# CUresult cuEventRecord ( CUevent hEvent,
# CUstream hStream )
'cuEventRecord': (c_int, cu_event, cu_stream),
# CUresult cuEventSynchronize ( CUevent hEvent )
'cuEventSynchronize': (c_int, cu_event),
# CUresult cuStreamWaitEvent ( CUstream hStream,
# CUevent hEvent,
# unsigned int Flags )
'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),
# CUresult cuPointerGetAttribute (
# void *data,
# CUpointer_attribute attribute,
# CUdeviceptr ptr)
'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),
# CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
# size_t * psize,
# CUdeviceptr dptr
# )
'cuMemGetAddressRange': (c_int,
POINTER(cu_device_ptr),
POINTER(c_size_t),
cu_device_ptr),
# CUresult cuMemHostGetFlags ( unsigned int * pFlags,
# void * p )
'cuMemHostGetFlags': (c_int,
POINTER(c_uint),
c_void_p),
# CUresult cuCtxSynchronize ( void )
'cuCtxSynchronize' : (c_int,),
# CUresult
# cuLinkCreate(unsigned int numOptions, CUjit_option *options,
# void **optionValues, CUlinkState *stateOut);
'cuLinkCreate': (c_int,
c_uint, POINTER(cu_jit_option),
POINTER(c_void_p), POINTER(cu_link_state)),
# CUresult
# cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
# size_t size, const char *name, unsigned
# int numOptions, CUjit_option *options,
# void **optionValues);
'cuLinkAddData': (c_int,
cu_link_state, cu_jit_input_type, c_void_p,
c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
POINTER(c_void_p)),
# CUresult
# cuLinkAddFile(CUlinkState state, CUjitInputType type,
# const char *path, unsigned int numOptions,
# CUjit_option *options, void **optionValues);
'cuLinkAddFile': (c_int,
cu_link_state, cu_jit_input_type, c_char_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult CUDAAPI
# cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
'cuLinkComplete': (c_int,
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),
# CUresult CUDAAPI
# cuLinkDestroy(CUlinkState state)
'cuLinkDestroy': (c_int, cu_link_state),
# cuProfilerStart ( void )
'cuProfilerStart': (c_int,),
# cuProfilerStop ( void )
'cuProfilerStop': (c_int,),
# CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
# CUfunction hfunc )
'cuFuncGetAttribute': (c_int,
POINTER(c_int), cu_function_attribute, cu_function),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize);
'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
cu_function, c_size_t,
c_uint),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize,
# unsigned int flags);
'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
POINTER(c_int),
cu_function,
c_size_t, c_uint),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit);
'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
cu_function, cu_occupancy_b2d_size,
c_size_t, c_int),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit,
# unsigned int flags);
'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
POINTER(c_int), cu_function,
cu_occupancy_b2d_size,
c_size_t, c_int, c_uint),
# CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
'cuIpcGetMemHandle': (c_int,
POINTER(cu_ipc_mem_handle), cu_device_ptr),
# CUresult cuIpcOpenMemHandle(
# CUdeviceptr* pdptr,
# CUipcMemHandle handle,
# unsigned int Flags)
'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
c_uint),
# CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
'cuIpcCloseMemHandle': (c_int, cu_device_ptr),
# CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags)
'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),
# CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
# CUdevice dev, CUdevice peerDev )
'cuDeviceCanAccessPeer': (c_int,
POINTER(c_int), cu_device, cu_device),
# CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
}
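# Sketch of how one of these prototypes could be bound by hand with ctypes,
# outside the driver wrapper that normally consumes API_PROTOTYPES. The
# library name below is an assumption (Linux); other platforms differ.
def _example_bind_prototype(name='cuDriverGetVersion'):
    import ctypes
    lib = ctypes.CDLL('libcuda.so.1')    # assumed driver library on Linux
    fn = getattr(lib, name)
    restype, *argtypes = API_PROTOTYPES[name]
    fn.restype = restype
    fn.argtypes = list(argtypes)
    version = c_int(0)
    # cuDriverGetVersion needs no prior cuInit(); 0 is CUDA_SUCCESS
    assert fn(ctypes.byref(version)) == 0
    return version.value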

View File

@@ -0,0 +1,452 @@
from collections import namedtuple
import itertools
import functools
import operator
import ctypes
import numpy as np
from numba import _helperlib
Extent = namedtuple("Extent", ["begin", "end"])
attempt_nocopy_reshape = ctypes.CFUNCTYPE(
ctypes.c_int,
ctypes.c_long, # nd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # dims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # strides
ctypes.c_long, # newnd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newdims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides
ctypes.c_long, # itemsize
ctypes.c_int, # is_f_order
)(_helperlib.c_helpers['attempt_nocopy_reshape'])
class Dim(object):
"""A single dimension of the array
Attributes
----------
start:
start offset
stop:
stop offset
size:
number of items
stride:
item stride
"""
__slots__ = 'start', 'stop', 'size', 'stride', 'single'
def __init__(self, start, stop, size, stride, single):
self.start = start
self.stop = stop
self.size = size
self.stride = stride
self.single = single
assert not single or size == 1
def __getitem__(self, item):
if isinstance(item, slice):
start, stop, step = item.indices(self.size)
stride = step * self.stride
start = self.start + start * abs(self.stride)
stop = self.start + stop * abs(self.stride)
if stride == 0:
size = 1
else:
size = _compute_size(start, stop, stride)
ret = Dim(
start=start,
stop=stop,
size=size,
stride=stride,
single=False
)
return ret
else:
sliced = self[item:item + 1] if item != -1 else self[-1:]
if sliced.size != 1:
raise IndexError
return Dim(
start=sliced.start,
stop=sliced.stop,
size=sliced.size,
stride=sliced.stride,
single=True,
)
def get_offset(self, idx):
return self.start + idx * self.stride
def __repr__(self):
strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)"
return strfmt % (self.start, self.stop, self.size, self.stride)
def normalize(self, base):
return Dim(start=self.start - base, stop=self.stop - base,
size=self.size, stride=self.stride, single=self.single)
def copy(self, start=None, stop=None, size=None, stride=None, single=None):
if start is None:
start = self.start
if stop is None:
stop = self.stop
if size is None:
size = self.size
if stride is None:
stride = self.stride
if single is None:
single = self.single
return Dim(start, stop, size, stride, single)
def is_contiguous(self, itemsize):
return self.stride == itemsize
def compute_index(indices, dims):
return sum(d.get_offset(i) for i, d in zip(indices, dims))
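# Example: for a C-contiguous 2x3 array of 8-byte items based at offset 0,
# the dims carry strides (24, 8), so index (1, 2) maps to byte offset 40.
def _example_compute_index():
    dims = [Dim(0, 48, 2, 24, single=False),
            Dim(0, 24, 3, 8, single=False)]
    assert compute_index((1, 2), dims) == 1 * 24 + 2 * 8  # == 40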
class Element(object):
is_array = False
def __init__(self, extent):
self.extent = extent
def iter_contiguous_extent(self):
yield self.extent
class Array(object):
"""A dummy numpy array-like object. Consider it an array without the
actual data, but offset from the base data pointer.
Attributes
----------
dims: tuple of Dim
describing each dimension of the array
ndim: int
number of dimensions
shape: tuple of int
size of each dimension
strides: tuple of int
stride of each dimension
itemsize: int
size of one array item in bytes
extent: (start, end)
start and end offset containing the memory region
"""
is_array = True
@classmethod
def from_desc(cls, offset, shape, strides, itemsize):
dims = []
for ashape, astride in zip(shape, strides):
dim = Dim(offset, offset + ashape * astride, ashape, astride,
single=False)
dims.append(dim)
offset = 0 # offset only applies to the first dimension
return cls(dims, itemsize)
def __init__(self, dims, itemsize):
self.dims = tuple(dims)
self.ndim = len(self.dims)
self.shape = tuple(dim.size for dim in self.dims)
self.strides = tuple(dim.stride for dim in self.dims)
self.itemsize = itemsize
self.size = functools.reduce(operator.mul, self.shape, 1)
self.extent = self._compute_extent()
self.flags = self._compute_layout()
def _compute_layout(self):
# The logic here is based on that in _UpdateContiguousFlags from
# numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit
# 13661ac70).
# https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191
# Records have no dims, and we can treat them as contiguous
if not self.dims:
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# If this is a broadcast array then it is not contiguous
if any([dim.stride == 0 for dim in self.dims]):
return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False}
flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# Check C contiguity
sd = self.itemsize
for dim in reversed(self.dims):
if dim.size == 0:
# Contiguous by definition
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
if dim.size != 1:
if dim.stride != sd:
flags['C_CONTIGUOUS'] = False
sd *= dim.size
# Check F contiguity
sd = self.itemsize
for dim in self.dims:
if dim.size != 1:
if dim.stride != sd:
flags['F_CONTIGUOUS'] = False
return flags
sd *= dim.size
return flags
def _compute_extent(self):
firstidx = [0] * self.ndim
lastidx = [s - 1 for s in self.shape]
start = compute_index(firstidx, self.dims)
stop = compute_index(lastidx, self.dims) + self.itemsize
stop = max(stop, start) # ensure positive extent
return Extent(start, stop)
def __repr__(self):
return '<Array dims=%s itemsize=%s>' % (self.dims, self.itemsize)
def __getitem__(self, item):
if not isinstance(item, tuple):
item = [item]
else:
item = list(item)
nitem = len(item)
ndim = len(self.dims)
if nitem > ndim:
raise IndexError("%d extra indices given" % (nitem - ndim,))
# Add empty slices for missing indices
while len(item) < ndim:
item.append(slice(None, None))
dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)]
newshape = [d.size for d in dims if not d.single]
arr = Array(dims, self.itemsize)
if newshape:
return arr.reshape(*newshape)[0]
else:
return Element(arr.extent)
@property
def is_c_contig(self):
return self.flags['C_CONTIGUOUS']
@property
def is_f_contig(self):
return self.flags['F_CONTIGUOUS']
def iter_contiguous_extent(self):
""" Generates extents
"""
if self.is_c_contig or self.is_f_contig:
yield self.extent
else:
if self.dims[0].stride < self.dims[-1].stride:
innerdim = self.dims[0]
outerdims = self.dims[1:]
outershape = self.shape[1:]
else:
innerdim = self.dims[-1]
outerdims = self.dims[:-1]
outershape = self.shape[:-1]
if innerdim.is_contiguous(self.itemsize):
oslen = [range(s) for s in outershape]
for indices in itertools.product(*oslen):
base = compute_index(indices, outerdims)
yield base + innerdim.start, base + innerdim.stop
else:
oslen = [range(s) for s in self.shape]
for indices in itertools.product(*oslen):
offset = compute_index(indices, self.dims)
yield offset, offset + self.itemsize
def reshape(self, *newdims, **kws):
oldnd = self.ndim
newnd = len(newdims)
if newdims == self.shape:
return self, None
order = kws.pop('order', 'C')
if kws:
raise TypeError('unknown keyword arguments %s' % kws.keys())
if order not in 'CFA':
raise ValueError('order not C|F|A')
# check for exactly one instance of -1 in newdims
# https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501
unknownidx = -1
knownsize = 1
for i, dim in enumerate(newdims):
if dim < 0:
if unknownidx == -1:
unknownidx = i
else:
raise ValueError("can only specify one unknown dimension")
else:
knownsize *= dim
# compute the missing dimension
if unknownidx >= 0:
if knownsize == 0 or self.size % knownsize != 0:
raise ValueError("cannot infer valid shape "
"for unknown dimension")
else:
newdims = newdims[0:unknownidx] \
+ (self.size // knownsize,) \
+ newdims[unknownidx + 1:]
newsize = functools.reduce(operator.mul, newdims, 1)
if order == 'A':
order = 'F' if self.is_f_contig else 'C'
if newsize != self.size:
raise ValueError("reshape changes the size of the array")
if self.is_c_contig or self.is_f_contig:
if order == 'C':
newstrides = list(iter_strides_c_contig(self, newdims))
elif order == 'F':
newstrides = list(iter_strides_f_contig(self, newdims))
else:
raise AssertionError("unreachable")
else:
newstrides = np.empty(newnd, np.ctypeslib.c_intp)
# need to keep these around in variables, not temporaries, so they
# don't get GC'ed before we call into the C code
olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp)
oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp)
newdims = np.array(newdims, dtype=np.ctypeslib.c_intp)
if not attempt_nocopy_reshape(
oldnd,
olddims,
oldstrides,
newnd,
newdims,
newstrides,
self.itemsize,
order == 'F',
):
raise NotImplementedError('reshape would require copy')
ret = self.from_desc(self.extent.begin, shape=newdims,
strides=newstrides, itemsize=self.itemsize)
return ret, list(self.iter_contiguous_extent())
def squeeze(self, axis=None):
newshape, newstrides = [], []
if axis is None:
for length, stride in zip(self.shape, self.strides):
if length != 1:
newshape.append(length)
newstrides.append(stride)
else:
if not isinstance(axis, tuple):
axis = (axis,)
for ax in axis:
if self.shape[ax] != 1:
raise ValueError(
"cannot select an axis to squeeze out which has size "
"not equal to one"
)
for i, (length, stride) in enumerate(zip(self.shape, self.strides)):
if i not in axis:
newshape.append(length)
newstrides.append(stride)
newarr = self.from_desc(
self.extent.begin,
shape=newshape,
strides=newstrides,
itemsize=self.itemsize,
)
return newarr, list(self.iter_contiguous_extent())
def ravel(self, order='C'):
if order not in 'CFA':
raise ValueError('order not C|F|A')
if (order in 'CA' and self.is_c_contig
or order in 'FA' and self.is_f_contig):
newshape = (self.size,)
newstrides = (self.itemsize,)
arr = self.from_desc(self.extent.begin, newshape, newstrides,
self.itemsize)
return arr, list(self.iter_contiguous_extent())
else:
raise NotImplementedError("ravel on non-contiguous array")
def iter_strides_f_contig(arr, shape=None):
"""yields the f-contiguous strides
"""
shape = arr.shape if shape is None else shape
itemsize = arr.itemsize
yield itemsize
sum = 1
for s in shape[:-1]:
sum *= s
yield sum * itemsize
def iter_strides_c_contig(arr, shape=None):
"""yields the c-contiguous strides
"""
shape = arr.shape if shape is None else shape
itemsize = arr.itemsize
def gen():
yield itemsize
sum = 1
for s in reversed(shape[1:]):
sum *= s
yield sum * itemsize
for i in reversed(list(gen())):
yield i
def is_element_indexing(item, ndim):
if isinstance(item, slice):
return False
elif isinstance(item, tuple):
if len(item) == ndim:
if not any(isinstance(it, slice) for it in item):
return True
else:
return True
return False
def _compute_size(start, stop, step):
"""Algorithm adapted from cpython rangeobject.c
"""
if step > 0:
lo = start
hi = stop
else:
lo = stop
hi = start
step = -step
if lo >= hi:
return 0
return (hi - lo - 1) // step + 1
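# _compute_size mirrors CPython's range() length computation: for any
# (start, stop, step) with step != 0 it should agree with len(range(...)).
def _example_compute_size():
    for args in [(0, 10, 3), (10, 0, -3), (5, 5, 1), (0, 1, 100)]:
        assert _compute_size(*args) == len(range(*args))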

View File

@@ -0,0 +1,607 @@
"""
Enum values for the CUDA driver. Information about the values
can be found on the official NVIDIA documentation website.
ref: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
anchor: #group__CUDA__TYPES
"""
# Error codes
CUDA_SUCCESS = 0
CUDA_ERROR_INVALID_VALUE = 1
CUDA_ERROR_OUT_OF_MEMORY = 2
CUDA_ERROR_NOT_INITIALIZED = 3
CUDA_ERROR_DEINITIALIZED = 4
CUDA_ERROR_PROFILER_DISABLED = 5
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
CUDA_ERROR_STUB_LIBRARY = 34
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
CUDA_ERROR_NO_DEVICE = 100
CUDA_ERROR_INVALID_DEVICE = 101
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
CUDA_ERROR_INVALID_IMAGE = 200
CUDA_ERROR_INVALID_CONTEXT = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
CUDA_ERROR_MAP_FAILED = 205
CUDA_ERROR_UNMAP_FAILED = 206
CUDA_ERROR_ARRAY_IS_MAPPED = 207
CUDA_ERROR_ALREADY_MAPPED = 208
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
CUDA_ERROR_ALREADY_ACQUIRED = 210
CUDA_ERROR_NOT_MAPPED = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
CUDA_ERROR_ECC_UNCORRECTABLE = 214
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
CUDA_ERROR_INVALID_PTX = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225
CUDA_ERROR_INVALID_SOURCE = 300
CUDA_ERROR_FILE_NOT_FOUND = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
CUDA_ERROR_OPERATING_SYSTEM = 304
CUDA_ERROR_INVALID_HANDLE = 400
CUDA_ERROR_ILLEGAL_STATE = 401
CUDA_ERROR_NOT_FOUND = 500
CUDA_ERROR_NOT_READY = 600
CUDA_ERROR_ILLEGAL_ADDRESS = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
CUDA_ERROR_LAUNCH_TIMEOUT = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
CUDA_ERROR_ASSERT = 710
CUDA_ERROR_TOO_MANY_PEERS = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
CUDA_ERROR_MISALIGNED_ADDRESS = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
CUDA_ERROR_INVALID_PC = 718
CUDA_ERROR_LAUNCH_FAILED = 719
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
CUDA_ERROR_NOT_PERMITTED = 800
CUDA_ERROR_NOT_SUPPORTED = 801
CUDA_ERROR_SYSTEM_NOT_READY = 802
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
CUDA_ERROR_MPS_RPC_FAILURE = 806
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
CUDA_ERROR_CDP_NOT_SUPPORTED = 811
CUDA_ERROR_CDP_VERSION_MISMATCH = 812
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
CUDA_ERROR_CAPTURED_EVENT = 907
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
CUDA_ERROR_TIMEOUT = 909
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
CUDA_ERROR_EXTERNAL_DEVICE = 911
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
CUDA_ERROR_UNKNOWN = 999
# Function cache configurations
# no preference for shared memory or L1 (default)
CU_FUNC_CACHE_PREFER_NONE = 0x00
# prefer larger shared memory and smaller L1 cache
CU_FUNC_CACHE_PREFER_SHARED = 0x01
# prefer larger L1 cache and smaller shared memory
CU_FUNC_CACHE_PREFER_L1 = 0x02
# prefer equal sized L1 cache and shared memory
CU_FUNC_CACHE_PREFER_EQUAL = 0x03
# Context creation flags
# Automatic scheduling
CU_CTX_SCHED_AUTO = 0x00
# Set spin as default scheduling
CU_CTX_SCHED_SPIN = 0x01
# Set yield as default scheduling
CU_CTX_SCHED_YIELD = 0x02
# Set blocking synchronization as default scheduling
CU_CTX_SCHED_BLOCKING_SYNC = 0x04
CU_CTX_SCHED_MASK = 0x07
# Support mapped pinned allocations
# This flag was deprecated as of CUDA 11.0 and it no longer has any effect.
# All contexts as of CUDA 3.2 behave as though the flag is enabled.
CU_CTX_MAP_HOST = 0x08
# Keep local memory allocation after launch
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10
# Trigger coredumps from exceptions in this context
CU_CTX_COREDUMP_ENABLE = 0x20
# Enable user pipe to trigger coredumps in this context
CU_CTX_USER_COREDUMP_ENABLE = 0x40
# Force synchronous blocking on cudaMemcpy/cudaMemset
CU_CTX_SYNC_MEMOPS = 0x80
CU_CTX_FLAGS_MASK = 0xff
# DEFINES
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_DEVICEMAP = 0x02
# If set, host memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 streaming load instruction
# (MOVNTDQA).
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_WRITECOMBINED = 0x04
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_DEVICEMAP = 0x02
# If set, the passed memory pointer is treated as pointing to some
# memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
# On Windows the flag is a no-op. On Linux that memory is marked
# as non cache-coherent for the GPU and is expected
# to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED
# if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older
# Linux kernel versions. On all other platforms, it is not supported
# and CUDA_ERROR_NOT_SUPPORTED is returned.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_IOMEMORY = 0x04
# If set, the passed memory pointer is treated as pointing to memory
# that is considered read-only by the device. On platforms without
# CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
# this flag is required in order to register memory mapped
# to the CPU as read-only. Support for the use of this flag can be
# queried from the device attribute
# CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.
# Using this flag with a current context associated with a device
# that does not have this attribute set will cause cuMemHostRegister
# to error with CUDA_ERROR_NOT_SUPPORTED.
CU_MEMHOSTREGISTER_READ_ONLY = 0x08
# CUDA Mem Attach Flags
# If set, managed memory is accessible from all streams on all devices.
CU_MEM_ATTACH_GLOBAL = 0x01
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory is
# only accessible on the host (unless explicitly attached to a stream
# with cudaStreamAttachMemAsync, in which case it can be used in kernels
# launched on that stream).
CU_MEM_ATTACH_HOST = 0x02
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory accesses
# on the associated device must only be from a single stream.
CU_MEM_ATTACH_SINGLE = 0x04
# Event creation flags
# Default event flag
CU_EVENT_DEFAULT = 0x0
# Event uses blocking synchronization
CU_EVENT_BLOCKING_SYNC = 0x1
# Event will not record timing data
CU_EVENT_DISABLE_TIMING = 0x2
# Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
CU_EVENT_INTERPROCESS = 0x4
# Pointer information
# The CUcontext on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_CONTEXT = 1
# The CUmemorytype describing the physical location of a pointer
CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2
# The address at which a pointer's memory may be accessed on the device
CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3
# The address at which a pointer's memory may be accessed on the host
CU_POINTER_ATTRIBUTE_HOST_POINTER = 4
# A pair of tokens for use with the nv-p2p.h Linux kernel interface
CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5
# Synchronize every synchronous memory operation initiated on this region
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6
# A process-wide unique ID for an allocated memory region
CU_POINTER_ATTRIBUTE_BUFFER_ID = 7
# Indicates if the pointer points to managed memory
CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
# A device ordinal of a device on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
# 1 if this pointer maps to an allocation
# that is suitable for cudaIpcGetMemHandle, 0 otherwise
CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10
# Starting address for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11
# Size of the address range for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12
# 1 if this pointer is in a valid address range
# that is mapped to a backing allocation, 0 otherwise
CU_POINTER_ATTRIBUTE_MAPPED = 13
# Bitmask of allowed CUmemAllocationHandleType for this allocation
CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14
# 1 if the memory this pointer is referencing
# can be used with the GPUDirect RDMA API
CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15
# Returns the access flags the device associated
# with the current context has on the corresponding
# memory referenced by the pointer given
CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16
# Returns the mempool handle for the allocation
# if it was allocated from a mempool. Otherwise returns NULL
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17
# Size of the actual underlying mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18
# The start address of the mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19
# A process-wide unique id corresponding to the
# physical allocation the pointer belongs to
CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20
# Memory types
# Host memory
CU_MEMORYTYPE_HOST = 0x01
# Device memory
CU_MEMORYTYPE_DEVICE = 0x02
# Array memory
CU_MEMORYTYPE_ARRAY = 0x03
# Unified device or host memory
CU_MEMORYTYPE_UNIFIED = 0x04
# Device code formats
# Compiled device-class-specific device code
# Applicable options: none
CU_JIT_INPUT_CUBIN = 0
# PTX source code
# Applicable options: PTX compiler options
CU_JIT_INPUT_PTX = 1
# Bundle of multiple cubins and/or PTX of some device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_FATBINARY = 2
# Host object with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_OBJECT = 3
# Archive of host objects with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_LIBRARY = 4
CU_JIT_NUM_INPUT_TYPES = 6
# Online compiler and linker options
# Max number of registers that a thread may use.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_MAX_REGISTERS = 0
# IN: Specifies minimum number of threads per block to target compilation
# for
# OUT: Returns the number of threads the compiler actually targeted.
# This restricts the resource utilization of the compiler (e.g. max
# registers) such that a block with the given number of threads should be
# able to launch based on register limitations. Note, this option does not
# currently take into account any other resource limitations, such as
# shared memory utilization.
# Cannot be combined with ::CU_JIT_TARGET.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_THREADS_PER_BLOCK = 1
# Overwrites the option value with the total wall clock time, in
# milliseconds, spent in the compiler and linker
# Option type: float
# Applies to: compiler and linker
CU_JIT_WALL_TIME = 2
# Pointer to a buffer in which to print any log messages
# that are informational in nature (the buffer size is specified via
# option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER = 3
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4
# Pointer to a buffer in which to print any log messages that
# reflect errors (the buffer size is specified via option
# ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER = 5
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6
# Level of optimizations to apply to generated code (0 - 4), with 4
# being the default and highest level of optimizations.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_OPTIMIZATION_LEVEL = 7
# No option value required. Determines the target based on the current
# attached context (default)
# Option type: No option value needed
# Applies to: compiler and linker
CU_JIT_TARGET_FROM_CUCONTEXT = 8
# Target is chosen based on supplied ::CUjit_target. Cannot be
# combined with ::CU_JIT_THREADS_PER_BLOCK.
# Option type: unsigned int for enumerated type ::CUjit_target
# Applies to: compiler and linker
CU_JIT_TARGET = 9
# Specifies choice of fallback strategy if matching cubin is not found.
# Choice is based on supplied ::CUjit_fallback.
# Option type: unsigned int for enumerated type ::CUjit_fallback
# Applies to: compiler only
CU_JIT_FALLBACK_STRATEGY = 10
# Specifies whether to create debug information in output (-g)
# (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_GENERATE_DEBUG_INFO = 11
# Generate verbose log messages (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_LOG_VERBOSE = 12
# Generate line number information (-lineinfo) (0: false, default)
# Option type: int
# Applies to: compiler only
CU_JIT_GENERATE_LINE_INFO = 13
# Specifies whether to enable caching explicitly (-dlcm)
# Choice is based on supplied ::CUjit_cacheMode_enum.
# Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum
# Applies to: compiler only
CU_JIT_CACHE_MODE = 14
# CUfunction_attribute
# The maximum number of threads per block, beyond which a launch of the
# function would fail. This number depends on both the function and the
# device on which the function is currently loaded.
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0
# The size in bytes of statically-allocated shared memory required by
# this function. This does not include dynamically-allocated shared
# memory requested by the user at runtime.
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1
# The size in bytes of user-allocated constant memory required by this
# function.
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2
# The size in bytes of local memory used by each thread of this function.
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3
# The number of registers used by each thread of this function.
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
# The PTX virtual architecture version for which the function was
# compiled. This value is the major PTX version * 10 + the minor PTX
# version, so a PTX version 1.3 function would return the value 13.
# Note that this may return the undefined value of 0 for cubins
# compiled prior to CUDA 3.0.
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5
# The binary architecture version for which the function was compiled.
# This value is the major binary version * 10 + the minor binary version,
# so a binary version 1.3 function would return the value 13. Note that
# this will return a value of 10 for legacy cubins that do not have a
# properly-encoded binary architecture version.
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6
# The attribute to indicate whether the function has been compiled
# with user specified option "-Xptxas --dlcm=ca" set
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7
# The maximum size in bytes of dynamically-allocated shared memory
# that can be used by this function. If the user-specified
# dynamic shared memory size is larger than this value,
# the launch will fail. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8
# On devices where the L1 cache and shared memory use the same
# hardware resources, this sets the shared memory carveout preference,
# in percent of the total shared memory. Refer to
# CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
# This is only a hint, and the driver can choose a different ratio
# if required to execute the function.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9
# If this attribute is set, the kernel must launch with a valid cluster
# size specified. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10
# The required cluster width in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11
# The required cluster height in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12
# The required cluster depth in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13
# Whether the function can be launched with non-portable cluster size.
# 1 is allowed, 0 is disallowed. A non-portable cluster size may only
# function on the specific SKUs the program is tested on.
# The launch might fail if the program is run on a different hardware platform.
# For more details, refer to:
# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES
CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14
# The block scheduling policy of a function.
# The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15
# Device attributes
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_WIDTH = 21
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_WIDTH = 22
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_HEIGHT = 23
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH = 24
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT = 25
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH = 26
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_WIDTH = 27
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_HEIGHT = 28
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_LAYERS = 29
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTI_PROCESSOR = 39
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_WIDTH = 42
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_LAYERS = 43
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_WIDTH = 45
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_HEIGHT = 46
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH_ALT = 47
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT_ALT = 48
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH_ALT = 49
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_WIDTH = 52
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_WIDTH = 53
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_LAYERS = 54
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_WIDTH = 55
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_WIDTH = 56
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_HEIGHT = 57
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_WIDTH = 58
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_HEIGHT = 59
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_DEPTH = 60
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_WIDTH = 61
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_LAYERS = 62
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_WIDTH = 63
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_HEIGHT = 64
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_LAYERS = 65
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_WIDTH = 66
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_WIDTH = 67
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_LAYERS = 68
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LINEAR_WIDTH = 69
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_WIDTH = 70
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_HEIGHT = 71
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_PITCH = 72
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_WIDTH = 73
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_HEIGHT = 74
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_MIPMAPPED_WIDTH = 77
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83
CU_DEVICE_ATTRIBUTE_IS_MULTI_GPU_BOARD = 84
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97

View File

@@ -0,0 +1,36 @@
class CudaDriverError(Exception):
pass
class CudaRuntimeError(Exception):
pass
class CudaSupportError(ImportError):
pass
class NvvmError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvvmSupportError(ImportError):
pass
class NvvmWarning(Warning):
pass
class NvrtcError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvrtcCompilationError(NvrtcError):
pass
class NvrtcSupportError(ImportError):
pass
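# Example (a sketch): NvvmError and NvrtcError join their args across lines:
#   str(NvvmError('Failed to compile', 'NVVM_ERROR_COMPILATION'))
#   # -> 'Failed to compile\nNVVM_ERROR_COMPILATION'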

View File

@@ -0,0 +1,176 @@
"""CUDA Toolkit libraries lookup utilities.
CUDA Toolkit libraries can be available via either:
- the `cuda-nvcc` and `cuda-nvrtc` conda packages for CUDA 12,
- the `cudatoolkit` conda package for CUDA 11,
- a user supplied location from CUDA_HOME,
- a system wide location,
- package-specific locations (e.g. the Debian NVIDIA packages),
- or can be discovered by the system loader.
"""
import os
import sys
import ctypes
from numba.misc.findlib import find_lib
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
from numba.cuda.cudadrv.error import CudaSupportError
if sys.platform == 'win32':
_dllnamepattern = '%s.dll'
_staticnamepattern = '%s.lib'
elif sys.platform == 'darwin':
_dllnamepattern = 'lib%s.dylib'
_staticnamepattern = 'lib%s.a'
else:
_dllnamepattern = 'lib%s.so'
_staticnamepattern = 'lib%s.a'
def get_libdevice():
d = get_cuda_paths()
paths = d['libdevice'].info
return paths
def open_libdevice():
with open(get_libdevice(), 'rb') as bcfile:
return bcfile.read()
def get_cudalib(lib, static=False):
"""
Find the path of a CUDA library based on a search of known locations. If
the search fails, return a generic filename for the library (e.g.
'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system
loader's search mechanism.
"""
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm'
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
libdir = get_cuda_paths()[dir_type].info
candidates = find_lib(lib, libdir, static=static)
namepattern = _staticnamepattern if static else _dllnamepattern
return max(candidates) if candidates else namepattern % lib
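# For example (paths illustrative), get_cudalib('nvrtc') returns an absolute
# path such as '/usr/local/cuda/lib64/libnvrtc.so' when the search succeeds,
# and the bare 'libnvrtc.so' otherwise, deferring to the system loader.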
def open_cudalib(lib):
path = get_cudalib(lib)
return ctypes.CDLL(path)
def check_static_lib(path):
if not os.path.isfile(path):
raise FileNotFoundError(f'{path} not found')
def _get_source_variable(lib, static=False):
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].by
elif lib == 'libdevice':
return get_cuda_paths()['libdevice'].by
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
return get_cuda_paths()[dir_type].by
def test():
"""Test library lookup. Path info is printed to stdout.
"""
failed = False
# Check for the driver
try:
dlloader, candidates = locate_driver_and_loader()
print('Finding driver from candidates:')
for location in candidates:
print(f'\t{location}')
print(f'Using loader {dlloader}')
print('\tTrying to load driver', end='...')
dll, path = load_driver(dlloader, candidates)
print('\tok')
print(f'\t\tLoaded from {path}')
except CudaSupportError as e:
print(f'\tERROR: failed to open driver: {e}')
failed = True
# Find the absolute location of the driver on Linux. Various driver-related
# issues have been reported by WSL2 users, and it is almost always due to a
    # Linux (i.e. non-WSL2) driver being installed in a WSL2 system.
# Providing the absolute location of the driver indicates its version
# number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to
# look up whether the driver was intended for "native" Linux.
if sys.platform == 'linux' and not failed:
pid = os.getpid()
mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps')
try:
with open(mapsfile) as f:
maps = f.read()
# It's difficult to predict all that might go wrong reading the maps
# file - in case various error conditions ensue (the file is not found,
# not readable, etc.) we use OSError to hopefully catch any of them.
except OSError:
# It's helpful to report that this went wrong to the user, but we
# don't set failed to True because this doesn't have any connection
# to actual CUDA functionality.
print(f'\tERROR: Could not open {mapsfile} to determine absolute '
'path to libcuda.so')
else:
# In this case we could read the maps, so we can report the
# relevant ones to the user
locations = set(s for s in maps.split() if 'libcuda.so' in s)
print('\tMapped libcuda.so paths:')
for location in locations:
print(f'\t\t{location}')
# Checks for dynamic libraries
libs = 'nvvm nvrtc cudart'.split()
for lib in libs:
path = get_cudalib(lib)
print('Finding {} from {}'.format(lib, _get_source_variable(lib)))
print('\tLocated at', path)
try:
print('\tTrying to open library', end='...')
open_cudalib(lib)
print('\tok')
except OSError as e:
print('\tERROR: failed to open %s:\n%s' % (lib, e))
failed = True
# Check for cudadevrt (the only static library)
lib = 'cudadevrt'
path = get_cudalib(lib, static=True)
print('Finding {} from {}'.format(lib, _get_source_variable(lib,
static=True)))
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
print('\tERROR: failed to find %s:\n%s' % (lib, e))
failed = True
# Check for libdevice
where = _get_source_variable('libdevice')
print(f'Finding libdevice from {where}')
path = get_libdevice()
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
        print('\tERROR: failed to find libdevice:\n%s' % e)
failed = True
return not failed
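# To exercise the lookup manually (a sketch):
#   from numba.cuda.cudadrv.libs import test
#   test()  # prints path info; returns True only if everything was found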

View File

@@ -0,0 +1,20 @@
from numba.cuda.cudadrv import devices, driver
from numba.core.registry import cpu_target
def _calc_array_sizeof(ndim):
"""
Use the ABI size in the CPU target
"""
ctx = cpu_target.target_context
return ctx.calc_array_sizeof(ndim)
def ndarray_device_allocate_data(ary):
"""
Allocate gpu data buffer
"""
datasize = driver.host_memory_size(ary)
# allocate
gpu_data = devices.get_context().memalloc(datasize)
return gpu_data
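# Example (a sketch, assuming a usable CUDA context):
#   import numpy as np
#   ary = np.zeros(16, dtype=np.float64)
#   gpu_data = ndarray_device_allocate_data(ary)  # 128-byte device buffer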

View File

@@ -0,0 +1,260 @@
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
from enum import IntEnum
from numba.core import config
from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
NvrtcSupportError)
import functools
import os
import threading
import warnings
# Opaque handle for compilation unit
nvrtc_program = c_void_p
# Result code
nvrtc_result = c_int
class NvrtcResult(IntEnum):
NVRTC_SUCCESS = 0
NVRTC_ERROR_OUT_OF_MEMORY = 1
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
NVRTC_ERROR_INVALID_INPUT = 3
NVRTC_ERROR_INVALID_PROGRAM = 4
NVRTC_ERROR_INVALID_OPTION = 5
NVRTC_ERROR_COMPILATION = 6
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
NVRTC_ERROR_INTERNAL_ERROR = 11
_nvrtc_lock = threading.Lock()
class NvrtcProgram:
"""
A class for managing the lifetime of nvrtcProgram instances. Instances of
the class own an nvrtcProgram; when an instance is deleted, the underlying
nvrtcProgram is destroyed using the appropriate NVRTC API.
"""
def __init__(self, nvrtc, handle):
self._nvrtc = nvrtc
self._handle = handle
@property
def handle(self):
return self._handle
def __del__(self):
if self._handle:
self._nvrtc.destroy_program(self)
class NVRTC:
"""
Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
calls.
The sole instance of this class is a process-wide singleton, similar to the
NVVM interface. Initialization is protected by a lock and uses the standard
(for Numba) open_cudalib function to load the NVRTC library.
"""
_PROTOTYPES = {
# nvrtcResult nvrtcVersion(int *major, int *minor)
'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
# nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
# const char *src,
# const char *name,
# int numHeaders,
# const char * const *headers,
# const char * const *includeNames)
'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p,
c_int, POINTER(c_char_p), POINTER(c_char_p)),
# nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)),
# nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
# int numOptions,
# const char * const *options)
'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int,
POINTER(c_char_p)),
# nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
# size_t *cubinSizeRet);
'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
# size_t *logSizeRet);
'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program,
POINTER(c_size_t)),
# nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p),
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvrtc_lock:
if cls.__INSTANCE is None:
from numba.cuda.cudadrv.libs import open_cudalib
cls.__INSTANCE = inst = object.__new__(cls)
try:
lib = open_cudalib('nvrtc')
except OSError as e:
cls.__INSTANCE = None
raise NvrtcSupportError("NVRTC cannot be loaded") from e
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(lib, name)
func.restype = proto[0]
func.argtypes = proto[1:]
@functools.wraps(func)
def checked_call(*args, func=func, name=name):
error = func(*args)
if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
raise NvrtcCompilationError()
elif error != NvrtcResult.NVRTC_SUCCESS:
try:
error_name = NvrtcResult(error).name
except ValueError:
error_name = ('Unknown nvrtc_result '
f'(error code: {error})')
msg = f'Failed to call {name}: {error_name}'
raise NvrtcError(msg)
setattr(inst, name, checked_call)
return cls.__INSTANCE
def get_version(self):
"""
Get the NVRTC version as a tuple (major, minor).
"""
major = c_int()
minor = c_int()
self.nvrtcVersion(byref(major), byref(minor))
return major.value, minor.value
def create_program(self, src, name):
"""
Create an NVRTC program with managed lifetime.
"""
if isinstance(src, str):
src = src.encode()
if isinstance(name, str):
name = name.encode()
handle = nvrtc_program()
# The final three arguments are for passing the contents of headers -
# this is not supported, so there are 0 headers and the header names
# and contents are null.
self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
return NvrtcProgram(self, handle)
def compile_program(self, program, options):
"""
Compile an NVRTC program. Compilation may fail due to a user error in
the source; this function returns ``True`` if there is a compilation
error and ``False`` on success.
"""
# We hold a list of encoded options to ensure they can't be collected
# prior to the call to nvrtcCompileProgram
encoded_options = [opt.encode() for opt in options]
option_pointers = [c_char_p(opt) for opt in encoded_options]
c_options_type = (c_char_p * len(options))
c_options = c_options_type(*option_pointers)
try:
self.nvrtcCompileProgram(program.handle, len(options), c_options)
return False
except NvrtcCompilationError:
return True
def destroy_program(self, program):
"""
Destroy an NVRTC program.
"""
self.nvrtcDestroyProgram(byref(program.handle))
def get_compile_log(self, program):
"""
Get the compile log as a Python string.
"""
log_size = c_size_t()
self.nvrtcGetProgramLogSize(program.handle, byref(log_size))
log = (c_char * log_size.value)()
self.nvrtcGetProgramLog(program.handle, log)
return log.value.decode()
def get_ptx(self, program):
"""
Get the compiled PTX as a Python string.
"""
ptx_size = c_size_t()
self.nvrtcGetPTXSize(program.handle, byref(ptx_size))
ptx = (c_char * ptx_size.value)()
self.nvrtcGetPTX(program.handle, ptx)
return ptx.value.decode()
def compile(src, name, cc):
"""
Compile a CUDA C/C++ source to PTX for a given compute capability.
:param src: The source code to compile
:type src: str
:param name: The filename of the source (for information only)
:type name: str
:param cc: A tuple ``(major, minor)`` of the compute capability
:type cc: tuple
:return: The compiled PTX and compilation log
:rtype: tuple
"""
nvrtc = NVRTC()
program = nvrtc.create_program(src, name)
# Compilation options:
# - Compile for the current device's compute capability.
# - The CUDA include path is added.
# - Relocatable Device Code (rdc) is needed to prevent device functions
# being optimized away.
major, minor = cc
arch = f'--gpu-architecture=compute_{major}{minor}'
include = f'-I{config.CUDA_INCLUDE_PATH}'
cudadrv_path = os.path.dirname(os.path.abspath(__file__))
numba_cuda_path = os.path.dirname(cudadrv_path)
numba_include = f'-I{numba_cuda_path}'
options = [arch, include, numba_include, '-rdc', 'true']
# Compile the program
compile_error = nvrtc.compile_program(program, options)
# Get log from compilation
log = nvrtc.get_compile_log(program)
# If the compile failed, provide the log in an exception
if compile_error:
msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}')
raise NvrtcError(msg)
# Otherwise, if there's any content in the log, present it as a warning
if log:
msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
warnings.warn(msg)
ptx = nvrtc.get_ptx(program)
return ptx, log
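# Example usage (a sketch; assumes NVRTC is available and
# config.CUDA_INCLUDE_PATH is valid):
#   src = 'extern "C" __global__ void f(int *x) { *x = 1; }'
#   ptx, log = compile(src, 'f.cu', (7, 5))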

View File

@@ -0,0 +1,707 @@
"""
This is a direct translation of nvvm.h
"""
import logging
import re
import sys
import warnings
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
c_char)
import threading
from llvmlite import ir
from .error import NvvmError, NvvmSupportError, NvvmWarning
from .libs import get_libdevice, open_libdevice, open_cudalib
from numba.core import cgutils, config
logger = logging.getLogger(__name__)
ADDRSPACE_GENERIC = 0
ADDRSPACE_GLOBAL = 1
ADDRSPACE_SHARED = 3
ADDRSPACE_CONSTANT = 4
ADDRSPACE_LOCAL = 5
# Opaque handle for compilation unit
nvvm_program = c_void_p
# Result code
nvvm_result = c_int
RESULT_CODE_NAMES = '''
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
'''.split()
for i, k in enumerate(RESULT_CODE_NAMES):
setattr(sys.modules[__name__], k, i)
# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
def is_available():
"""
    Return whether libNVVM is available.
"""
try:
NVVM()
except NvvmSupportError:
return False
else:
return True
_nvvm_lock = threading.Lock()
class NVVM(object):
'''Process-wide singleton.
'''
_PROTOTYPES = {
# nvvmResult nvvmVersion(int *major, int *minor)
'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmCreateProgram(nvvmProgram *cu)
'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
# size_t size, const char *name)
'nvvmAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
# const char* buffer,
# size_t size,
# const char *name)
'nvvmLazyAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
# const char **options)
'nvvmCompileProgram': (
nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
# nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetCompiledResultSize': (
nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
# const char** options)
'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
POINTER(c_char_p))
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvvm_lock:
if cls.__INSTANCE is None:
cls.__INSTANCE = inst = object.__new__(cls)
try:
inst.driver = open_cudalib('nvvm')
except OSError as e:
cls.__INSTANCE = None
errmsg = ("libNVVM cannot be found. Do `conda install "
"cudatoolkit`:\n%s")
raise NvvmSupportError(errmsg % e)
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(inst.driver, name)
func.restype = proto[0]
func.argtypes = proto[1:]
setattr(inst, name, func)
return cls.__INSTANCE
def __init__(self):
ir_versions = self.get_ir_version()
self._majorIR = ir_versions[0]
self._minorIR = ir_versions[1]
self._majorDbg = ir_versions[2]
self._minorDbg = ir_versions[3]
self._supported_ccs = get_supported_ccs()
@property
def data_layout(self):
if (self._majorIR, self._minorIR) < (1, 8):
return _datalayout_original
else:
return _datalayout_i128
@property
def supported_ccs(self):
return self._supported_ccs
def get_version(self):
major = c_int()
minor = c_int()
err = self.nvvmVersion(byref(major), byref(minor))
self.check_error(err, 'Failed to get version.')
return major.value, minor.value
def get_ir_version(self):
majorIR = c_int()
minorIR = c_int()
majorDbg = c_int()
minorDbg = c_int()
err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
byref(majorDbg), byref(minorDbg))
self.check_error(err, 'Failed to get IR version.')
return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value
def check_error(self, error, msg, exit=False):
if error:
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
if exit:
print(exc)
sys.exit(1)
else:
raise exc
class CompilationUnit(object):
def __init__(self):
self.driver = NVVM()
self._handle = nvvm_program()
err = self.driver.nvvmCreateProgram(byref(self._handle))
self.driver.check_error(err, 'Failed to create CU')
def __del__(self):
driver = NVVM()
err = driver.nvvmDestroyProgram(byref(self._handle))
driver.check_error(err, 'Failed to destroy CU', exit=True)
def add_module(self, buffer):
"""
Add a module level NVVM IR to a compilation unit.
- The buffer should contain an NVVM module IR either in the bitcode
representation (LLVM3.0) or in the text representation.
"""
err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def lazy_add_module(self, buffer):
"""
Lazily add an NVVM IR module to a compilation unit.
The buffer should contain NVVM module IR either in the bitcode
representation or in the text representation.
"""
err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def compile(self, **options):
"""Perform Compilation.
Compilation options are accepted as keyword arguments, with the
following considerations:
- Underscores (`_`) in option names are converted to dashes (`-`), to
match NVVM's option name format.
- Options that take a value will be emitted in the form
"-<name>=<value>".
- Booleans passed as option values will be converted to integers.
- Options which take no value (such as `-gen-lto`) should have a value
of `None` passed in and will be emitted in the form "-<name>".
For documentation on NVVM compilation options, see the CUDA Toolkit
Documentation:
https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
"""
def stringify_option(k, v):
k = k.replace('_', '-')
if v is None:
return f'-{k}'
if isinstance(v, bool):
v = int(v)
return f'-{k}={v}'
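        # For example (a sketch), compile(opt=3, ftz=True, gen_lto=None)
        # passes the options ['-opt=3', '-ftz=1', '-gen-lto'] to NVVM.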
options = [stringify_option(k, v) for k, v in options.items()]
c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
for x in options])
# verify
err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to verify\n')
# compile
err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to compile\n')
# get result
reslen = c_size_t()
err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
self._try_error(err, 'Failed to get size of compiled result.')
output_buffer = (c_char * reslen.value)()
err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
self._try_error(err, 'Failed to get compiled result.')
# get log
self.log = self.get_log()
if self.log:
warnings.warn(self.log, category=NvvmWarning)
return output_buffer[:]
def _try_error(self, err, msg):
self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))
def get_log(self):
reslen = c_size_t()
err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
self.driver.check_error(err, 'Failed to get compilation log size.')
if reslen.value > 1:
logbuf = (c_char * reslen.value)()
err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
self.driver.check_error(err, 'Failed to get compilation log.')
return logbuf.value.decode('utf8') # populate log attribute
return ''
COMPUTE_CAPABILITIES = (
(3, 5), (3, 7),
(5, 0), (5, 2), (5, 3),
(6, 0), (6, 1), (6, 2),
(7, 0), (7, 2), (7, 5),
(8, 0), (8, 6), (8, 7), (8, 9),
(9, 0)
)
# Maps CTK version -> (min supported cc, max supported cc) inclusive
CTK_SUPPORTED = {
(11, 2): ((3, 5), (8, 6)),
(11, 3): ((3, 5), (8, 6)),
(11, 4): ((3, 5), (8, 7)),
(11, 5): ((3, 5), (8, 7)),
(11, 6): ((3, 5), (8, 7)),
(11, 7): ((3, 5), (8, 7)),
(11, 8): ((3, 5), (9, 0)),
(12, 0): ((5, 0), (9, 0)),
(12, 1): ((5, 0), (9, 0)),
(12, 2): ((5, 0), (9, 0)),
(12, 3): ((5, 0), (9, 0)),
(12, 4): ((5, 0), (9, 0)),
}
def ccs_supported_by_ctk(ctk_version):
try:
# For supported versions, we look up the range of supported CCs
min_cc, max_cc = CTK_SUPPORTED[ctk_version]
return tuple([cc for cc in COMPUTE_CAPABILITIES
if min_cc <= cc <= max_cc])
except KeyError:
# For unsupported CUDA toolkit versions, all we can do is assume all
# non-deprecated versions we are aware of are supported.
return tuple([cc for cc in COMPUTE_CAPABILITIES
if cc >= config.CUDA_DEFAULT_PTX_CC])
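# For example, ccs_supported_by_ctk((11, 8)) yields every CC from (3, 5)
# through (9, 0), while an unknown (newer) toolkit version falls back to all
# known CCs at or above config.CUDA_DEFAULT_PTX_CC.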
def get_supported_ccs():
try:
from numba.cuda.cudadrv.runtime import runtime
cudart_version = runtime.get_version()
except: # noqa: E722
# We can't support anything if there's an error getting the runtime
# version (e.g. if it's not present or there's another issue)
_supported_cc = ()
return _supported_cc
# Ensure the minimum CTK version requirement is met
min_cudart = min(CTK_SUPPORTED)
if cudart_version < min_cudart:
_supported_cc = ()
ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
"required version.")
warnings.warn(unsupported_ver)
return _supported_cc
_supported_cc = ccs_supported_by_ctk(cudart_version)
return _supported_cc
def find_closest_arch(mycc):
"""
Given a compute capability, return the closest compute capability supported
by the CUDA toolkit.
:param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
:return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
"""
supported_ccs = NVVM().supported_ccs
if not supported_ccs:
msg = "No supported GPU compute capabilities found. " \
"Please check your cudatoolkit version matches your CUDA version."
raise NvvmSupportError(msg)
for i, cc in enumerate(supported_ccs):
if cc == mycc:
# Matches
return cc
elif cc > mycc:
# Exceeded
if i == 0:
# CC lower than supported
msg = "GPU compute capability %d.%d is not supported" \
"(requires >=%d.%d)" % (mycc + cc)
raise NvvmSupportError(msg)
else:
# return the previous CC
return supported_ccs[i - 1]
# CC higher than supported
return supported_ccs[-1] # Choose the highest
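# For example (assuming a toolkit supporting the CCs listed above),
# find_closest_arch((8, 1)) returns (8, 0): there is no exact match, so the
# closest capability not exceeding the device's is chosen.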
def get_arch_option(major, minor):
"""Matches with the closest architecture option
"""
if config.FORCE_CUDA_CC:
arch = config.FORCE_CUDA_CC
else:
arch = find_closest_arch((major, minor))
return 'compute_%d%d' % arch
MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
Please ensure you have CUDA Toolkit 11.2 or higher.
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
$ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
For CUDA 11, ``cudatoolkit`` is required:
$ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
'''
class LibDevice(object):
_cache_ = None
def __init__(self):
if self._cache_ is None:
if get_libdevice() is None:
raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
self._cache_ = open_libdevice()
self.bc = self._cache_
def get(self):
return self.bc
cas_nvvm = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501
# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%iptr = bitcast {T}* %ptr to {Ti}*
%old2 = load volatile {Ti}, {Ti}* %iptr
br label %attempt
attempt:
%old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
{CAS}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
%result = bitcast {Ti} %old to {T}
ret {T} %result
}}
""" # noqa: E501
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%ptrval = load volatile {T}, {T}* %ptr
; Return early when:
; - For nanmin / nanmax when val is a NaN
; - For min / max when val or ptr is a NaN
%early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
br i1 %early_return, label %done, label %lt_check
lt_check:
%dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
; Continue attempts if dold less or greater than val (depending on whether min or max)
; or if dold is NaN (for nanmin / nanmax)
%cmp = fcmp {OP} {T} %dold, %val
br i1 %cmp, label %attempt, label %done
attempt:
; Attempt to swap in the value
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%new = bitcast {T} %val to {Ti}
{CAS}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
done:
ret {T} %ptrval
}}
""" # noqa: E501
def ir_cas(Ti):
return cas_nvvm.format(Ti=Ti)
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_binary_template.format(**params)
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL,
FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_minmax_template.format(**params)
def ir_numba_atomic_inc(T, Tu):
return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
def ir_numba_atomic_dec(T, Tu):
return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
def llvm_replace(llvmir):
replacements = [
('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
ir_numba_atomic_inc(T='i64', Tu='u64')),
('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
ir_numba_atomic_dec(T='i64', Tu='u64')),
('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('immarg', '')
]
for decl, fn in replacements:
llvmir = llvmir.replace(decl, fn)
llvmir = llvm140_to_70_ir(llvmir)
return llvmir
def compile_ir(llvmir, **opts):
if isinstance(llvmir, str):
llvmir = [llvmir]
if opts.pop('fastmath', False):
opts.update({
'ftz': True,
'fma': True,
'prec_div': False,
'prec_sqrt': False,
})
cu = CompilationUnit()
libdevice = LibDevice()
for mod in llvmir:
mod = llvm_replace(mod)
cu.add_module(mod.encode('utf8'))
cu.lazy_add_module(libdevice.get())
return cu.compile(**opts)
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
def llvm140_to_70_ir(ir):
"""
    Convert LLVM 14.0 IR to LLVM 7.0 IR (the version consumed by NVVM).
"""
buf = []
for line in ir.splitlines():
if line.startswith('attributes #'):
# Remove function attributes unsupported by LLVM 7.0
m = re_attributes_def.match(line)
attrs = m.group(1).split()
attrs = ' '.join(a for a in attrs if a != 'willreturn')
line = line.replace(m.group(1), attrs)
buf.append(line)
return '\n'.join(buf)
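# For example, a line such as
#   attributes #0 = { nounwind willreturn }
# is rewritten as
#   attributes #0 = { nounwind }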
def set_cuda_kernel(function):
"""
Mark a function as a CUDA kernel. Kernels have the following requirements:
- Metadata that marks them as a kernel.
- Addition to the @llvm.used list, so that they will not be discarded.
- The noinline attribute is not permitted, because this causes NVVM to emit
a warning, which counts as failing IR verification.
Presently it is assumed that there is one kernel per module, which holds
for Numba-jitted functions. If this changes in future or this function is
to be used externally, this function may need modification to add to the
@llvm.used list rather than creating it.
"""
module = function.module
# Add kernel metadata
mdstr = ir.MetaDataString(module, "kernel")
mdvalue = ir.Constant(ir.IntType(32), 1)
md = module.add_metadata((function, mdstr, mdvalue))
nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
nmd.add(md)
# Create the used list
ptrty = ir.IntType(8).as_pointer()
usedty = ir.ArrayType(ptrty, 1)
fnptr = function.bitcast(ptrty)
llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
llvm_used.linkage = 'appending'
llvm_used.section = 'llvm.metadata'
llvm_used.initializer = ir.Constant(usedty, [fnptr])
# Remove 'noinline' if it is present.
function.attributes.discard('noinline')
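# After this transformation the module contains, illustratively, for a
# hypothetical kernel @my_kernel:
#   @llvm.used = appending global [1 x i8*] [...], section "llvm.metadata"
#   !nvvm.annotations = !{!0}
#   !0 = !{void ()* @my_kernel, !"kernel", i32 1}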
def add_ir_version(mod):
"""Add NVVM IR version to module"""
# We specify the IR version to match the current NVVM's IR version
i32 = ir.IntType(32)
ir_versions = [i32(v) for v in NVVM().get_ir_version()]
md_ver = mod.add_metadata(ir_versions)
mod.add_named_metadata('nvvmir.version', md_ver)

View File

@@ -0,0 +1,10 @@
"""
Declarations of the Runtime API functions.
"""
from ctypes import c_int, POINTER
API_PROTOTYPES = {
# cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion )
'cudaRuntimeGetVersion': (c_int, POINTER(c_int)),
}
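# Each entry maps a Runtime API name to (restype, *argtypes); the Runtime
# object in runtime.py uses these prototypes to bind the ctypes functions
# lazily on first attribute access.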

View File

@@ -0,0 +1,142 @@
"""
CUDA Runtime wrapper.
This provides a very minimal set of bindings, since the Runtime API is not
really used in Numba except for querying the Runtime version.
"""
import ctypes
import functools
import sys
from numba.core import config
from numba.cuda.cudadrv.driver import ERROR_MAP, make_logger
from numba.cuda.cudadrv.error import CudaSupportError, CudaRuntimeError
from numba.cuda.cudadrv.libs import open_cudalib
from numba.cuda.cudadrv.rtapi import API_PROTOTYPES
from numba.cuda.cudadrv import enums
class CudaRuntimeAPIError(CudaRuntimeError):
"""
Raised when there is an error accessing a C API from the CUDA Runtime.
"""
def __init__(self, code, msg):
self.code = code
self.msg = msg
super().__init__(code, msg)
def __str__(self):
return "[%s] %s" % (self.code, self.msg)
class Runtime:
"""
Runtime object that lazily binds runtime API functions.
"""
def __init__(self):
self.is_initialized = False
def _initialize(self):
# lazily initialize logger
global _logger
_logger = make_logger()
if config.DISABLE_CUDA:
msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
"in the environment, or because CUDA is unsupported on "
"32-bit systems.")
raise CudaSupportError(msg)
self.lib = open_cudalib('cudart')
self.is_initialized = True
def __getattr__(self, fname):
# First request of a runtime API function
try:
proto = API_PROTOTYPES[fname]
except KeyError:
raise AttributeError(fname)
restype = proto[0]
argtypes = proto[1:]
if not self.is_initialized:
self._initialize()
# Find function in runtime library
libfn = self._find_api(fname)
libfn.restype = restype
libfn.argtypes = argtypes
safe_call = self._wrap_api_call(fname, libfn)
setattr(self, fname, safe_call)
return safe_call
def _wrap_api_call(self, fname, libfn):
@functools.wraps(libfn)
def safe_cuda_api_call(*args):
_logger.debug('call runtime api: %s', libfn.__name__)
retcode = libfn(*args)
self._check_error(fname, retcode)
return safe_cuda_api_call
def _check_error(self, fname, retcode):
if retcode != enums.CUDA_SUCCESS:
errname = ERROR_MAP.get(retcode, "cudaErrorUnknown")
msg = "Call to %s results in %s" % (fname, errname)
_logger.error(msg)
raise CudaRuntimeAPIError(retcode, msg)
def _find_api(self, fname):
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Delay missing function error to use
def absent_function(*args, **kws):
msg = "runtime missing function: %s."
raise CudaRuntimeError(msg % fname)
setattr(self, fname, absent_function)
return absent_function
def get_version(self):
"""
Returns the CUDA Runtime version as a tuple (major, minor).
"""
rtver = ctypes.c_int()
self.cudaRuntimeGetVersion(ctypes.byref(rtver))
# The version is encoded as (1000 * major) + (10 * minor)
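        # For example, a raw value of 11080 decodes to (11, 8).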
major = rtver.value // 1000
minor = (rtver.value - (major * 1000)) // 10
return (major, minor)
def is_supported_version(self):
"""
Returns True if the CUDA Runtime is a supported version.
"""
return self.get_version() in self.supported_versions
@property
def supported_versions(self):
"""A tuple of all supported CUDA toolkit versions. Versions are given in
the form ``(major_version, minor_version)``."""
if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64:
# Only 64-bit Linux and Windows are supported
return ()
return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
(11, 7))
runtime = Runtime()
def get_version():
"""
Return the runtime version as a tuple of (major, minor)
"""
return runtime.get_version()