Videre
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
"""CUDA Driver
|
||||
|
||||
- Driver API binding
|
||||
- NVVM API binding
|
||||
- Device array implementation
|
||||
|
||||
"""
|
||||
from numba.core import config
|
||||
assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator'
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,904 @@
|
||||
"""
|
||||
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
|
||||
on the object. If it exists and evaluates to True, it must define shape,
|
||||
strides, dtype and size attributes similar to a NumPy ndarray.
|
||||
"""
|
||||
|
||||
import math
|
||||
import functools
|
||||
import operator
|
||||
import copy
|
||||
from ctypes import c_void_p
|
||||
|
||||
import numpy as np
|
||||
|
||||
import numba
|
||||
from numba import _devicearray
|
||||
from numba.cuda.cudadrv import devices, dummyarray
|
||||
from numba.cuda.cudadrv import driver as _driver
|
||||
from numba.core import types, config
|
||||
from numba.np.unsafe.ndarray import to_fixed_tuple
|
||||
from numba.np.numpy_support import numpy_version
|
||||
from numba.np import numpy_support
|
||||
from numba.cuda.api_util import prepare_shape_strides_dtype
|
||||
from numba.core.errors import NumbaPerformanceWarning
|
||||
from warnings import warn
|
||||
|
||||
# Memoization decorator used for the per-ndim assignment kernels below
# (see _assign_kernel).  functools.lru_cache has existed since Python 3.2,
# which every supported interpreter provides, so the historical fallback
# for Python <= 3.1 (and the getattr indirection) is unnecessary.
lru_cache = functools.lru_cache(maxsize=None)
|
||||
|
||||
|
||||
def is_cuda_ndarray(obj):
    """Return the object's ``__cuda_ndarray__`` marker, or ``False`` when the
    attribute is absent."""
    return getattr(obj, "__cuda_ndarray__", False)
|
||||
|
||||
|
||||
def verify_cuda_ndarray_interface(obj):
    """Verify that *obj* satisfies the CUDA ndarray interface: it must carry
    the ``__cuda_ndarray__`` marker and expose correctly-typed ``shape``,
    ``strides``, ``dtype`` and ``size`` attributes."""
    require_cuda_ndarray(obj)

    def requires_attr(attr, typ):
        # The attribute must both exist and be an instance of *typ*.
        if not hasattr(obj, attr):
            raise AttributeError(attr)
        if not isinstance(getattr(obj, attr), typ):
            raise AttributeError('%s must be of type %s' % (attr, typ))

    for attr, typ in (('shape', tuple), ('strides', tuple),
                      ('dtype', np.dtype), ('size', int)):
        requires_attr(attr, typ)
|
||||
|
||||
|
||||
def require_cuda_ndarray(obj):
    """Raise ValueError if ``is_cuda_ndarray(obj)`` evaluates False."""
    if is_cuda_ndarray(obj):
        return
    raise ValueError('require an cuda ndarray object')
|
||||
|
||||
|
||||
class DeviceNDArrayBase(_devicearray.DeviceArray):
    """An on-GPU NDArray representation.
    """
    # Markers checked by is_cuda_memory() / is_cuda_ndarray().
    __cuda_memory__ = True
    __cuda_ndarray__ = True  # There must be gpu_data attribute

    def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
        """
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        """
        # Normalize scalar shape/strides to 1-tuples.
        if isinstance(shape, int):
            shape = (shape,)
        if isinstance(strides, int):
            strides = (strides,)
        dtype = np.dtype(dtype)
        self.ndim = len(shape)
        if len(strides) != self.ndim:
            raise ValueError('strides not match ndim')
        # Host-side mirror of the array layout, used for all slicing /
        # reshaping index arithmetic without touching device memory.
        self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                                 dtype.itemsize)
        self.shape = tuple(shape)
        self.strides = tuple(strides)
        self.dtype = dtype
        self.size = int(functools.reduce(operator.mul, self.shape, 1))
        # prepare gpu memory
        if self.size > 0:
            if gpu_data is None:
                # No buffer supplied: allocate one sized for this layout.
                self.alloc_size = _driver.memory_size_from_info(
                    self.shape, self.strides, self.dtype.itemsize)
                gpu_data = devices.get_context().memalloc(self.alloc_size)
            else:
                self.alloc_size = _driver.device_memory_size(gpu_data)
        else:
            # Make NULL pointer for empty allocation
            if _driver.USE_NV_BINDING:
                null = _driver.binding.CUdeviceptr(0)
            else:
                null = c_void_p(0)
            gpu_data = _driver.MemoryPointer(context=devices.get_context(),
                                             pointer=null, size=0)
            self.alloc_size = 0

        self.gpu_data = gpu_data
        self.stream = stream

    @property
    def __cuda_array_interface__(self):
        """Describe this array per the CUDA Array Interface (version 3)."""
        if _driver.USE_NV_BINDING:
            if self.device_ctypes_pointer is not None:
                ptr = int(self.device_ctypes_pointer)
            else:
                ptr = 0
        else:
            if self.device_ctypes_pointer.value is not None:
                ptr = self.device_ctypes_pointer.value
            else:
                ptr = 0

        return {
            'shape': tuple(self.shape),
            # Contiguous arrays advertise strides as None per the spec.
            'strides': None if is_contiguous(self) else tuple(self.strides),
            'data': (ptr, False),
            'typestr': self.dtype.str,
            'stream': int(self.stream) if self.stream != 0 else None,
            'version': 3,
        }

    def bind(self, stream=0):
        """Bind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        """
        # Shallow copy: the clone shares gpu_data with self.
        clone = copy.copy(self)
        clone.stream = stream
        return clone

    @property
    def T(self):
        """Transposed view (2-D arrays only); see :meth:`transpose`."""
        return self.transpose()

    def transpose(self, axes=None):
        """Return a transposed copy of the array.

        Only the 2-D case (or an identity permutation) is supported.
        """
        if axes and tuple(axes) == tuple(range(self.ndim)):
            # Identity permutation: nothing to do.
            return self
        elif self.ndim != 2:
            msg = "transposing a non-2D DeviceNDArray isn't supported"
            raise NotImplementedError(msg)
        elif axes is not None and set(axes) != set(range(self.ndim)):
            raise ValueError("invalid axes list %r" % (axes,))
        else:
            # Imported here to avoid a circular import at module load time.
            from numba.cuda.kernels.transpose import transpose
            return transpose(self)

    def _default_stream(self, stream):
        # Explicit stream argument wins; otherwise fall back to the stream
        # bound to this array.
        return self.stream if not stream else stream

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        # Typing considerations:
        #
        # 1. The preference is to use 'C' or 'F' layout since this enables
        # hardcoding stride values into compiled kernels, which is more
        # efficient than storing a passed-in value in a register.
        #
        # 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
        # the more likely / common case.
        #
        # 3. If an array is broadcast then it must be typed as 'A' - using 'C'
        # or 'F' does not apply for broadcast arrays, because the strides, some
        # of which will be 0, will not match those hardcoded in for 'C' or 'F'
        # layouts.

        broadcast = 0 in self.strides
        if self.flags['C_CONTIGUOUS'] and not broadcast:
            layout = 'C'
        elif self.flags['F_CONTIGUOUS'] and not broadcast:
            layout = 'F'
        else:
            layout = 'A'

        dtype = numpy_support.from_dtype(self.dtype)
        return types.Array(dtype, self.ndim, layout)

    @property
    def device_ctypes_pointer(self):
        """Returns the ctypes pointer to the GPU data buffer
        """
        if self.gpu_data is None:
            # NULL pointer in whichever representation the driver binding uses.
            if _driver.USE_NV_BINDING:
                return _driver.binding.CUdeviceptr(0)
            else:
                return c_void_p(0)
        else:
            return self.gpu_data.device_ctypes_pointer

    @devices.require_context
    def copy_to_device(self, ary, stream=0):
        """Copy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a host-to-device transfer.
        """
        if ary.size == 0:
            # Nothing to do
            return

        sentry_contiguous(self)
        stream = self._default_stream(stream)

        # Strip broadcast (stride-0) dimensions before checking compatibility.
        self_core, ary_core = array_core(self), array_core(ary)
        if _driver.is_device_memory(ary):
            sentry_contiguous(ary)
            check_array_compatibility(self_core, ary_core)
            _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
        else:
            # Ensure same contiguity. Only makes a host-side copy if necessary
            # (i.e., in order to materialize a writable strided view)
            ary_core = np.array(
                ary_core,
                order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
                subok=True,
                # NumPy 2.0 removed copy=False "copy only if needed";
                # None is its replacement with the same semantics.
                copy=(not ary_core.flags['WRITEABLE'])
                if numpy_version < (2, 0) else None)
            check_array_compatibility(self_core, ary_core)
            _driver.host_to_device(self, ary_core, self.alloc_size,
                                   stream=stream)

    @devices.require_context
    def copy_to_host(self, ary=None, stream=0):
        """Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part of the given stream. Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if any(s < 0 for s in self.strides):
            msg = 'D->H copy not implemented for negative strides: {}'
            raise NotImplementedError(msg.format(self.strides))
        assert self.alloc_size >= 0, "Negative memory size"
        stream = self._default_stream(stream)
        if ary is None:
            # Raw byte buffer sized to the device allocation; reshaped below.
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:
            check_array_compatibility(self, ary)
            hostary = ary

        if self.alloc_size != 0:
            _driver.device_to_host(hostary, self, self.alloc_size,
                                   stream=stream)

        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     buffer=hostary)
            else:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     strides=self.strides, buffer=hostary)
        return hostary

    def split(self, section, stream=0):
        """Split the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.

        Yields DeviceNDArray views (no data is copied).
        """
        stream = self._default_stream(stream)
        if self.ndim != 1:
            raise ValueError("only support 1d array")
        if self.strides[0] != self.dtype.itemsize:
            raise ValueError("only support unit stride")
        nsect = int(math.ceil(float(self.size) / section))
        strides = self.strides
        itemsize = self.dtype.itemsize
        for i in range(nsect):
            begin = i * section
            end = min(begin + section, self.size)
            shape = (end - begin,)
            # View into the parent allocation covering this section's bytes.
            gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
            yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
                                gpu_data=gpu_data)

    def as_cuda_arg(self):
        """Returns a device memory object that is used as the argument.
        """
        return self.gpu_data

    def get_ipc_handle(self):
        """
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        """
        ipch = devices.get_context().get_ipc_handle(self.gpu_data)
        # The layout description travels with the handle so the peer process
        # can reconstruct an equivalent DeviceNDArray.
        desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
        return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)

    def squeeze(self, axis=None, stream=0):
        """
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis
            with size greater than one is selected. If `None`, all axes with
            size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        """
        new_dummy, _ = self._dummy.squeeze(axis=axis)
        return DeviceNDArray(
            shape=new_dummy.shape,
            strides=new_dummy.strides,
            dtype=self.dtype,
            stream=self._default_stream(stream),
            gpu_data=self.gpu_data,
        )

    def view(self, dtype):
        """Returns a new object by reinterpretting the dtype without making a
        copy of the data.
        """
        dtype = np.dtype(dtype)
        shape = list(self.shape)
        strides = list(self.strides)

        if self.dtype.itemsize != dtype.itemsize:
            if not self.is_c_contiguous():
                raise ValueError(
                    "To change to a dtype of a different size,"
                    " the array must be C-contiguous"
                )

            # Rescale the last axis so the total byte count is unchanged.
            shape[-1], rem = divmod(
                shape[-1] * self.dtype.itemsize,
                dtype.itemsize
            )

            if rem != 0:
                raise ValueError(
                    "When changing to a larger dtype,"
                    " its size must be a divisor of the total size in bytes"
                    " of the last axis of the array."
                )

            strides[-1] = dtype.itemsize

        return DeviceNDArray(
            shape=shape,
            strides=strides,
            dtype=dtype,
            stream=self.stream,
            gpu_data=self.gpu_data,
        )

    @property
    def nbytes(self):
        """Size of the array data in bytes (itemsize * size)."""
        # Note: not using `alloc_size`. `alloc_size` reports memory
        # consumption of the allocation, not the size of the array
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
        return self.dtype.itemsize * self.size
|
||||
|
||||
|
||||
class DeviceRecord(DeviceNDArrayBase):
    '''
    An on-GPU record type
    '''
    def __init__(self, dtype, stream=0, gpu_data=None):
        # A record is a zero-dimensional array: empty shape and strides.
        shape = ()
        strides = ()
        super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
                                           gpu_data)

    @property
    def flags(self):
        """
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        """
        return dict(self._dummy.flags)  # defensive copy

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        return numpy_support.from_dtype(self.dtype)

    @devices.require_context
    def __getitem__(self, item):
        return self._do_getitem(item)

    @devices.require_context
    def getitem(self, item, stream=0):
        """Do `__getitem__(item)` with CUDA stream
        """
        return self._do_getitem(item, stream)

    def _do_getitem(self, item, stream=0):
        # Look up the named field and view the device buffer at its offset.
        stream = self._default_stream(stream)
        typ, offset = self.dtype.fields[item]
        newdata = self.gpu_data.view(offset)

        if typ.shape == ():
            if typ.names is not None:
                # Nested record: wrap the view in another DeviceRecord.
                return DeviceRecord(dtype=typ, stream=stream,
                                    gpu_data=newdata)
            else:
                # Scalar field: copy the single element back to the host.
                hostary = np.empty(1, dtype=typ)
                _driver.device_to_host(dst=hostary, src=newdata,
                                       size=typ.itemsize,
                                       stream=stream)
                return hostary[0]
        else:
            # Sub-array field: expose it as a C-ordered DeviceNDArray view.
            shape, strides, dtype = \
                prepare_shape_strides_dtype(typ.shape,
                                            None,
                                            typ.subdtype[0], 'C')
            return DeviceNDArray(shape=shape, strides=strides,
                                 dtype=dtype, gpu_data=newdata,
                                 stream=stream)

    @devices.require_context
    def __setitem__(self, key, value):
        return self._do_setitem(key, value)

    @devices.require_context
    def setitem(self, key, value, stream=0):
        """Do `__setitem__(key, value)` with CUDA stream
        """
        return self._do_setitem(key, value, stream=stream)

    def _do_setitem(self, key, value, stream=0):

        stream = self._default_stream(stream)

        # If the record didn't have a default stream, and the user didn't
        # provide a stream, then we will use the default stream for the
        # assignment kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        typ, offset = self.dtype.fields[key]
        newdata = self.gpu_data.view(offset)

        lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)

        # (2) prepare RHS

        # Coerce the host value to the field's scalar type before transfer.
        rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)

        # (3) do the copy

        _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)

        if synchronous:
            stream.synchronize()
|
||||
|
||||
|
||||
@lru_cache
def _assign_kernel(ndim):
    """
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    """
    from numba import cuda  # circular!

    if ndim == 0:
        # the (2, ndim) allocation below is not yet supported, so avoid it
        @cuda.jit
        def kernel(lhs, rhs):
            lhs[()] = rhs[()]
        return kernel

    @cuda.jit
    def kernel(lhs, rhs):
        location = cuda.grid(1)

        # Total element count of the destination, computed in-kernel.
        n_elements = 1
        for i in range(lhs.ndim):
            n_elements *= lhs.shape[i]
        if location >= n_elements:
            # bake n_elements into the kernel, better than passing it in
            # as another argument.
            return

        # [0, :] is the to-index (into `lhs`)
        # [1, :] is the from-index (into `rhs`)
        idx = cuda.local.array(
            shape=(2, ndim),
            dtype=types.int64)

        # Unravel the flat `location` into per-dimension indices,
        # last axis fastest.
        for i in range(ndim - 1, -1, -1):
            idx[0, i] = location % lhs.shape[i]
            # Size-1 rhs dimensions broadcast: their index is forced to 0.
            idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
            location //= lhs.shape[i]

        lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
    return kernel
|
||||
|
||||
|
||||
class DeviceNDArray(DeviceNDArrayBase):
    '''
    An on-GPU array type
    '''
    def is_f_contiguous(self):
        '''
        Return true if the array is Fortran-contiguous.
        '''
        return self._dummy.is_f_contig

    @property
    def flags(self):
        """
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        """
        return dict(self._dummy.flags)  # defensive copy

    def is_c_contiguous(self):
        '''
        Return true if the array is C-contiguous.
        '''
        return self._dummy.is_c_contig

    def __array__(self, dtype=None):
        """
        :return: an `numpy.ndarray`, so copies to the host.
        """
        if dtype:
            return self.copy_to_host().__array__(dtype)
        else:
            return self.copy_to_host().__array__()

    def __len__(self):
        # Length of the leading dimension, matching numpy semantics.
        return self.shape[0]

    def reshape(self, *newshape, **kws):
        """
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        """
        # Accept both reshape(a, b) and reshape((a, b)).
        if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
            newshape = newshape[0]

        cls = type(self)
        if newshape == self.shape:
            # nothing to do
            return cls(shape=self.shape, strides=self.strides,
                       dtype=self.dtype, gpu_data=self.gpu_data)

        newarr, extents = self._dummy.reshape(*newshape, **kws)

        # Only a single contiguous extent can be reshaped as a view.
        if extents == [self._dummy.extent]:
            return cls(shape=newarr.shape, strides=newarr.strides,
                       dtype=self.dtype, gpu_data=self.gpu_data)
        else:
            raise NotImplementedError("operation requires copying")

    def ravel(self, order='C', stream=0):
        '''
        Flattens a contiguous array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
        exception.
        '''
        stream = self._default_stream(stream)
        cls = type(self)
        newarr, extents = self._dummy.ravel(order=order)

        if extents == [self._dummy.extent]:
            return cls(shape=newarr.shape, strides=newarr.strides,
                       dtype=self.dtype, gpu_data=self.gpu_data,
                       stream=stream)

        else:
            raise NotImplementedError("operation requires copying")

    @devices.require_context
    def __getitem__(self, item):
        return self._do_getitem(item)

    @devices.require_context
    def getitem(self, item, stream=0):
        """Do `__getitem__(item)` with CUDA stream
        """
        return self._do_getitem(item, stream)

    def _do_getitem(self, item, stream=0):
        stream = self._default_stream(stream)

        # Index arithmetic happens on the host-side dummy array; the device
        # buffer is only viewed, never copied (except for scalar reads).
        arr = self._dummy.__getitem__(item)
        extents = list(arr.iter_contiguous_extent())
        cls = type(self)
        if len(extents) == 1:
            newdata = self.gpu_data.view(*extents[0])

            if not arr.is_array:
                # Check for structured array type (record)
                if self.dtype.names is not None:
                    return DeviceRecord(dtype=self.dtype, stream=stream,
                                        gpu_data=newdata)
                else:
                    # Element indexing
                    hostary = np.empty(1, dtype=self.dtype)
                    _driver.device_to_host(dst=hostary, src=newdata,
                                           size=self._dummy.itemsize,
                                           stream=stream)
                    return hostary[0]
            else:
                return cls(shape=arr.shape, strides=arr.strides,
                           dtype=self.dtype, gpu_data=newdata, stream=stream)
        else:
            # Non-contiguous result: view the full extent and keep strides.
            newdata = self.gpu_data.view(*arr.extent)
            return cls(shape=arr.shape, strides=arr.strides,
                       dtype=self.dtype, gpu_data=newdata, stream=stream)

    @devices.require_context
    def __setitem__(self, key, value):
        return self._do_setitem(key, value)

    @devices.require_context
    def setitem(self, key, value, stream=0):
        """Do `__setitem__(key, value)` with CUDA stream
        """
        return self._do_setitem(key, value, stream=stream)

    def _do_setitem(self, key, value, stream=0):

        stream = self._default_stream(stream)

        # If the array didn't have a default stream, and the user didn't provide
        # a stream, then we will use the default stream for the assignment
        # kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        arr = self._dummy.__getitem__(key)
        newdata = self.gpu_data.view(*arr.extent)

        if isinstance(arr, dummyarray.Element):
            # convert to a 0d array
            shape = ()
            strides = ()
        else:
            shape = arr.shape
            strides = arr.strides

        lhs = type(self)(
            shape=shape,
            strides=strides,
            dtype=self.dtype,
            gpu_data=newdata,
            stream=stream)

        # (2) prepare RHS

        rhs, _ = auto_device(value, stream=stream, user_explicit=True)
        if rhs.ndim > lhs.ndim:
            raise ValueError("Can't assign %s-D array to %s-D self" % (
                rhs.ndim,
                lhs.ndim))
        # Left-pad the RHS shape with 1s so it broadcasts against the LHS.
        rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
        # negative indices would not work if rhs.ndim == 0
        rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
        rhs = rhs.reshape(*rhs_shape)
        for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
            if r != 1 and l != r:
                raise ValueError("Can't copy sequence with size %d to array "
                                 "axis %d with dimension %d" % ( r, i, l))

        # (3) do the copy

        n_elements = functools.reduce(operator.mul, lhs.shape, 1)
        _assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
        if synchronous:
            stream.synchronize()
|
||||
|
||||
|
||||
class IpcArrayHandle(object):
    """
    An IPC array handle that can be serialized and transferred to another
    process on the same machine to share a GPU allocation.

    On the destination process, use the *.open()* method to create a new
    *DeviceNDArray* object that shares the allocation from the original
    process. To release the resources, call the *.close()* method. After
    that, the destination can no longer use the shared array object. (Note:
    the underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    """

    def __init__(self, ipc_handle, array_desc):
        # The raw IPC handle plus the shape/strides/dtype description needed
        # to rebuild a DeviceNDArray around the shared allocation.
        self._ipc_handle = ipc_handle
        self._array_desc = array_desc

    def open(self):
        """
        Return a new *DeviceNDArray* that shares the allocation from the
        original process. Must not be used on the original process.
        """
        dptr = self._ipc_handle.open(devices.get_context())
        return DeviceNDArray(gpu_data=dptr, **self._array_desc)

    def close(self):
        """
        Close the IPC handle to the array.
        """
        self._ipc_handle.close()

    def __enter__(self):
        return self.open()

    def __exit__(self, type, value, traceback):
        self.close()
|
||||
|
||||
|
||||
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA mapped memory.
    """

    def device_setup(self, gpu_data, stream=0):
        # Attach the device-side buffer and the default stream to this view.
        self.stream = stream
        self.gpu_data = gpu_data
|
||||
|
||||
|
||||
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA managed memory.
    """

    def device_setup(self, gpu_data, stream=0):
        # Attach the device-side buffer and the default stream to this view.
        self.stream = stream
        self.gpu_data = gpu_data
|
||||
|
||||
|
||||
def from_array_like(ary, stream=0, gpu_data=None):
    """Create a DeviceNDArray object that is like ary."""
    return DeviceNDArray(ary.shape, ary.strides, ary.dtype,
                         stream=stream, gpu_data=gpu_data)
|
||||
|
||||
|
||||
def from_record_like(rec, stream=0, gpu_data=None):
    """Create a DeviceRecord object that is like rec."""
    device_rec = DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
    return device_rec
|
||||
|
||||
|
||||
def array_core(ary):
    """
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    """
    # 0-d arrays (empty strides) and empty arrays are returned unchanged.
    if not ary.strides or not ary.size:
        return ary
    # Index 0 collapses each broadcast (stride-0) axis; full slices keep
    # the real axes.
    core_index = tuple(0 if stride == 0 else slice(None)
                       for stride in ary.strides)
    return ary[core_index]
|
||||
|
||||
|
||||
def is_contiguous(ary):
    """
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    """
    # Walk the axes from innermost to outermost, tracking the stride a
    # C-contiguous layout would require at each step.
    expected_stride = ary.dtype.itemsize
    for dim, stride in zip(reversed(ary.shape), reversed(ary.strides)):
        if dim <= 1 or stride == 0:
            # Size-1 and broadcast dimensions place no constraint.
            continue
        if stride != expected_stride:
            return False
        expected_stride *= dim
    return True
|
||||
|
||||
|
||||
# Shared error message used when rejecting transfers of non-contiguous
# host buffers (raised by sentry_contiguous below).
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
                            "be transferred as a single memory region. Please "
                            "ensure contiguous buffer with numpy "
                            ".ascontiguousarray()")
|
||||
|
||||
|
||||
def sentry_contiguous(ary):
    """Raise ValueError unless the core of *ary* is C- or F-contiguous."""
    flags = array_core(ary).flags
    if not (flags['C_CONTIGUOUS'] or flags['F_CONTIGUOUS']):
        raise ValueError(errmsg_contiguous_buffer)
|
||||
|
||||
|
||||
def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.

    Returns a ``(device_object, copied)`` pair, where ``copied`` indicates
    whether a new device allocation was made.
    """
    if _driver.is_device_memory(obj):
        # Already on the device: hand it back untouched.
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        # Foreign device array (e.g. from another CUDA library): wrap it
        # without copying.
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead -- copies -- for `obj`s
            # that are already `ndarray`s.
            obj = np.array(
                obj,
                # NumPy 2.0 replaced copy=False ("copy only if needed")
                # with copy=None.
                copy=False if numpy_version < (2, 0) else None,
                subok=True)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if config.CUDA_WARN_ON_IMPLICIT_COPY:
                # Warn only for genuine host ndarrays that the user did not
                # explicitly ask to transfer.
                if (
                    not user_explicit and
                    (not isinstance(obj, DeviceNDArray)
                     and isinstance(obj, np.ndarray))
                ):
                    msg = ("Host array used in CUDA kernel will incur "
                           "copy overhead to/from device.")
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
|
||||
|
||||
|
||||
def check_array_compatibility(ary1, ary2):
    """Raise TypeError/ValueError unless *ary1* and *ary2* have matching
    dtype, (squeezed) shape and — for nonzero-sized arrays — strides."""
    sq1, sq2 = ary1.squeeze(), ary2.squeeze()
    if ary1.dtype != ary2.dtype:
        raise TypeError('incompatible dtype: %s vs. %s' %
                        (ary1.dtype, ary2.dtype))
    if sq1.shape != sq2.shape:
        raise ValueError('incompatible shape: %s vs. %s' %
                         (ary1.shape, ary2.shape))
    # We check strides only if the size is nonzero, because strides are
    # irrelevant (and can differ) for zero-length copies.
    if ary1.size and sq1.strides != sq2.strides:
        raise ValueError('incompatible strides: %s vs. %s' %
                         (ary1.strides, ary2.strides))
|
||||
@@ -0,0 +1,248 @@
|
||||
"""
|
||||
Expose each GPU devices directly.
|
||||
|
||||
This module implements an API that is like the "CUDA runtime" context manager
|
||||
for managing CUDA context stack and clean up. It relies on thread-local globals
|
||||
to separate the context stack management of each thread. Contexts are also
|
||||
shareable among threads. Only the main thread can destroy Contexts.
|
||||
|
||||
Note:
|
||||
- This module must be imported by the main-thread.
|
||||
|
||||
"""
|
||||
import functools
|
||||
import threading
|
||||
from contextlib import contextmanager
|
||||
|
||||
from .driver import driver, USE_NV_BINDING
|
||||
|
||||
|
||||
class _DeviceList(object):
|
||||
def __getattr__(self, attr):
|
||||
# First time looking at "lst" attribute.
|
||||
if attr == "lst":
|
||||
# Device list is not initialized.
|
||||
# Query all CUDA devices.
|
||||
numdev = driver.get_device_count()
|
||||
gpus = [_DeviceContextManager(driver.get_device(devid))
|
||||
for devid in range(numdev)]
|
||||
# Define "lst" to avoid re-initialization
|
||||
self.lst = gpus
|
||||
return gpus
|
||||
|
||||
# Other attributes
|
||||
return super(_DeviceList, self).__getattr__(attr)
|
||||
|
||||
def __getitem__(self, devnum):
|
||||
'''
|
||||
Returns the context manager for device *devnum*.
|
||||
'''
|
||||
return self.lst[devnum]
|
||||
|
||||
def __str__(self):
|
||||
return ', '.join([str(d) for d in self.lst])
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.lst)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.lst)
|
||||
|
||||
@property
|
||||
def current(self):
|
||||
"""Returns the active device or None if there's no active device
|
||||
"""
|
||||
with driver.get_active_context() as ac:
|
||||
devnum = ac.devnum
|
||||
if devnum is not None:
|
||||
return self[devnum]
|
||||
|
||||
|
||||
class _DeviceContextManager(object):
|
||||
"""
|
||||
Provides a context manager for executing in the context of the chosen
|
||||
device. The normal use of instances of this type is from
|
||||
``numba.cuda.gpus``. For example, to execute on device 2::
|
||||
|
||||
with numba.cuda.gpus[2]:
|
||||
d_a = numba.cuda.to_device(a)
|
||||
|
||||
to copy the array *a* onto device 2, referred to by *d_a*.
|
||||
"""
|
||||
|
||||
def __init__(self, device):
|
||||
self._device = device
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self._device, item)
|
||||
|
||||
def __enter__(self):
|
||||
_runtime.get_or_create_context(self._device.id)
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
# this will verify that we are popping the right device context.
|
||||
self._device.get_primary_context().pop()
|
||||
|
||||
def __str__(self):
|
||||
return "<Managed Device {self.id}>".format(self=self)
|
||||
|
||||
|
||||
class _Runtime(object):
    """Emulate the CUDA runtime context management.

    It owns all Devices and Contexts.
    Keeps at most one Context per Device
    """

    def __init__(self):
        # All visible CUDA devices, lazily populated on first access.
        self.gpus = _DeviceList()

        # For caching the attached CUDA Context
        self._tls = threading.local()

        # Remember the main thread
        # Only the main thread can *actually* destroy
        self._mainthread = threading.current_thread()

        # Avoid mutation of runtime state in multithreaded programs
        self._lock = threading.RLock()

    @contextmanager
    def ensure_context(self):
        """Ensure a CUDA context is available inside the context.

        On entrance, queries the CUDA driver for an active CUDA context and
        attaches it in TLS for subsequent calls so they do not need to query
        the CUDA driver again. On exit, detach the CUDA context from the TLS.

        This will allow us to pick up a third-party-activated CUDA context in
        any top-level Numba CUDA API.
        """
        with driver.get_active_context():
            oldctx = self._get_attached_context()
            newctx = self.get_or_create_context(None)
            self._set_attached_context(newctx)
            try:
                yield
            finally:
                # Restore whatever was attached before entry, even on error.
                self._set_attached_context(oldctx)

    def get_or_create_context(self, devnum):
        """Returns the primary context and push+create it if needed
        for *devnum*.  If *devnum* is None, use the active CUDA context (must
        be primary) or create a new one with ``devnum=0``.
        """
        if devnum is None:
            # Fast path: reuse the context cached in TLS if present.
            attached_ctx = self._get_attached_context()
            if attached_ctx is None:
                return self._get_or_create_context_uncached(devnum)
            else:
                return attached_ctx
        else:
            if USE_NV_BINDING:
                # NV binding device handles coerce to int device ordinals.
                devnum = int(devnum)
            return self._activate_context_for(devnum)

    def _get_or_create_context_uncached(self, devnum):
        """See also ``get_or_create_context(devnum)``.
        This version does not read the cache.
        """
        with self._lock:
            # Try to get the active context in the CUDA stack or
            # activate GPU-0 with the primary context
            with driver.get_active_context() as ac:
                if not ac:
                    return self._activate_context_for(0)
                else:
                    # Get primary context for the active device
                    ctx = self.gpus[ac.devnum].get_primary_context()
                    # Is active context the primary context?
                    # Handle representations differ between the ctypes and
                    # NV bindings, so normalize both to plain ints first.
                    if USE_NV_BINDING:
                        ctx_handle = int(ctx.handle)
                        ac_ctx_handle = int(ac.context_handle)
                    else:
                        ctx_handle = ctx.handle.value
                        ac_ctx_handle = ac.context_handle.value
                    if ctx_handle != ac_ctx_handle:
                        msg = ('Numba cannot operate on non-primary'
                               ' CUDA context {:x}')
                        raise RuntimeError(msg.format(ac_ctx_handle))
                    # Ensure the context is ready
                    ctx.prepare_for_use()
                return ctx

    def _activate_context_for(self, devnum):
        # Push (and create if needed) the primary context of *devnum*,
        # refusing to silently switch away from a different cached context.
        with self._lock:
            gpu = self.gpus[devnum]
            newctx = gpu.get_primary_context()
            # Detect unexpected context switch
            cached_ctx = self._get_attached_context()
            if cached_ctx is not None and cached_ctx is not newctx:
                raise RuntimeError('Cannot switch CUDA-context.')
            newctx.push()
            return newctx

    def _get_attached_context(self):
        # Returns the context cached in this thread's TLS, or None.
        return getattr(self._tls, 'attached_context', None)

    def _set_attached_context(self, ctx):
        self._tls.attached_context = ctx

    def reset(self):
        """Clear all contexts in the thread.  Destroy the context if and only
        if we are in the main thread.
        """
        # Pop all active context.
        while driver.pop_active_context() is not None:
            pass

        # If it is the main thread
        if threading.current_thread() == self._mainthread:
            self._destroy_all_contexts()

    def _destroy_all_contexts(self):
        # Reset all devices
        for gpu in self.gpus:
            gpu.reset()
|
||||
|
||||
|
||||
# Process-wide singleton owning all devices and contexts.
_runtime = _Runtime()

# ================================ PUBLIC API ================================

# Sequence of per-device context managers, exposed as ``numba.cuda.gpus``.
gpus = _runtime.gpus
|
||||
|
||||
|
||||
def get_context(devnum=None):
    """Return the CUDA context for device *devnum*, or for the currently
    active device (creating a context if needed) when *devnum* is None.
    """
    return _runtime.get_or_create_context(devnum)
|
||||
|
||||
|
||||
def require_context(fn):
    """Decorate *fn* so that a CUDA context is guaranteed to be available
    for the duration of each call.

    Note: the wrapped function must not switch CUDA contexts itself.
    """
    @functools.wraps(fn)
    def wrapped(*args, **kws):
        with _runtime.ensure_context():
            return fn(*args, **kws)

    return wrapped
|
||||
|
||||
|
||||
def reset():
    """Reset the CUDA subsystem for the current thread.

    In the main thread:
        This removes all CUDA contexts.  Only use this at shutdown or for
        cleaning up between tests.

    In non-main threads:
        This clears the CUDA context stack only.
    """
    _runtime.reset()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,394 @@
|
||||
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
|
||||
c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)
|
||||
|
||||
from numba.cuda.cudadrv import _extras
|
||||
|
||||
# ctypes aliases for CUDA driver API handle/enum types.  The handles are
# opaque; we only need ABI-compatible ctypes representations for them.
cu_device = c_int
cu_device_attribute = c_int  # enum
cu_context = c_void_p  # an opaque handle
cu_module = c_void_p  # an opaque handle
cu_jit_option = c_int  # enum
cu_jit_input_type = c_int  # enum
cu_function = c_void_p  # an opaque handle
cu_device_ptr = c_size_t  # defined as unsigned long long
cu_stream = c_void_p  # an opaque handle
cu_event = c_void_p
cu_link_state = c_void_p
cu_function_attribute = c_int
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE)  # 64 bytes wide
cu_uuid = (c_byte * 16)  # Device UUID

# Signature of the Python-side CUstreamCallback trampoline:
# void (*)(CUstream, CUresult, userData)
cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)

# CUoccupancyB2DSize: size_t (*)(int blockSize)
cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)

# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
CU_STREAM_DEFAULT = 0
CU_STREAM_LEGACY = 1
CU_STREAM_PER_THREAD = 2
|
||||
# Maps each driver API function name to a tuple of ctypes types:
# (restype, *argtypes).  The first element is always c_int (CUresult).
API_PROTOTYPES = {
    # CUresult cuInit(unsigned int Flags);
    'cuInit' : (c_int, c_uint),

    # CUresult cuDriverGetVersion (int* driverVersion )
    'cuDriverGetVersion': (c_int, POINTER(c_int)),

    # CUresult cuDeviceGetCount(int *count);
    'cuDeviceGetCount': (c_int, POINTER(c_int)),

    # CUresult cuDeviceGet(CUdevice *device, int ordinal);
    'cuDeviceGet': (c_int, POINTER(cu_device), c_int),

    # CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
    'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),

    # CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
    #                               CUdevice dev);
    'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
                             cu_device),

    # CUresult cuDeviceComputeCapability(int *major, int *minor,
    #                                    CUdevice dev);
    'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
                                  cu_device),

    # CUresult cuDevicePrimaryCtxGetState(
    #              CUdevice dev,
    #              unsigned int* flags,
    #              int* active)
    'cuDevicePrimaryCtxGetState': (c_int,
                                   cu_device, POINTER(c_uint), POINTER(c_int)),

    # CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
    'cuDevicePrimaryCtxRelease': (c_int, cu_device),

    # CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
    'cuDevicePrimaryCtxReset': (c_int, cu_device),

    # CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
    'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),

    # CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
    'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),

    # CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
    #                      CUdevice dev);
    'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),

    # CUresult cuCtxGetDevice ( CUdevice * device )
    'cuCtxGetDevice': (c_int, POINTER(cu_device)),

    # CUresult cuCtxGetCurrent (CUcontext *pctx);
    'cuCtxGetCurrent': (c_int, POINTER(cu_context)),

    # CUresult cuCtxPushCurrent (CUcontext pctx);
    'cuCtxPushCurrent': (c_int, cu_context),

    # CUresult cuCtxPopCurrent (CUcontext *pctx);
    'cuCtxPopCurrent': (c_int, POINTER(cu_context)),

    # CUresult cuCtxDestroy(CUcontext pctx);
    'cuCtxDestroy': (c_int, cu_context),

    # CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
    #                             unsigned int numOptions,
    #                             CUjit_option *options,
    #                             void **optionValues);
    'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
                           POINTER(cu_jit_option), POINTER(c_void_p)),

    # CUresult cuModuleUnload(CUmodule hmod);
    'cuModuleUnload': (c_int, cu_module),

    # CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
    #                              const char *name);
    'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),

    # CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
    #                              hmod, const char* name )
    'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
                          cu_module, c_char_p),

    # CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
    #                                       CUfunc_cache config);
    'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),

    # CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
    'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),

    # CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
    #                            unsigned int flags);
    'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),

    # CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
    'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),

    # CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
    #                          size_t N, CUstream hStream);
    'cuMemsetD8Async': (c_int,
                        cu_device_ptr, c_uint8, c_size_t, cu_stream),

    # CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
    #                       size_t ByteCount);
    'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),

    # CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
    #                            size_t ByteCount, CUstream hStream);
    'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
                          cu_stream),

    # CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
    #                       size_t ByteCount);
    'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),

    # CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
    #                            size_t ByteCount, CUstream hStream);
    'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
                          cu_stream),

    # CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
    #                       size_t ByteCount);
    'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),

    # CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
    #                            size_t ByteCount, CUstream hStream);
    'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
                          cu_stream),

    # CUresult cuMemFree(CUdeviceptr dptr);
    'cuMemFree': (c_int, cu_device_ptr),

    # CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
    'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),

    # CUresult cuStreamDestroy(CUstream hStream);
    'cuStreamDestroy': (c_int, cu_stream),

    # CUresult cuStreamSynchronize(CUstream hStream);
    'cuStreamSynchronize': (c_int, cu_stream),

    # CUresult cuStreamAddCallback(
    #              CUstream hStream,
    #              CUstreamCallback callback,
    #              void* userData,
    #              unsigned int flags)
    'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
                            py_object, c_uint),

    # CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
    #                         unsigned int gridDimY,
    #                         unsigned int gridDimZ,
    #                         unsigned int blockDimX,
    #                         unsigned int blockDimY,
    #                         unsigned int blockDimZ,
    #                         unsigned int sharedMemBytes,
    #                         CUstream hStream, void **kernelParams,
    #                         void ** extra)
    'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
                       c_uint, c_uint, c_uint, c_uint, cu_stream,
                       POINTER(c_void_p), POINTER(c_void_p)),

    # CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
    #                                    unsigned int gridDimY,
    #                                    unsigned int gridDimZ,
    #                                    unsigned int blockDimX,
    #                                    unsigned int blockDimY,
    #                                    unsigned int blockDimZ,
    #                                    unsigned int sharedMemBytes,
    #                                    CUstream hStream, void **kernelParams)
    'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
                                  c_uint, c_uint, c_uint, c_uint, cu_stream,
                                  POINTER(c_void_p)),

    # CUresult cuMemHostAlloc ( void ** pp,
    #                           size_t bytesize,
    #                           unsigned int Flags )
    'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),

    # CUresult cuMemFreeHost ( void * p )
    'cuMemFreeHost': (c_int, c_void_p),

    # CUresult cuMemHostRegister(void * p,
    #                            size_t bytesize,
    #                            unsigned int Flags)
    'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),

    # CUresult cuMemHostUnregister(void * p)
    'cuMemHostUnregister': (c_int, c_void_p),

    # CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
    #                                    void * p,
    #                                    unsigned int Flags)
    'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
                                  c_void_p, c_uint),

    # CUresult cuMemGetInfo(size_t * free, size_t * total)
    'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)),

    # CUresult cuEventCreate ( CUevent * phEvent,
    #                          unsigned int Flags )
    'cuEventCreate': (c_int, POINTER(cu_event), c_uint),

    # CUresult cuEventDestroy ( CUevent hEvent )
    'cuEventDestroy': (c_int, cu_event),

    # CUresult cuEventElapsedTime ( float * pMilliseconds,
    #                               CUevent hStart,
    #                               CUevent hEnd )
    'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),

    # CUresult cuEventQuery ( CUevent hEvent )
    'cuEventQuery': (c_int, cu_event),

    # CUresult cuEventRecord ( CUevent hEvent,
    #                          CUstream hStream )
    'cuEventRecord': (c_int, cu_event, cu_stream),

    # CUresult cuEventSynchronize ( CUevent hEvent )
    'cuEventSynchronize': (c_int, cu_event),

    # CUresult cuStreamWaitEvent ( CUstream hStream,
    #                              CUevent hEvent,
    #                              unsigned int Flags )
    'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),

    # CUresult cuPointerGetAttribute (
    #              void *data,
    #              CUpointer_attribute attribute,
    #              CUdeviceptr ptr)
    'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),

    # CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
    #                                 size_t * psize,
    #                                 CUdeviceptr dptr )
    'cuMemGetAddressRange': (c_int,
                             POINTER(cu_device_ptr),
                             POINTER(c_size_t),
                             cu_device_ptr),

    # CUresult cuMemHostGetFlags ( unsigned int * pFlags,
    #                              void * p )
    'cuMemHostGetFlags': (c_int,
                          POINTER(c_uint),
                          c_void_p),

    # CUresult cuCtxSynchronize ( void )
    'cuCtxSynchronize' : (c_int,),

    # CUresult
    # cuLinkCreate(unsigned int numOptions, CUjit_option *options,
    #              void **optionValues, CUlinkState *stateOut);
    'cuLinkCreate': (c_int,
                     c_uint, POINTER(cu_jit_option),
                     POINTER(c_void_p), POINTER(cu_link_state)),

    # CUresult
    # cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
    #               size_t size, const char *name, unsigned
    #               int numOptions, CUjit_option *options,
    #               void **optionValues);
    'cuLinkAddData': (c_int,
                      cu_link_state, cu_jit_input_type, c_void_p,
                      c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
                      POINTER(c_void_p)),

    # CUresult
    # cuLinkAddFile(CUlinkState state, CUjitInputType type,
    #               const char *path, unsigned int numOptions,
    #               CUjit_option *options, void **optionValues);
    'cuLinkAddFile': (c_int,
                      cu_link_state, cu_jit_input_type, c_char_p, c_uint,
                      POINTER(cu_jit_option), POINTER(c_void_p)),

    # CUresult CUDAAPI
    # cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
    'cuLinkComplete': (c_int,
                       cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),

    # CUresult CUDAAPI
    # cuLinkDestroy(CUlinkState state)
    'cuLinkDestroy': (c_int, cu_link_state),

    # cuProfilerStart ( void )
    'cuProfilerStart': (c_int,),

    # cuProfilerStop ( void )
    'cuProfilerStop': (c_int,),

    # CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
    #                               CUfunction hfunc )
    'cuFuncGetAttribute': (c_int,
                           POINTER(c_int), cu_function_attribute, cu_function),

    # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
    #              int *numBlocks,
    #              CUfunction func,
    #              int blockSize,
    #              size_t dynamicSMemSize);
    'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
                                                    cu_function, c_size_t,
                                                    c_uint),

    # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
    #              int *numBlocks,
    #              CUfunction func,
    #              int blockSize,
    #              size_t dynamicSMemSize,
    #              unsigned int flags);
    'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
                                                             POINTER(c_int),
                                                             cu_function,
                                                             c_size_t, c_uint),

    # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
    #              int *minGridSize, int *blockSize,
    #              CUfunction func,
    #              CUoccupancyB2DSize blockSizeToDynamicSMemSize,
    #              size_t dynamicSMemSize, int blockSizeLimit);
    'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
                                         cu_function, cu_occupancy_b2d_size,
                                         c_size_t, c_int),

    # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
    #              int *minGridSize, int *blockSize,
    #              CUfunction func,
    #              CUoccupancyB2DSize blockSizeToDynamicSMemSize,
    #              size_t dynamicSMemSize, int blockSizeLimit,
    #              unsigned int flags);
    'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
                                                  POINTER(c_int), cu_function,
                                                  cu_occupancy_b2d_size,
                                                  c_size_t, c_int, c_uint),

    # CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
    'cuIpcGetMemHandle': (c_int,
                          POINTER(cu_ipc_mem_handle), cu_device_ptr),

    # CUresult cuIpcOpenMemHandle(
    #              CUdeviceptr* pdptr,
    #              CUipcMemHandle handle,
    #              unsigned int Flags)
    'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
                           c_uint),

    # CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
    'cuIpcCloseMemHandle': (c_int, cu_device_ptr),

    # CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags)
    'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),

    # CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
    #                                  CUdevice dev, CUdevice peerDev )
    'cuDeviceCanAccessPeer': (c_int,
                              POINTER(c_int), cu_device, cu_device),

    # CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
    'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
}
|
||||
@@ -0,0 +1,452 @@
|
||||
from collections import namedtuple
|
||||
import itertools
|
||||
import functools
|
||||
import operator
|
||||
import ctypes
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numba import _helperlib
|
||||
|
||||
# Half-open (begin, end) byte offsets of a contiguous memory region.
Extent = namedtuple("Extent", ["begin", "end"])

# C helper (exported by numba._helperlib) that computes new strides for a
# reshape without copying; returns nonzero on success.
attempt_nocopy_reshape = ctypes.CFUNCTYPE(
    ctypes.c_int,
    ctypes.c_long,  # nd
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # dims
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # strides
    ctypes.c_long,  # newnd
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # newdims
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # newstrides
    ctypes.c_long,  # itemsize
    ctypes.c_int,  # is_f_order
)(_helperlib.c_helpers['attempt_nocopy_reshape'])
|
||||
|
||||
|
||||
class Dim(object):
    """A single dimension of the array

    Attributes
    ----------
    start:
        start offset
    stop:
        stop offset
    size:
        number of items
    stride:
        item stride
    """
    __slots__ = 'start', 'stop', 'size', 'stride', 'single'

    def __init__(self, start, stop, size, stride, single):
        self.start = start
        self.stop = stop
        self.size = size
        self.stride = stride
        self.single = single
        # A "single" dimension (integer-indexed) always has exactly one item.
        assert not single or size == 1

    def __getitem__(self, item):
        if not isinstance(item, slice):
            # Integer index: reduce to a length-1 slice, then mark the
            # result as a dropped ("single") dimension.
            sl = self[-1:] if item == -1 else self[item:item + 1]
            if sl.size != 1:
                raise IndexError
            return Dim(
                start=sl.start,
                stop=sl.stop,
                size=sl.size,
                stride=sl.stride,
                single=True,
            )

        begin, end, step = item.indices(self.size)
        newstride = step * self.stride
        # Offsets advance by |stride| so a negative-stride dimension still
        # maps its window onto increasing offsets.
        newstart = self.start + begin * abs(self.stride)
        newstop = self.start + end * abs(self.stride)
        if newstride == 0:
            newsize = 1
        else:
            newsize = _compute_size(newstart, newstop, newstride)
        return Dim(
            start=newstart,
            stop=newstop,
            size=newsize,
            stride=newstride,
            single=False,
        )

    def get_offset(self, idx):
        """Offset of item *idx* within this dimension."""
        return self.start + idx * self.stride

    def __repr__(self):
        strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)"
        return strfmt % (self.start, self.stop, self.size, self.stride)

    def normalize(self, base):
        """Return a copy with offsets rebased relative to *base*."""
        return Dim(start=self.start - base, stop=self.stop - base,
                   size=self.size, stride=self.stride, single=self.single)

    def copy(self, start=None, stop=None, size=None, stride=None, single=None):
        """Return a copy, overriding every field that is not None."""
        return Dim(
            self.start if start is None else start,
            self.stop if stop is None else stop,
            self.size if size is None else size,
            self.stride if stride is None else stride,
            self.single if single is None else single,
        )

    def is_contiguous(self, itemsize):
        """True when consecutive items are exactly *itemsize* apart."""
        return self.stride == itemsize
|
||||
|
||||
|
||||
def compute_index(indices, dims):
    """Map an index tuple onto *dims*, accumulating each dimension's
    offset into a flat offset.
    """
    total = 0
    for idx, dim in zip(indices, dims):
        total += dim.get_offset(idx)
    return total
|
||||
|
||||
|
||||
class Element(object):
    """A scalar item of an array: carries only the memory extent it
    occupies, with no dimensions.
    """
    is_array = False

    def __init__(self, extent):
        self.extent = extent

    def iter_contiguous_extent(self):
        """Yield the single extent covering this element."""
        yield self.extent
|
||||
|
||||
|
||||
class Array(object):
|
||||
"""A dummy numpy array-like object. Consider it an array without the
|
||||
actual data, but offset from the base data pointer.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
dims: tuple of Dim
|
||||
describing each dimension of the array
|
||||
|
||||
ndim: int
|
||||
number of dimension
|
||||
|
||||
shape: tuple of int
|
||||
size of each dimension
|
||||
|
||||
strides: tuple of int
|
||||
stride of each dimension
|
||||
|
||||
itemsize: int
|
||||
itemsize
|
||||
|
||||
extent: (start, end)
|
||||
start and end offset containing the memory region
|
||||
"""
|
||||
is_array = True
|
||||
|
||||
@classmethod
|
||||
def from_desc(cls, offset, shape, strides, itemsize):
|
||||
dims = []
|
||||
for ashape, astride in zip(shape, strides):
|
||||
dim = Dim(offset, offset + ashape * astride, ashape, astride,
|
||||
single=False)
|
||||
dims.append(dim)
|
||||
offset = 0 # offset only applies to first dimension
|
||||
return cls(dims, itemsize)
|
||||
|
||||
def __init__(self, dims, itemsize):
|
||||
self.dims = tuple(dims)
|
||||
self.ndim = len(self.dims)
|
||||
self.shape = tuple(dim.size for dim in self.dims)
|
||||
self.strides = tuple(dim.stride for dim in self.dims)
|
||||
self.itemsize = itemsize
|
||||
self.size = functools.reduce(operator.mul, self.shape, 1)
|
||||
self.extent = self._compute_extent()
|
||||
self.flags = self._compute_layout()
|
||||
|
||||
def _compute_layout(self):
|
||||
# The logic here is based on that in _UpdateContiguousFlags from
|
||||
# numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit
|
||||
# 13661ac70).
|
||||
# https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191
|
||||
|
||||
# Records have no dims, and we can treat them as contiguous
|
||||
if not self.dims:
|
||||
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
|
||||
|
||||
# If this is a broadcast array then it is not contiguous
|
||||
if any([dim.stride == 0 for dim in self.dims]):
|
||||
return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False}
|
||||
|
||||
flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
|
||||
|
||||
# Check C contiguity
|
||||
sd = self.itemsize
|
||||
for dim in reversed(self.dims):
|
||||
if dim.size == 0:
|
||||
# Contiguous by definition
|
||||
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
|
||||
if dim.size != 1:
|
||||
if dim.stride != sd:
|
||||
flags['C_CONTIGUOUS'] = False
|
||||
sd *= dim.size
|
||||
|
||||
# Check F contiguity
|
||||
sd = self.itemsize
|
||||
for dim in self.dims:
|
||||
if dim.size != 1:
|
||||
if dim.stride != sd:
|
||||
flags['F_CONTIGUOUS'] = False
|
||||
return flags
|
||||
sd *= dim.size
|
||||
|
||||
return flags
|
||||
|
||||
def _compute_extent(self):
|
||||
firstidx = [0] * self.ndim
|
||||
lastidx = [s - 1 for s in self.shape]
|
||||
start = compute_index(firstidx, self.dims)
|
||||
stop = compute_index(lastidx, self.dims) + self.itemsize
|
||||
stop = max(stop, start) # ensure positive extent
|
||||
return Extent(start, stop)
|
||||
|
||||
def __repr__(self):
|
||||
return '<Array dims=%s itemsize=%s>' % (self.dims, self.itemsize)
|
||||
|
||||
def __getitem__(self, item):
|
||||
if not isinstance(item, tuple):
|
||||
item = [item]
|
||||
else:
|
||||
item = list(item)
|
||||
|
||||
nitem = len(item)
|
||||
ndim = len(self.dims)
|
||||
if nitem > ndim:
|
||||
raise IndexError("%d extra indices given" % (nitem - ndim,))
|
||||
|
||||
# Add empty slices for missing indices
|
||||
while len(item) < ndim:
|
||||
item.append(slice(None, None))
|
||||
|
||||
dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)]
|
||||
newshape = [d.size for d in dims if not d.single]
|
||||
|
||||
arr = Array(dims, self.itemsize)
|
||||
if newshape:
|
||||
return arr.reshape(*newshape)[0]
|
||||
else:
|
||||
return Element(arr.extent)
|
||||
|
||||
@property
|
||||
def is_c_contig(self):
|
||||
return self.flags['C_CONTIGUOUS']
|
||||
|
||||
@property
|
||||
def is_f_contig(self):
|
||||
return self.flags['F_CONTIGUOUS']
|
||||
|
||||
def iter_contiguous_extent(self):
|
||||
""" Generates extents
|
||||
"""
|
||||
if self.is_c_contig or self.is_f_contig:
|
||||
yield self.extent
|
||||
else:
|
||||
if self.dims[0].stride < self.dims[-1].stride:
|
||||
innerdim = self.dims[0]
|
||||
outerdims = self.dims[1:]
|
||||
outershape = self.shape[1:]
|
||||
else:
|
||||
innerdim = self.dims[-1]
|
||||
outerdims = self.dims[:-1]
|
||||
outershape = self.shape[:-1]
|
||||
|
||||
if innerdim.is_contiguous(self.itemsize):
|
||||
oslen = [range(s) for s in outershape]
|
||||
for indices in itertools.product(*oslen):
|
||||
base = compute_index(indices, outerdims)
|
||||
yield base + innerdim.start, base + innerdim.stop
|
||||
else:
|
||||
oslen = [range(s) for s in self.shape]
|
||||
for indices in itertools.product(*oslen):
|
||||
offset = compute_index(indices, self.dims)
|
||||
yield offset, offset + self.itemsize
|
||||
|
||||
def reshape(self, *newdims, **kws):
|
||||
oldnd = self.ndim
|
||||
newnd = len(newdims)
|
||||
|
||||
if newdims == self.shape:
|
||||
return self, None
|
||||
|
||||
order = kws.pop('order', 'C')
|
||||
if kws:
|
||||
raise TypeError('unknown keyword arguments %s' % kws.keys())
|
||||
if order not in 'CFA':
|
||||
raise ValueError('order not C|F|A')
|
||||
|
||||
# check for exactly one instance of -1 in newdims
|
||||
# https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501
|
||||
unknownidx = -1
|
||||
knownsize = 1
|
||||
for i, dim in enumerate(newdims):
|
||||
if dim < 0:
|
||||
if unknownidx == -1:
|
||||
unknownidx = i
|
||||
else:
|
||||
raise ValueError("can only specify one unknown dimension")
|
||||
else:
|
||||
knownsize *= dim
|
||||
|
||||
# compute the missing dimension
|
||||
if unknownidx >= 0:
|
||||
if knownsize == 0 or self.size % knownsize != 0:
|
||||
raise ValueError("cannot infer valid shape "
|
||||
"for unknown dimension")
|
||||
else:
|
||||
newdims = newdims[0:unknownidx] \
|
||||
+ (self.size // knownsize,) \
|
||||
+ newdims[unknownidx + 1:]
|
||||
|
||||
newsize = functools.reduce(operator.mul, newdims, 1)
|
||||
|
||||
if order == 'A':
|
||||
order = 'F' if self.is_f_contig else 'C'
|
||||
|
||||
if newsize != self.size:
|
||||
raise ValueError("reshape changes the size of the array")
|
||||
|
||||
if self.is_c_contig or self.is_f_contig:
|
||||
if order == 'C':
|
||||
newstrides = list(iter_strides_c_contig(self, newdims))
|
||||
elif order == 'F':
|
||||
newstrides = list(iter_strides_f_contig(self, newdims))
|
||||
else:
|
||||
raise AssertionError("unreachable")
|
||||
else:
|
||||
newstrides = np.empty(newnd, np.ctypeslib.c_intp)
|
||||
|
||||
# need to keep these around in variables, not temporaries, so they
|
||||
# don't get GC'ed before we call into the C code
|
||||
olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp)
|
||||
oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp)
|
||||
newdims = np.array(newdims, dtype=np.ctypeslib.c_intp)
|
||||
|
||||
if not attempt_nocopy_reshape(
|
||||
oldnd,
|
||||
olddims,
|
||||
oldstrides,
|
||||
newnd,
|
||||
newdims,
|
||||
newstrides,
|
||||
self.itemsize,
|
||||
order == 'F',
|
||||
):
|
||||
raise NotImplementedError('reshape would require copy')
|
||||
|
||||
ret = self.from_desc(self.extent.begin, shape=newdims,
|
||||
strides=newstrides, itemsize=self.itemsize)
|
||||
|
||||
return ret, list(self.iter_contiguous_extent())
|
||||
|
||||
def squeeze(self, axis=None):
    """Remove length-one dimensions from the array.

    Parameters
    ----------
    axis : int, tuple of int, or None
        When None, every axis of length 1 is dropped. Otherwise only the
        listed axes are dropped; a ValueError is raised if any of them
        does not have length 1.

    Returns
    -------
    (newarr, extents)
        The squeezed array descriptor and the list of contiguous extents.
    """
    if axis is None:
        # Keep every (length, stride) pair whose length is not 1.
        kept = [(length, stride)
                for length, stride in zip(self.shape, self.strides)
                if length != 1]
    else:
        if not isinstance(axis, tuple):
            axis = (axis,)
        # Validate the requested axes before building the new shape.
        for ax in axis:
            if self.shape[ax] != 1:
                raise ValueError(
                    "cannot select an axis to squeeze out which has size "
                    "not equal to one"
                )
        kept = [(length, stride)
                for i, (length, stride)
                in enumerate(zip(self.shape, self.strides))
                if i not in axis]
    newshape = [length for length, _ in kept]
    newstrides = [stride for _, stride in kept]
    newarr = self.from_desc(
        self.extent.begin,
        shape=newshape,
        strides=newstrides,
        itemsize=self.itemsize,
    )
    return newarr, list(self.iter_contiguous_extent())
|
||||
|
||||
def ravel(self, order='C'):
    """Flatten the array into one dimension without copying.

    Parameters
    ----------
    order : {'C', 'F', 'A'}
        Flattening order. 'A' accepts either contiguity.

    Raises
    ------
    ValueError
        If *order* is not one of C, F or A.
    NotImplementedError
        If the array layout does not permit a zero-copy ravel.
    """
    if order not in 'CFA':
        raise ValueError('order not C|F|A')

    # A zero-copy ravel is only possible when the requested order matches
    # the array's actual contiguity.
    c_ok = order in 'CA' and self.is_c_contig
    f_ok = order in 'FA' and self.is_f_contig
    if not (c_ok or f_ok):
        raise NotImplementedError("ravel on non-contiguous array")

    flatshape = (self.size,)
    flatstrides = (self.itemsize,)
    arr = self.from_desc(self.extent.begin, flatshape, flatstrides,
                         self.itemsize)
    return arr, list(self.iter_contiguous_extent())
|
||||
|
||||
|
||||
def iter_strides_f_contig(arr, shape=None):
    """Yield the byte strides of a Fortran-contiguous layout.

    Parameters
    ----------
    arr : array-like
        Provides ``itemsize`` (and ``shape`` when *shape* is None).
    shape : tuple of int, optional
        Shape to compute strides for; defaults to ``arr.shape``.
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    # First axis is the fastest-varying in Fortran order.
    yield itemsize
    # Each subsequent stride is the running product of the preceding
    # dimension lengths.  (Renamed from `sum`, which shadowed the builtin.)
    accum = 1
    for dim in shape[:-1]:
        accum *= dim
        yield accum * itemsize
|
||||
|
||||
|
||||
def iter_strides_c_contig(arr, shape=None):
    """Yield the byte strides of a C-contiguous layout.

    Parameters
    ----------
    arr : array-like
        Provides ``itemsize`` (and ``shape`` when *shape* is None).
    shape : tuple of int, optional
        Shape to compute strides for; defaults to ``arr.shape``.
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    # Build strides from the last (fastest-varying) axis backwards, then
    # emit them in axis order.  Replaces the original nested generator and
    # avoids shadowing the builtin `sum`.
    strides = [itemsize]
    for dim in reversed(shape[1:]):
        strides.append(strides[-1] * dim)
    yield from reversed(strides)
|
||||
|
||||
|
||||
def is_element_indexing(item, ndim):
    """Return True if *item* indexes a single element of an *ndim*-D array.

    A bare slice never selects a single element; a tuple does only when it
    supplies one non-slice index per dimension; any other scalar index does.
    """
    if isinstance(item, slice):
        return False
    if isinstance(item, tuple):
        has_slice = any(isinstance(entry, slice) for entry in item)
        return len(item) == ndim and not has_slice
    # Scalar (non-slice, non-tuple) index.
    return True
|
||||
|
||||
|
||||
def _compute_size(start, stop, step):
|
||||
"""Algorithm adapted from cpython rangeobject.c
|
||||
"""
|
||||
if step > 0:
|
||||
lo = start
|
||||
hi = stop
|
||||
else:
|
||||
lo = stop
|
||||
hi = start
|
||||
step = -step
|
||||
if lo >= hi:
|
||||
return 0
|
||||
return (hi - lo - 1) // step + 1
|
||||
@@ -0,0 +1,607 @@
|
||||
"""
|
||||
Enum values for CUDA driver. Information about the values
|
||||
can be found on the official NVIDIA documentation website.
|
||||
ref: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
|
||||
anchor: #group__CUDA__TYPES
|
||||
"""
|
||||
|
||||
|
||||
# Error codes
|
||||
|
||||
CUDA_SUCCESS = 0
|
||||
CUDA_ERROR_INVALID_VALUE = 1
|
||||
CUDA_ERROR_OUT_OF_MEMORY = 2
|
||||
CUDA_ERROR_NOT_INITIALIZED = 3
|
||||
CUDA_ERROR_DEINITIALIZED = 4
|
||||
CUDA_ERROR_PROFILER_DISABLED = 5
|
||||
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
|
||||
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
|
||||
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
|
||||
CUDA_ERROR_STUB_LIBRARY = 34
|
||||
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
|
||||
CUDA_ERROR_NO_DEVICE = 100
|
||||
CUDA_ERROR_INVALID_DEVICE = 101
|
||||
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
|
||||
CUDA_ERROR_INVALID_IMAGE = 200
|
||||
CUDA_ERROR_INVALID_CONTEXT = 201
|
||||
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
|
||||
CUDA_ERROR_MAP_FAILED = 205
|
||||
CUDA_ERROR_UNMAP_FAILED = 206
|
||||
CUDA_ERROR_ARRAY_IS_MAPPED = 207
|
||||
CUDA_ERROR_ALREADY_MAPPED = 208
|
||||
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
|
||||
CUDA_ERROR_ALREADY_ACQUIRED = 210
|
||||
CUDA_ERROR_NOT_MAPPED = 211
|
||||
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
|
||||
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
|
||||
CUDA_ERROR_ECC_UNCORRECTABLE = 214
|
||||
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
|
||||
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
|
||||
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
|
||||
CUDA_ERROR_INVALID_PTX = 218
|
||||
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
|
||||
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
|
||||
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
|
||||
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
|
||||
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
|
||||
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
|
||||
CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225
|
||||
CUDA_ERROR_INVALID_SOURCE = 300
|
||||
CUDA_ERROR_FILE_NOT_FOUND = 301
|
||||
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
|
||||
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
|
||||
CUDA_ERROR_OPERATING_SYSTEM = 304
|
||||
CUDA_ERROR_INVALID_HANDLE = 400
|
||||
CUDA_ERROR_ILLEGAL_STATE = 401
|
||||
CUDA_ERROR_NOT_FOUND = 500
|
||||
CUDA_ERROR_NOT_READY = 600
|
||||
CUDA_ERROR_LAUNCH_FAILED = 700
|
||||
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
|
||||
CUDA_ERROR_LAUNCH_TIMEOUT = 702
|
||||
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
|
||||
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
|
||||
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
|
||||
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
|
||||
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
|
||||
CUDA_ERROR_ASSERT = 710
|
||||
CUDA_ERROR_TOO_MANY_PEERS = 711
|
||||
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
|
||||
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
|
||||
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
|
||||
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
|
||||
CUDA_ERROR_MISALIGNED_ADDRESS = 716
|
||||
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
|
||||
CUDA_ERROR_INVALID_PC = 718
|
||||
CUDA_ERROR_LAUNCH_FAILED = 719
|
||||
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
|
||||
CUDA_ERROR_NOT_PERMITTED = 800
|
||||
CUDA_ERROR_NOT_SUPPORTED = 801
|
||||
CUDA_ERROR_SYSTEM_NOT_READY = 802
|
||||
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
|
||||
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
|
||||
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
|
||||
CUDA_ERROR_MPS_RPC_FAILURE = 806
|
||||
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
|
||||
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
|
||||
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
|
||||
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
|
||||
CUDA_ERROR_CDP_NOT_SUPPORTED = 811
|
||||
CUDA_ERROR_CDP_VERSION_MISMATCH = 812
|
||||
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
|
||||
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
|
||||
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
|
||||
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
|
||||
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
|
||||
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
|
||||
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
|
||||
CUDA_ERROR_CAPTURED_EVENT = 907
|
||||
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
|
||||
CUDA_ERROR_TIMEOUT = 909
|
||||
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
|
||||
CUDA_ERROR_EXTERNAL_DEVICE = 911
|
||||
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
|
||||
CUDA_ERROR_UNKNOWN = 999
|
||||
|
||||
|
||||
# Function cache configurations (CUfunc_cache)

# no preference for shared memory or L1 (default)
CU_FUNC_CACHE_PREFER_NONE = 0x00
# prefer larger shared memory and smaller L1 cache
CU_FUNC_CACHE_PREFER_SHARED = 0x01
# prefer larger L1 cache and smaller shared memory
CU_FUNC_CACHE_PREFER_L1 = 0x02
# prefer equal sized L1 cache and shared memory
CU_FUNC_CACHE_PREFER_EQUAL = 0x03


# Context creation flags (CUctx_flags) -- combined as a bitmask

# Automatic scheduling
CU_CTX_SCHED_AUTO = 0x00
# Set spin as default scheduling
CU_CTX_SCHED_SPIN = 0x01
# Set yield as default scheduling
CU_CTX_SCHED_YIELD = 0x02
# Set blocking synchronization as default scheduling
CU_CTX_SCHED_BLOCKING_SYNC = 0x04

# Bitmask covering the scheduling flags above
CU_CTX_SCHED_MASK = 0x07
# Support mapped pinned allocations
# This flag was deprecated as of CUDA 11.0 and it no longer has effect.
# All contexts as of CUDA 3.2 behave as though the flag is enabled.
CU_CTX_MAP_HOST = 0x08
# Keep local memory allocation after launch
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10
# Trigger coredumps from exceptions in this context
CU_CTX_COREDUMP_ENABLE = 0x20
# Enable user pipe to trigger coredumps in this context
CU_CTX_USER_COREDUMP_ENABLE = 0x40
# Force synchronous blocking on cudaMemcpy/cudaMemset
CU_CTX_SYNC_MEMOPS = 0x80

# Bitmask covering all valid context-creation flags
CU_CTX_FLAGS_MASK = 0xff
|
||||
|
||||
|
||||
# DEFINES
# Flag values for pinned host memory allocation/registration
# (cuMemHostAlloc / cuMemHostRegister), managed memory attachment,
# and event creation.

# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_PORTABLE = 0x01

# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_DEVICEMAP = 0x02

# If set, host memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 streaming load instruction
# (MOVNTDQA).
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_WRITECOMBINED = 0x04


# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_PORTABLE = 0x01

# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_DEVICEMAP = 0x02

# If set, the passed memory pointer is treated as pointing to some
# memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
# On Windows the flag is a no-op. On Linux that memory is marked
# as non cache-coherent for the GPU and is expected
# to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED
# if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older
# Linux kernel versions. On all other platforms, it is not supported
# and CUDA_ERROR_NOT_SUPPORTED is returned.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_IOMEMORY = 0x04

# If set, the passed memory pointer is treated as pointing to memory
# that is considered read-only by the device. On platforms without
# CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
# this flag is required in order to register memory mapped
# to the CPU as read-only. Support for the use of this flag can be
# queried from the device attribute
# CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.
# Using this flag with a current context associated with a device
# that does not have this attribute set will cause cuMemHostRegister
# to error with CUDA_ERROR_NOT_SUPPORTED.
CU_MEMHOSTREGISTER_READ_ONLY = 0x08


# CUDA Mem Attach Flags

# If set, managed memory is accessible from all streams on all devices.
CU_MEM_ATTACH_GLOBAL = 0x01

# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory is
# only accessible on the host (unless explicitly attached to a stream
# with cudaStreamAttachMemAsync, in which case it can be used in kernels
# launched on that stream).
CU_MEM_ATTACH_HOST = 0x02

# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory accesses
# on the associated device must only be from a single stream.
CU_MEM_ATTACH_SINGLE = 0x04


# Event creation flags

# Default event flag
CU_EVENT_DEFAULT = 0x0
# Event uses blocking synchronization
CU_EVENT_BLOCKING_SYNC = 0x1
# Event will not record timing data
CU_EVENT_DISABLE_TIMING = 0x2
# Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
CU_EVENT_INTERPROCESS = 0x4
|
||||
|
||||
|
||||
# Pointer information (CUpointer_attribute) -- queried via
# cuPointerGetAttribute()

# The CUcontext on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_CONTEXT = 1
# The CUmemorytype describing the physical location of a pointer
CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2
# The address at which a pointer's memory may be accessed on the device
CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3
# The address at which a pointer's memory may be accessed on the host
CU_POINTER_ATTRIBUTE_HOST_POINTER = 4
# A pair of tokens for use with the nv-p2p.h Linux kernel interface
CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5
# Synchronize every synchronous memory operation initiated on this region
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6
# A process-wide unique ID for an allocated memory region
CU_POINTER_ATTRIBUTE_BUFFER_ID = 7
# Indicates if the pointer points to managed memory
CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
# A device ordinal of a device on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
# 1 if this pointer maps to an allocation
# that is suitable for cudaIpcGetMemHandle, 0 otherwise
CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10
# Starting address for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11
# Size of the address range for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12
# 1 if this pointer is in a valid address range
# that is mapped to a backing allocation, 0 otherwise
CU_POINTER_ATTRIBUTE_MAPPED = 13
# Bitmask of allowed CUmemAllocationHandleType for this allocation
CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14
# 1 if the memory this pointer is referencing
# can be used with the GPUDirect RDMA API
CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15
# Returns the access flags the device associated
# with the current context has on the corresponding
# memory referenced by the pointer given
CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16
# Returns the mempool handle for the allocation
# if it was allocated from a mempool. Otherwise returns NULL
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17
# Size of the actual underlying mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18
# The start address of the mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19
# A process-wide unique id corresponding to the
# physical allocation the pointer belongs to
CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20


# Memory types (CUmemorytype)

# Host memory
CU_MEMORYTYPE_HOST = 0x01
# Device memory
CU_MEMORYTYPE_DEVICE = 0x02
# Array memory
CU_MEMORYTYPE_ARRAY = 0x03
# Unified device or host memory
CU_MEMORYTYPE_UNIFIED = 0x04
|
||||
|
||||
|
||||
# Device code formats (CUjitInputType) -- inputs accepted by the JIT linker

# Compiled device-class-specific device code
# Applicable options: none
CU_JIT_INPUT_CUBIN = 0

# PTX source code
# Applicable options: PTX compiler options
CU_JIT_INPUT_PTX = 1

# Bundle of multiple cubins and/or PTX of some device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_FATBINARY = 2

# Host object with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_OBJECT = 3

# Archive of host objects with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_LIBRARY = 4

# NOTE(review): value 5 (CU_JIT_INPUT_NVVM in the CUDA headers) is not
# listed here; 6 matches the header's CU_JIT_NUM_INPUT_TYPES -- confirm
# against the installed toolkit's cuda.h.
CU_JIT_NUM_INPUT_TYPES = 6
|
||||
|
||||
|
||||
# Online compiler and linker options (CUjit_option) -- keys for the
# options arrays passed to cuLinkCreate / cuModuleLoadDataEx

# Max number of registers that a thread may use.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_MAX_REGISTERS = 0

# IN: Specifies minimum number of threads per block to target compilation
# for
# OUT: Returns the number of threads the compiler actually targeted.
# This restricts the resource utilization fo the compiler (e.g. max
# registers) such that a block with the given number of threads should be
# able to launch based on register limitations. Note, this option does not
# currently take into account any other resource limitations, such as
# shared memory utilization.
# Cannot be combined with ::CU_JIT_TARGET.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_THREADS_PER_BLOCK = 1

# Overwrites the option value with the total wall clock time, in
# milliseconds, spent in the compiler and linker
# Option type: float
# Applies to: compiler and linker
CU_JIT_WALL_TIME = 2

# Pointer to a buffer in which to print any log messages
# that are informational in nature (the buffer size is specified via
# option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER = 3

# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4

# Pointer to a buffer in which to print any log messages that
# reflect errors (the buffer size is specified via option
# ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER = 5

# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6

# Level of optimizations to apply to generated code (0 - 4), with 4
# being the default and highest level of optimizations.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_OPTIMIZATION_LEVEL = 7

# No option value required. Determines the target based on the current
# attached context (default)
# Option type: No option value needed
# Applies to: compiler and linker
CU_JIT_TARGET_FROM_CUCONTEXT = 8

# Target is chosen based on supplied ::CUjit_target. Cannot be
# combined with ::CU_JIT_THREADS_PER_BLOCK.
# Option type: unsigned int for enumerated type ::CUjit_target
# Applies to: compiler and linker
CU_JIT_TARGET = 9

# Specifies choice of fallback strategy if matching cubin is not found.
# Choice is based on supplied ::CUjit_fallback.
# Option type: unsigned int for enumerated type ::CUjit_fallback
# Applies to: compiler only
CU_JIT_FALLBACK_STRATEGY = 10

# Specifies whether to create debug information in output (-g)
# (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_GENERATE_DEBUG_INFO = 11

# Generate verbose log messages (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_LOG_VERBOSE = 12

# Generate line number information (-lineinfo) (0: false, default)
# Option type: int
# Applies to: compiler only
CU_JIT_GENERATE_LINE_INFO = 13

# Specifies whether to enable caching explicitly (-dlcm)
# Choice is based on supplied ::CUjit_cacheMode_enum.
# Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum
# Applies to: compiler only
CU_JIT_CACHE_MODE = 14
|
||||
|
||||
|
||||
# CUfunction_attribute -- keys for cuFuncGetAttribute / cuFuncSetAttribute

# The maximum number of threads per block, beyond which a launch of the
# function would fail. This number depends on both the function and the
# device on which the function is currently loaded.
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0

# The size in bytes of statically-allocated shared memory required by
# this function. This does not include dynamically-allocated shared
# memory requested by the user at runtime.
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1

# The size in bytes of user-allocated constant memory required by this
# function.
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2

# The size in bytes of local memory used by each thread of this function.
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3

# The number of registers used by each thread of this function.
CU_FUNC_ATTRIBUTE_NUM_REGS = 4

# The PTX virtual architecture version for which the function was
# compiled. This value is the major PTX version * 10 + the minor PTX
# version, so a PTX version 1.3 function would return the value 13.
# Note that this may return the undefined value of 0 for cubins
# compiled prior to CUDA 3.0.
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5

# The binary architecture version for which the function was compiled.
# This value is the major binary version * 10 + the minor binary version,
# so a binary version 1.3 function would return the value 13. Note that
# this will return a value of 10 for legacy cubins that do not have a
# properly-encoded binary architecture version.
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6

# The attribute to indicate whether the function has been compiled
# with user specified option "-Xptxas --dlcm=ca" set
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7

# The maximum size in bytes of dynamically-allocated shared memory
# that can be used by this function. If the user-specified
# dynamic shared memory size is larger than this value,
# the launch will fail. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8

# On devices where the L1 cache and shared memory use the same
# hardware resources, this sets the shared memory carveout preference,
# in percent of the total shared memory. Refer to
# CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
# This is only a hint, and the driver can choose a different ratio
# if required to execute the function.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9

# If this attribute is set, the kernel must launch with a valid cluster
# size specified. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10

# The required cluster width in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11

# The required cluster height in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time.If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12

# The required cluster depth in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time.If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13

# Whether the function can be launched with non-portable cluster size.
# 1 is allowed, 0 is disallowed. A non-portable cluster size may only
# function on the specific SKUs the program is tested on.
# The launch might fail if the program is run on a different hardware platform.
# For more details refer to link :
# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES
CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14

# The block scheduling policy of a function.
# The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15
|
||||
|
||||
|
||||
# Device attributes (CUdevice_attribute) -- keys for cuDeviceGetAttribute.
# Gaps in the numbering (e.g. 44, 92-94) correspond to values not listed
# here; presumably deprecated/legacy attributes -- confirm against cuda.h.

CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_WIDTH = 21
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_WIDTH = 22
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_HEIGHT = 23
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH = 24
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT = 25
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH = 26
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_WIDTH = 27
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_HEIGHT = 28
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_LAYERS = 29
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTI_PROCESSOR = 39
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_WIDTH = 42
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_LAYERS = 43
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_WIDTH = 45
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_HEIGHT = 46
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH_ALT = 47
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT_ALT = 48
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH_ALT = 49
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_WIDTH = 52
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_WIDTH = 53
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_LAYERS = 54
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_WIDTH = 55
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_WIDTH = 56
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_HEIGHT = 57
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_WIDTH = 58
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_HEIGHT = 59
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_DEPTH = 60
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_WIDTH = 61
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_LAYERS = 62
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_WIDTH = 63
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_HEIGHT = 64
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_LAYERS = 65
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_WIDTH = 66
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_WIDTH = 67
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_LAYERS = 68
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LINEAR_WIDTH = 69
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_WIDTH = 70
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_HEIGHT = 71
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_PITCH = 72
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_WIDTH = 73
# NOTE(review): the doubled "MAX_MAX" looks like a typo for
# ..._MAX_TEXTURE_2D_MIPMAPPED_HEIGHT; renaming would break existing
# callers, so the name is kept as-is -- confirm before changing.
CU_DEVICE_ATTRIBUTE_MAX_MAX_TEXTURE_2D_MIPMAPPED_HEIGHT = 74
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_MIPMAPPED_WIDTH = 77
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83
CU_DEVICE_ATTRIBUTE_IS_MULTI_GPU_BOARD = 84
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97
|
||||
@@ -0,0 +1,36 @@
|
||||
class CudaDriverError(Exception):
    """Exception type raised for CUDA driver errors."""
    pass
|
||||
|
||||
|
||||
class CudaRuntimeError(Exception):
    """Exception type raised for CUDA runtime errors."""
    pass
|
||||
|
||||
|
||||
class CudaSupportError(ImportError):
    """Raised when CUDA support is unavailable; subclasses ImportError so
    callers can treat it as a failed import."""
    pass
|
||||
|
||||
|
||||
class NvvmError(Exception):
    """Exception type for NVVM failures.

    All constructor arguments are rendered one per line in the message.
    """
    def __str__(self):
        return '\n'.join(str(part) for part in self.args)
|
||||
|
||||
|
||||
class NvvmSupportError(ImportError):
    """Raised when NVVM support is unavailable; subclasses ImportError so
    callers can treat it as a failed import."""
    pass
|
||||
|
||||
|
||||
class NvvmWarning(Warning):
    """Warning category for NVVM-related diagnostics."""
    pass
|
||||
|
||||
|
||||
class NvrtcError(Exception):
    """Exception type for NVRTC failures.

    All constructor arguments are rendered one per line in the message.
    """
    def __str__(self):
        return '\n'.join(str(part) for part in self.args)
|
||||
|
||||
|
||||
class NvrtcCompilationError(NvrtcError):
    """Raised when NVRTC compilation fails."""
    pass
|
||||
|
||||
|
||||
class NvrtcSupportError(ImportError):
    """Raised when NVRTC support is unavailable; subclasses ImportError so
    callers can treat it as a failed import."""
    pass
|
||||
@@ -0,0 +1,176 @@
|
||||
"""CUDA Toolkit libraries lookup utilities.
|
||||
|
||||
CUDA Toolkit libraries can be available via either:
|
||||
|
||||
- the `cuda-nvcc` and `cuda-nvrtc` conda packages for CUDA 12,
|
||||
- the `cudatoolkit` conda package for CUDA 11,
|
||||
- a user supplied location from CUDA_HOME,
|
||||
- a system wide location,
|
||||
- package-specific locations (e.g. the Debian NVIDIA packages),
|
||||
- or can be discovered by the system loader.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import ctypes
|
||||
|
||||
from numba.misc.findlib import find_lib
|
||||
from numba.cuda.cuda_paths import get_cuda_paths
|
||||
from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
|
||||
from numba.cuda.cudadrv.error import CudaSupportError
|
||||
|
||||
|
||||
if sys.platform == 'win32':
|
||||
_dllnamepattern = '%s.dll'
|
||||
_staticnamepattern = '%s.lib'
|
||||
elif sys.platform == 'darwin':
|
||||
_dllnamepattern = 'lib%s.dylib'
|
||||
_staticnamepattern = 'lib%s.a'
|
||||
else:
|
||||
_dllnamepattern = 'lib%s.so'
|
||||
_staticnamepattern = 'lib%s.a'
|
||||
|
||||
|
||||
def get_libdevice():
    """Return the path of the libdevice bitcode file, as discovered by
    ``get_cuda_paths()`` (may be None if not found)."""
    return get_cuda_paths()['libdevice'].info
|
||||
|
||||
|
||||
def open_libdevice():
    """Read the libdevice bitcode file and return its contents as bytes."""
    libdevice_path = get_libdevice()
    with open(libdevice_path, 'rb') as bcfile:
        return bcfile.read()
|
||||
|
||||
|
||||
def get_cudalib(lib, static=False):
    """
    Find the path of a CUDA library based on a search of known locations. If
    the search fails, return a generic filename for the library (e.g.
    'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system
    loader's search mechanism.

    :param lib: Base name of the library, e.g. 'nvvm', 'cudart'.
    :param static: Look for the static variant of the library.
    """
    # nvvm has its own dedicated entry in the CUDA paths lookup.
    if lib == 'nvvm':
        return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm'

    dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
    libdir = get_cuda_paths()[dir_type].info

    namepattern = _staticnamepattern if static else _dllnamepattern
    candidates = find_lib(lib, libdir, static=static)
    if candidates:
        # Prefer the lexicographically greatest candidate (typically the
        # highest version).
        return max(candidates)
    return namepattern % lib
|
||||
|
||||
|
||||
def open_cudalib(lib):
    """Locate and load the named CUDA library, returning a ctypes CDLL."""
    return ctypes.CDLL(get_cudalib(lib))
|
||||
|
||||
|
||||
def check_static_lib(path):
    """Raise FileNotFoundError unless *path* is an existing regular file."""
    if os.path.isfile(path):
        return
    raise FileNotFoundError(f'{path} not found')
|
||||
|
||||
|
||||
def _get_source_variable(lib, static=False):
    """Return the name of the search mechanism ('by') through which *lib*
    was located in the CUDA paths lookup."""
    if lib == 'nvvm':
        key = 'nvvm'
    elif lib == 'libdevice':
        key = 'libdevice'
    else:
        key = 'static_cudalib_dir' if static else 'cudalib_dir'
    return get_cuda_paths()[key].by
|
||||
|
||||
|
||||
def test():
    """Test library lookup. Path info is printed to stdout.

    Checks, in order: the CUDA driver, the dynamic libraries (nvvm, nvrtc,
    cudart), the static cudadevrt library, and the libdevice bitcode file,
    printing the outcome of each check.

    :return: True if all checks passed, False otherwise.
    """
    failed = False

    # Check for the driver
    try:
        dlloader, candidates = locate_driver_and_loader()
        print('Finding driver from candidates:')
        for location in candidates:
            print(f'\t{location}')
        print(f'Using loader {dlloader}')
        print('\tTrying to load driver', end='...')
        dll, path = load_driver(dlloader, candidates)
        print('\tok')
        print(f'\t\tLoaded from {path}')
    except CudaSupportError as e:
        print(f'\tERROR: failed to open driver: {e}')
        failed = True

    # Find the absolute location of the driver on Linux. Various driver-related
    # issues have been reported by WSL2 users, and it is almost always due to a
    # Linux (i.e. not- WSL2) driver being installed in a WSL2 system.
    # Providing the absolute location of the driver indicates its version
    # number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to
    # look up whether the driver was intended for "native" Linux.
    if sys.platform == 'linux' and not failed:
        pid = os.getpid()
        mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps')
        try:
            with open(mapsfile) as f:
                maps = f.read()
        # It's difficult to predict all that might go wrong reading the maps
        # file - in case various error conditions ensue (the file is not found,
        # not readable, etc.) we use OSError to hopefully catch any of them.
        except OSError:
            # It's helpful to report that this went wrong to the user, but we
            # don't set failed to True because this doesn't have any connection
            # to actual CUDA functionality.
            print(f'\tERROR: Could not open {mapsfile} to determine absolute '
                  'path to libcuda.so')
        else:
            # In this case we could read the maps, so we can report the
            # relevant ones to the user
            locations = set(s for s in maps.split() if 'libcuda.so' in s)
            print('\tMapped libcuda.so paths:')
            for location in locations:
                print(f'\t\t{location}')

    # Checks for dynamic libraries
    libs = 'nvvm nvrtc cudart'.split()
    for lib in libs:
        path = get_cudalib(lib)
        print('Finding {} from {}'.format(lib, _get_source_variable(lib)))
        print('\tLocated at', path)

        try:
            print('\tTrying to open library', end='...')
            open_cudalib(lib)
            print('\tok')
        except OSError as e:
            print('\tERROR: failed to open %s:\n%s' % (lib, e))
            failed = True

    # Check for cudadevrt (the only static library)
    lib = 'cudadevrt'
    path = get_cudalib(lib, static=True)
    print('Finding {} from {}'.format(lib, _get_source_variable(lib,
                                                                static=True)))
    print('\tLocated at', path)

    try:
        print('\tChecking library', end='...')
        check_static_lib(path)
        print('\tok')
    except FileNotFoundError as e:
        print('\tERROR: failed to find %s:\n%s' % (lib, e))
        failed = True

    # Check for libdevice
    where = _get_source_variable('libdevice')
    print(f'Finding libdevice from {where}')
    path = get_libdevice()
    print('\tLocated at', path)

    try:
        print('\tChecking library', end='...')
        check_static_lib(path)
        print('\tok')
    except FileNotFoundError as e:
        # Bug fix: this previously reported the stale value of ``lib``
        # ('cudadevrt') even though the libdevice check is what failed.
        print('\tERROR: failed to find %s:\n%s' % ('libdevice', e))
        failed = True

    return not failed
|
||||
@@ -0,0 +1,20 @@
|
||||
from numba.cuda.cudadrv import devices, driver
|
||||
from numba.core.registry import cpu_target
|
||||
|
||||
|
||||
def _calc_array_sizeof(ndim):
    """
    Use the ABI size in the CPU target
    """
    target_ctx = cpu_target.target_context
    return target_ctx.calc_array_sizeof(ndim)
|
||||
|
||||
|
||||
def ndarray_device_allocate_data(ary):
    """
    Allocate gpu data buffer
    """
    nbytes = driver.host_memory_size(ary)
    # Allocate a device buffer of the same size in the current context.
    return devices.get_context().memalloc(nbytes)
|
||||
@@ -0,0 +1,260 @@
|
||||
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
|
||||
from enum import IntEnum
|
||||
from numba.core import config
|
||||
from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
|
||||
NvrtcSupportError)
|
||||
|
||||
import functools
|
||||
import os
|
||||
import threading
|
||||
import warnings
|
||||
|
||||
# Opaque handle for compilation unit
|
||||
nvrtc_program = c_void_p
|
||||
|
||||
# Result code
|
||||
nvrtc_result = c_int
|
||||
|
||||
|
||||
class NvrtcResult(IntEnum):
    """Status codes returned by the NVRTC API (``nvrtcResult`` in nvrtc.h)."""
    NVRTC_SUCCESS = 0
    NVRTC_ERROR_OUT_OF_MEMORY = 1
    NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
    NVRTC_ERROR_INVALID_INPUT = 3
    NVRTC_ERROR_INVALID_PROGRAM = 4
    NVRTC_ERROR_INVALID_OPTION = 5
    NVRTC_ERROR_COMPILATION = 6
    NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
    NVRTC_ERROR_INTERNAL_ERROR = 11
|
||||
|
||||
|
||||
_nvrtc_lock = threading.Lock()
|
||||
|
||||
|
||||
class NvrtcProgram:
    """
    A class for managing the lifetime of nvrtcProgram instances. Instances of
    the class own an nvrtcProgram; when an instance is deleted, the underlying
    nvrtcProgram is destroyed using the appropriate NVRTC API.
    """
    def __init__(self, nvrtc, handle):
        # nvrtc: the NVRTC binding used for the eventual destroy call.
        # handle: opaque nvrtcProgram handle (a ctypes c_void_p).
        self._nvrtc = nvrtc
        self._handle = handle

    @property
    def handle(self):
        # Raw nvrtcProgram handle, for passing to NVRTC API calls.
        return self._handle

    def __del__(self):
        # Only destroy if a program was actually created (a null
        # handle is falsy).
        if self._handle:
            self._nvrtc.destroy_program(self)
|
||||
|
||||
|
||||
class NVRTC:
    """
    Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
    calls.

    The sole instance of this class is a process-wide singleton, similar to the
    NVVM interface. Initialization is protected by a lock and uses the standard
    (for Numba) open_cudalib function to load the NVRTC library.
    """
    # Mapping of NVRTC function name -> (restype, *argtypes); used in
    # __new__ to configure the ctypes bindings.
    _PROTOTYPES = {
        # nvrtcResult nvrtcVersion(int *major, int *minor)
        'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
        # nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
        #                                const char *src,
        #                                const char *name,
        #                                int numHeaders,
        #                                const char * const *headers,
        #                                const char * const *includeNames)
        'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p,
                               c_int, POINTER(c_char_p), POINTER(c_char_p)),
        # nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
        'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)),
        # nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
        #                                 int numOptions,
        #                                 const char * const *options)
        'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int,
                                POINTER(c_char_p)),
        # nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
        'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
        # nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
        'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p),
        # nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
        #                               size_t *cubinSizeRet);
        'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
        # nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
        'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p),
        # nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
        #                                    size_t *logSizeRet);
        'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program,
                                   POINTER(c_size_t)),
        # nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
        'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p),
    }

    # Singleton reference
    __INSTANCE = None

    def __new__(cls):
        # Creation is serialized by a lock so concurrent first-time callers
        # do not both load the library.
        with _nvrtc_lock:
            if cls.__INSTANCE is None:
                from numba.cuda.cudadrv.libs import open_cudalib
                cls.__INSTANCE = inst = object.__new__(cls)
                try:
                    lib = open_cudalib('nvrtc')
                except OSError as e:
                    # Reset the singleton so a later call can retry the load.
                    cls.__INSTANCE = None
                    raise NvrtcSupportError("NVRTC cannot be loaded") from e

                # Find & populate functions
                for name, proto in inst._PROTOTYPES.items():
                    func = getattr(lib, name)
                    func.restype = proto[0]
                    func.argtypes = proto[1:]

                    # Wrap each C function so non-success return codes raise.
                    # func/name are bound as default arguments to avoid the
                    # late-binding-closure pitfall inside this loop.
                    @functools.wraps(func)
                    def checked_call(*args, func=func, name=name):
                        error = func(*args)
                        if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
                            raise NvrtcCompilationError()
                        elif error != NvrtcResult.NVRTC_SUCCESS:
                            try:
                                error_name = NvrtcResult(error).name
                            except ValueError:
                                error_name = ('Unknown nvrtc_result '
                                              f'(error code: {error})')
                            msg = f'Failed to call {name}: {error_name}'
                            raise NvrtcError(msg)

                    setattr(inst, name, checked_call)

        return cls.__INSTANCE

    def get_version(self):
        """
        Get the NVRTC version as a tuple (major, minor).
        """
        major = c_int()
        minor = c_int()
        self.nvrtcVersion(byref(major), byref(minor))
        return major.value, minor.value

    def create_program(self, src, name):
        """
        Create an NVRTC program with managed lifetime.

        :param src: CUDA C/C++ source, as str or bytes.
        :param name: Source file name (informational), as str or bytes.
        :return: An NvrtcProgram owning the new nvrtcProgram handle.
        """
        if isinstance(src, str):
            src = src.encode()
        if isinstance(name, str):
            name = name.encode()

        handle = nvrtc_program()

        # The final three arguments are for passing the contents of headers -
        # this is not supported, so there are 0 headers and the header names
        # and contents are null.
        self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
        return NvrtcProgram(self, handle)

    def compile_program(self, program, options):
        """
        Compile an NVRTC program. Compilation may fail due to a user error in
        the source; this function returns ``True`` if there is a compilation
        error and ``False`` on success.
        """
        # We hold a list of encoded options to ensure they can't be collected
        # prior to the call to nvrtcCompileProgram
        encoded_options = [opt.encode() for opt in options]
        option_pointers = [c_char_p(opt) for opt in encoded_options]
        c_options_type = (c_char_p * len(options))
        c_options = c_options_type(*option_pointers)
        try:
            self.nvrtcCompileProgram(program.handle, len(options), c_options)
            return False
        except NvrtcCompilationError:
            return True

    def destroy_program(self, program):
        """
        Destroy an NVRTC program.
        """
        self.nvrtcDestroyProgram(byref(program.handle))

    def get_compile_log(self, program):
        """
        Get the compile log as a Python string.
        """
        log_size = c_size_t()
        self.nvrtcGetProgramLogSize(program.handle, byref(log_size))

        log = (c_char * log_size.value)()
        self.nvrtcGetProgramLog(program.handle, log)

        return log.value.decode()

    def get_ptx(self, program):
        """
        Get the compiled PTX as a Python string.
        """
        ptx_size = c_size_t()
        self.nvrtcGetPTXSize(program.handle, byref(ptx_size))

        ptx = (c_char * ptx_size.value)()
        self.nvrtcGetPTX(program.handle, ptx)

        return ptx.value.decode()
|
||||
|
||||
|
||||
def compile(src, name, cc):
    """
    Compile a CUDA C/C++ source to PTX for a given compute capability.

    :param src: The source code to compile
    :type src: str
    :param name: The filename of the source (for information only)
    :type name: str
    :param cc: A tuple ``(major, minor)`` of the compute capability
    :type cc: tuple
    :return: The compiled PTX and compilation log
    :rtype: tuple
    :raises NvrtcError: if compilation fails; the NVRTC log is included in
        the exception message
    """
    nvrtc = NVRTC()
    program = nvrtc.create_program(src, name)

    # Compilation options:
    # - Compile for the current device's compute capability.
    # - The CUDA include path is added.
    # - Relocatable Device Code (rdc) is needed to prevent device functions
    #   being optimized away.
    major, minor = cc
    arch = f'--gpu-architecture=compute_{major}{minor}'
    include = f'-I{config.CUDA_INCLUDE_PATH}'

    # The parent directory of this package is also added to the include
    # path - presumably so NVRTC can find headers shipped with Numba's CUDA
    # target; confirm against the headers' location.
    cudadrv_path = os.path.dirname(os.path.abspath(__file__))
    numba_cuda_path = os.path.dirname(cudadrv_path)
    numba_include = f'-I{numba_cuda_path}'
    options = [arch, include, numba_include, '-rdc', 'true']

    # Compile the program
    compile_error = nvrtc.compile_program(program, options)

    # Get log from compilation
    log = nvrtc.get_compile_log(program)

    # If the compile failed, provide the log in an exception
    if compile_error:
        msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}')
        raise NvrtcError(msg)

    # Otherwise, if there's any content in the log, present it as a warning
    if log:
        msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
        warnings.warn(msg)

    ptx = nvrtc.get_ptx(program)
    return ptx, log
|
||||
@@ -0,0 +1,707 @@
|
||||
"""
|
||||
This is a direct translation of nvvm.h
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
|
||||
c_char)
|
||||
|
||||
import threading
|
||||
|
||||
from llvmlite import ir
|
||||
|
||||
from .error import NvvmError, NvvmSupportError, NvvmWarning
|
||||
from .libs import get_libdevice, open_libdevice, open_cudalib
|
||||
from numba.core import cgutils, config
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ADDRSPACE_GENERIC = 0
|
||||
ADDRSPACE_GLOBAL = 1
|
||||
ADDRSPACE_SHARED = 3
|
||||
ADDRSPACE_CONSTANT = 4
|
||||
ADDRSPACE_LOCAL = 5
|
||||
|
||||
# Opaque handle for compilation unit
|
||||
nvvm_program = c_void_p
|
||||
|
||||
# Result code
|
||||
nvvm_result = c_int
|
||||
|
||||
RESULT_CODE_NAMES = '''
|
||||
NVVM_SUCCESS
|
||||
NVVM_ERROR_OUT_OF_MEMORY
|
||||
NVVM_ERROR_PROGRAM_CREATION_FAILURE
|
||||
NVVM_ERROR_IR_VERSION_MISMATCH
|
||||
NVVM_ERROR_INVALID_INPUT
|
||||
NVVM_ERROR_INVALID_PROGRAM
|
||||
NVVM_ERROR_INVALID_IR
|
||||
NVVM_ERROR_INVALID_OPTION
|
||||
NVVM_ERROR_NO_MODULE_IN_PROGRAM
|
||||
NVVM_ERROR_COMPILATION
|
||||
'''.split()
|
||||
|
||||
for i, k in enumerate(RESULT_CODE_NAMES):
|
||||
setattr(sys.modules[__name__], k, i)
|
||||
|
||||
# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
|
||||
|
||||
_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
|
||||
'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
|
||||
'v64:64:64-v128:128:128-n16:32:64')
|
||||
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
|
||||
'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
|
||||
'v64:64:64-v128:128:128-n16:32:64')
|
||||
|
||||
|
||||
def is_available():
    """
    Return if libNVVM is available
    """
    try:
        NVVM()
    except NvvmSupportError:
        return False
    return True
|
||||
|
||||
|
||||
_nvvm_lock = threading.Lock()
|
||||
|
||||
|
||||
class NVVM(object):
    '''Process-wide singleton.
    '''
    # Mapping of libNVVM function name -> (restype, *argtypes); used in
    # __new__ to configure the ctypes bindings.
    _PROTOTYPES = {

        # nvvmResult nvvmVersion(int *major, int *minor)
        'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),

        # nvvmResult nvvmCreateProgram(nvvmProgram *cu)
        'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),

        # nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
        'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),

        # nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
        #                                   size_t size, const char *name)
        'nvvmAddModuleToProgram': (
            nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),

        # nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
        #                                       const char* buffer,
        #                                       size_t size,
        #                                       const char *name)
        'nvvmLazyAddModuleToProgram': (
            nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),

        # nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
        #                               const char **options)
        'nvvmCompileProgram': (
            nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),

        # nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
        #                                      size_t *bufferSizeRet)
        'nvvmGetCompiledResultSize': (
            nvvm_result, nvvm_program, POINTER(c_size_t)),

        # nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
        'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),

        # nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
        #                                  size_t *bufferSizeRet)
        'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),

        # nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
        'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),

        # nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
        #                           int* minorDbg )
        'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
                          POINTER(c_int), POINTER(c_int)),
        # nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
        #                               const char** options)
        'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
                              POINTER(c_char_p))
    }

    # Singleton reference
    __INSTANCE = None

    def __new__(cls):
        # Creation is serialized by a lock so concurrent first-time callers
        # do not both load the library.
        with _nvvm_lock:
            if cls.__INSTANCE is None:
                cls.__INSTANCE = inst = object.__new__(cls)
                try:
                    inst.driver = open_cudalib('nvvm')
                except OSError as e:
                    # Reset the singleton so a later call can retry the load.
                    cls.__INSTANCE = None
                    errmsg = ("libNVVM cannot be found. Do `conda install "
                              "cudatoolkit`:\n%s")
                    raise NvvmSupportError(errmsg % e)

                # Find & populate functions
                for name, proto in inst._PROTOTYPES.items():
                    func = getattr(inst.driver, name)
                    func.restype = proto[0]
                    func.argtypes = proto[1:]
                    setattr(inst, name, func)

        return cls.__INSTANCE

    def __init__(self):
        # Note: although __new__ returns a singleton, __init__ runs on every
        # NVVM() call; all of these assignments are idempotent.
        ir_versions = self.get_ir_version()
        self._majorIR = ir_versions[0]
        self._minorIR = ir_versions[1]
        self._majorDbg = ir_versions[2]
        self._minorDbg = ir_versions[3]
        self._supported_ccs = get_supported_ccs()

    @property
    def data_layout(self):
        # NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support; pick
        # the matching data layout string.
        if (self._majorIR, self._minorIR) < (1, 8):
            return _datalayout_original
        else:
            return _datalayout_i128

    @property
    def supported_ccs(self):
        # Tuple of (major, minor) compute capabilities supported by the
        # detected CUDA toolkit (may be empty).
        return self._supported_ccs

    def get_version(self):
        """Return the libNVVM version as a (major, minor) tuple."""
        major = c_int()
        minor = c_int()
        err = self.nvvmVersion(byref(major), byref(minor))
        self.check_error(err, 'Failed to get version.')
        return major.value, minor.value

    def get_ir_version(self):
        """Return the NVVM IR version as a tuple
        (majorIR, minorIR, majorDbg, minorDbg)."""
        majorIR = c_int()
        minorIR = c_int()
        majorDbg = c_int()
        minorDbg = c_int()
        err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
                                 byref(majorDbg), byref(minorDbg))
        self.check_error(err, 'Failed to get IR version.')
        return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value

    def check_error(self, error, msg, exit=False):
        """Raise NvvmError(msg, <result name>) if *error* is non-zero.

        When ``exit`` is True, print the error and terminate the process
        instead of raising - used where raising is undesirable (e.g. from
        ``CompilationUnit.__del__``).
        """
        if error:
            exc = NvvmError(msg, RESULT_CODE_NAMES[error])
            if exit:
                print(exc)
                sys.exit(1)
            else:
                raise exc
|
||||
|
||||
|
||||
class CompilationUnit(object):
    """Manages the lifetime of an nvvmProgram: creation, module addition,
    verification/compilation, and destruction via the NVVM singleton."""
    def __init__(self):
        self.driver = NVVM()
        self._handle = nvvm_program()
        err = self.driver.nvvmCreateProgram(byref(self._handle))
        self.driver.check_error(err, 'Failed to create CU')

    def __del__(self):
        driver = NVVM()
        err = driver.nvvmDestroyProgram(byref(self._handle))
        # exit=True: never raise from a destructor; print and terminate
        # instead (see NVVM.check_error).
        driver.check_error(err, 'Failed to destroy CU', exit=True)

    def add_module(self, buffer):
        """
        Add a module level NVVM IR to a compilation unit.
        - The buffer should contain an NVVM module IR either in the bitcode
          representation (LLVM3.0) or in the text representation.
        """
        err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
                                                 len(buffer), None)
        self.driver.check_error(err, 'Failed to add module')

    def lazy_add_module(self, buffer):
        """
        Lazily add an NVVM IR module to a compilation unit.
        The buffer should contain NVVM module IR either in the bitcode
        representation or in the text representation.
        """
        err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
                                                     len(buffer), None)
        self.driver.check_error(err, 'Failed to add module')

    def compile(self, **options):
        """Perform Compilation.

        Compilation options are accepted as keyword arguments, with the
        following considerations:

        - Underscores (`_`) in option names are converted to dashes (`-`), to
          match NVVM's option name format.
        - Options that take a value will be emitted in the form
          "-<name>=<value>".
        - Booleans passed as option values will be converted to integers.
        - Options which take no value (such as `-gen-lto`) should have a value
          of `None` passed in and will be emitted in the form "-<name>".

        For documentation on NVVM compilation options, see the CUDA Toolkit
        Documentation:

        https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
        """

        def stringify_option(k, v):
            # Convert a (keyword, value) pair to NVVM's "-name[=value]" form.
            k = k.replace('_', '-')

            if v is None:
                return f'-{k}'

            if isinstance(v, bool):
                v = int(v)

            return f'-{k}={v}'

        options = [stringify_option(k, v) for k, v in options.items()]

        c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
                                             for x in options])
        # verify
        err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
        self._try_error(err, 'Failed to verify\n')

        # compile
        err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
        self._try_error(err, 'Failed to compile\n')

        # get result
        reslen = c_size_t()
        err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))

        self._try_error(err, 'Failed to get size of compiled result.')

        output_buffer = (c_char * reslen.value)()
        err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
        self._try_error(err, 'Failed to get compiled result.')

        # get log
        self.log = self.get_log()
        if self.log:
            warnings.warn(self.log, category=NvvmWarning)

        return output_buffer[:]

    def _try_error(self, err, msg):
        # Append the NVVM program log to the error message for context.
        self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))

    def get_log(self):
        """Return the NVVM program log as a string ('' when empty)."""
        reslen = c_size_t()
        err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
        self.driver.check_error(err, 'Failed to get compilation log size.')

        # A size of 1 is just the NUL terminator, i.e. an empty log.
        if reslen.value > 1:
            logbuf = (c_char * reslen.value)()
            err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
            self.driver.check_error(err, 'Failed to get compilation log.')

            return logbuf.value.decode('utf8')  # populate log attribute

        return ''
|
||||
|
||||
|
||||
COMPUTE_CAPABILITIES = (
|
||||
(3, 5), (3, 7),
|
||||
(5, 0), (5, 2), (5, 3),
|
||||
(6, 0), (6, 1), (6, 2),
|
||||
(7, 0), (7, 2), (7, 5),
|
||||
(8, 0), (8, 6), (8, 7), (8, 9),
|
||||
(9, 0)
|
||||
)
|
||||
|
||||
# Maps CTK version -> (min supported cc, max supported cc) inclusive
|
||||
CTK_SUPPORTED = {
|
||||
(11, 2): ((3, 5), (8, 6)),
|
||||
(11, 3): ((3, 5), (8, 6)),
|
||||
(11, 4): ((3, 5), (8, 7)),
|
||||
(11, 5): ((3, 5), (8, 7)),
|
||||
(11, 6): ((3, 5), (8, 7)),
|
||||
(11, 7): ((3, 5), (8, 7)),
|
||||
(11, 8): ((3, 5), (9, 0)),
|
||||
(12, 0): ((5, 0), (9, 0)),
|
||||
(12, 1): ((5, 0), (9, 0)),
|
||||
(12, 2): ((5, 0), (9, 0)),
|
||||
(12, 3): ((5, 0), (9, 0)),
|
||||
(12, 4): ((5, 0), (9, 0)),
|
||||
}
|
||||
|
||||
|
||||
def ccs_supported_by_ctk(ctk_version):
    """Return the tuple of compute capabilities supported by the given CUDA
    toolkit version (a (major, minor) tuple)."""
    try:
        min_cc, max_cc = CTK_SUPPORTED[ctk_version]
    except KeyError:
        # For unsupported CUDA toolkit versions, all we can do is assume all
        # non-deprecated versions we are aware of are supported.
        return tuple(cc for cc in COMPUTE_CAPABILITIES
                     if cc >= config.CUDA_DEFAULT_PTX_CC)
    # For supported versions, we look up the range of supported CCs
    return tuple(cc for cc in COMPUTE_CAPABILITIES
                 if min_cc <= cc <= max_cc)
|
||||
|
||||
|
||||
def get_supported_ccs():
    """Return the tuple of compute capabilities supported by the installed
    CUDA toolkit, or an empty tuple when the runtime version cannot be
    determined or is below the minimum supported version (a warning is
    issued in the latter case)."""
    try:
        from numba.cuda.cudadrv.runtime import runtime
        cudart_version = runtime.get_version()
    except: # noqa: E722
        # We can't support anything if there's an error getting the runtime
        # version (e.g. if it's not present or there's another issue)
        _supported_cc = ()
        return _supported_cc

    # Ensure the minimum CTK version requirement is met
    min_cudart = min(CTK_SUPPORTED)
    if cudart_version < min_cudart:
        _supported_cc = ()
        ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
        unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
                           f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
                           "required version.")
        warnings.warn(unsupported_ver)
        return _supported_cc

    _supported_cc = ccs_supported_by_ctk(cudart_version)
    return _supported_cc
|
||||
|
||||
|
||||
def find_closest_arch(mycc):
    """
    Given a compute capability, return the closest compute capability supported
    by the CUDA toolkit.

    :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
    :return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
    :raises NvvmSupportError: if no CCs are supported at all, or ``mycc`` is
        lower than the minimum supported CC.
    """
    supported_ccs = NVVM().supported_ccs

    if not supported_ccs:
        msg = "No supported GPU compute capabilities found. " \
              "Please check your cudatoolkit version matches your CUDA version."
        raise NvvmSupportError(msg)

    for i, cc in enumerate(supported_ccs):
        if cc == mycc:
            # Matches
            return cc
        elif cc > mycc:
            # Exceeded
            if i == 0:
                # CC lower than supported. Bug fix: the two implicitly
                # concatenated literals previously produced
                # "...not supported(requires..." with no separating space.
                msg = "GPU compute capability %d.%d is not supported " \
                      "(requires >=%d.%d)" % (mycc + cc)
                raise NvvmSupportError(msg)
            else:
                # return the previous CC
                return supported_ccs[i - 1]

    # CC higher than supported
    return supported_ccs[-1]  # Choose the highest
|
||||
|
||||
|
||||
def get_arch_option(major, minor):
    """Matches with the closest architecture option
    """
    # A configured FORCE_CUDA_CC overrides the lookup entirely.
    arch = config.FORCE_CUDA_CC or find_closest_arch((major, minor))
    return 'compute_%d%d' % arch
|
||||
|
||||
|
||||
MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
|
||||
Please ensure you have a CUDA Toolkit 11.2 or higher.
|
||||
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
|
||||
|
||||
$ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
|
||||
|
||||
For CUDA 11, ``cudatoolkit`` is required:
|
||||
|
||||
$ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
|
||||
'''
|
||||
|
||||
|
||||
class LibDevice(object):
    """Loads the libdevice bitcode, caching its contents so the file is read
    at most once per process.

    :raises RuntimeError: if the libdevice file cannot be located.
    """
    # Class-level cache of the libdevice bitcode contents (bytes).
    _cache_ = None

    def __init__(self):
        if self._cache_ is None:
            if get_libdevice() is None:
                raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
            # Bug fix: assign on the class, not the instance. The previous
            # ``self._cache_ = ...`` created an instance attribute that
            # shadowed the class-level cache, so the class cache stayed None
            # and the file was re-read for every new LibDevice instance.
            LibDevice._cache_ = open_libdevice()

        self.bc = self._cache_

    def get(self):
        # Return the cached libdevice bitcode.
        return self.bc
|
||||
|
||||
|
||||
cas_nvvm = """
|
||||
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
|
||||
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
|
||||
""" # noqa: E501
|
||||
|
||||
|
||||
# Translation of code from CUDA Programming Guide v6.5, section B.12
# CAS-loop implementation of an atomic read-modify-write: the stored value is
# reinterpreted as integer type {Ti}, updated with {OP} in type {T}, and
# swapped back with cmpxchg, retrying until no other thread raced us.  The
# pre-update value is returned.
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
    %iptr = bitcast {T}* %ptr to {Ti}*
    %old2 = load volatile {Ti}, {Ti}* %iptr
    br label %attempt

attempt:
    %old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
    %dold = bitcast {Ti} %old to {T}
    %dnew = {OP} {T} %dold, %val
    %new = bitcast {T} %dnew to {Ti}
    {CAS}
    %repeat = icmp ne {Ti} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    %result = bitcast {Ti} %old to {T}
    ret {T} %result
}}
""" # noqa: E501
|
||||
|
||||
ir_numba_atomic_inc_template = """
|
||||
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
|
||||
entry:
|
||||
%old2 = load volatile {T}, {T}* %iptr
|
||||
br label %attempt
|
||||
|
||||
attempt:
|
||||
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
|
||||
%bndchk = icmp ult {T} %old, %val
|
||||
%inc = add {T} %old, 1
|
||||
%new = select i1 %bndchk, {T} %inc, {T} 0
|
||||
{CAS}
|
||||
%repeat = icmp ne {T} %cas, %old
|
||||
br i1 %repeat, label %attempt, label %done
|
||||
|
||||
done:
|
||||
ret {T} %old
|
||||
}}
|
||||
""" # noqa: E501
|
||||
|
||||
ir_numba_atomic_dec_template = """
|
||||
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
|
||||
entry:
|
||||
%old2 = load volatile {T}, {T}* %iptr
|
||||
br label %attempt
|
||||
|
||||
attempt:
|
||||
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
|
||||
%dec = add {T} %old, -1
|
||||
%bndchk = icmp ult {T} %dec, %val
|
||||
%new = select i1 %bndchk, {T} %dec, {T} %val
|
||||
{CAS}
|
||||
%repeat = icmp ne {T} %cas, %old
|
||||
br i1 %repeat, label %attempt, label %done
|
||||
|
||||
done:
|
||||
ret {T} %old
|
||||
}}
|
||||
""" # noqa: E501
|
||||
|
||||
ir_numba_atomic_minmax_template = """
|
||||
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
|
||||
entry:
|
||||
%ptrval = load volatile {T}, {T}* %ptr
|
||||
; Return early when:
|
||||
; - For nanmin / nanmax when val is a NaN
|
||||
; - For min / max when val or ptr is a NaN
|
||||
%early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
|
||||
br i1 %early_return, label %done, label %lt_check
|
||||
|
||||
lt_check:
|
||||
%dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
|
||||
; Continue attempts if dold less or greater than val (depending on whether min or max)
|
||||
; or if dold is NaN (for nanmin / nanmax)
|
||||
%cmp = fcmp {OP} {T} %dold, %val
|
||||
br i1 %cmp, label %attempt, label %done
|
||||
|
||||
attempt:
|
||||
; Attempt to swap in the value
|
||||
%old = bitcast {T} %dold to {Ti}
|
||||
%iptr = bitcast {T}* %ptr to {Ti}*
|
||||
%new = bitcast {T} %val to {Ti}
|
||||
{CAS}
|
||||
%dcas = bitcast {Ti} %cas to {T}
|
||||
br label %lt_check
|
||||
|
||||
done:
|
||||
ret {T} %ptrval
|
||||
}}
|
||||
""" # noqa: E501
|
||||
|
||||
|
||||
def ir_cas(Ti):
    """Return the compare-and-swap IR fragment specialized for type *Ti*."""
    return cas_nvvm.format(Ti=Ti)
|
||||
|
||||
|
||||
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
    """Render the generic atomic binary-op IR for the given type and op."""
    return ir_numba_atomic_binary_template.format(
        T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
|
||||
|
||||
|
||||
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
    """Render the atomic min/max (or nanmin/nanmax) IR for the given type."""
    return ir_numba_atomic_minmax_template.format(
        T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, FUNC=FUNC,
        CAS=ir_cas(Ti))
|
||||
|
||||
|
||||
def ir_numba_atomic_inc(T, Tu):
    """Render the atomic-increment IR for type *T* (unsigned name *Tu*)."""
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_inc_template.format(**params)
|
||||
|
||||
|
||||
def ir_numba_atomic_dec(T, Tu):
    """Render the atomic-decrement IR for type *T* (unsigned name *Tu*)."""
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_dec_template.format(**params)
|
||||
|
||||
|
||||
def llvm_replace(llvmir):
    """Specialize atomic-helper declarations in a module's LLVM IR.

    Declarations of Numba's ``___numba_atomic_*`` helpers are replaced, by
    exact textual match, with implementations rendered from the IR templates
    above; the result is then rewritten for the older LLVM dialect via
    llvm140_to_70_ir.

    :param llvmir: LLVM IR of a module, as a string.
    :return: The transformed IR string.
    """
    # (declaration, implementation) pairs.  The declaration strings must
    # match the code generator's output byte-for-byte for the textual
    # replacement below to take effect.
    replacements = [
        ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
        ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501
         ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
        ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
        ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
         ir_numba_atomic_inc(T='i64', Tu='u64')),
        ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
         ir_numba_atomic_dec(T='i64', Tu='u64')),
        ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        # ``immarg`` is stripped everywhere — presumably unsupported by the
        # older LLVM dialect targeted here (see llvm140_to_70_ir); confirm
        # against the NVVM IR spec if changing.
        ('immarg', '')
    ]

    for decl, fn in replacements:
        llvmir = llvmir.replace(decl, fn)

    llvmir = llvm140_to_70_ir(llvmir)

    return llvmir
|
||||
|
||||
|
||||
def compile_ir(llvmir, **opts):
    """Compile LLVM IR (one string, or a list of strings) with NVVM.

    The atomic-helper declarations in each module are first specialized via
    llvm_replace, libdevice is lazily linked in, and the remaining keyword
    options are forwarded to the compilation unit.

    :param llvmir: A string, or list of strings, of LLVM IR modules.
    :return: The result of ``CompilationUnit.compile``.
    """
    modules = [llvmir] if isinstance(llvmir, str) else llvmir

    # ``fastmath`` is a convenience flag that expands into NVVM's individual
    # fast-math options.
    if opts.pop('fastmath', False):
        opts['ftz'] = True
        opts['fma'] = True
        opts['prec_div'] = False
        opts['prec_sqrt'] = False

    cu = CompilationUnit()
    libdevice = LibDevice()

    for mod in modules:
        cu.add_module(llvm_replace(mod).encode('utf8'))
    cu.lazy_add_module(libdevice.get())

    return cu.compile(**opts)
|
||||
|
||||
|
||||
# Matches an LLVM attribute-group definition line such as
# ``attributes #0 = { alwaysinline willreturn }``; group(1) captures the
# space-separated attribute list.
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
|
||||
|
||||
|
||||
def llvm140_to_70_ir(ir):
    """
    Convert LLVM 14.0 IR for LLVM 7.0.

    Presently this strips only the ``willreturn`` attribute from attribute
    group definitions, which LLVM 7.0 does not accept.

    :param ir: LLVM IR as a string.
    :return: The converted IR string.
    """
    buf = []
    for line in ir.splitlines():
        if line.startswith('attributes #'):
            # Remove function attributes unsupported by LLVM 7.0
            m = re_attributes_def.match(line)
            # Guard against lines the pattern does not recognise (e.g.
            # string attributes like "frame-pointer"="all"): the original
            # unconditionally called m.group(1) and raised AttributeError
            # on a non-match.  Such lines are passed through unchanged.
            if m is not None:
                attrs = m.group(1).split()
                attrs = ' '.join(a for a in attrs if a != 'willreturn')
                line = line.replace(m.group(1), attrs)

        buf.append(line)

    return '\n'.join(buf)
|
||||
|
||||
|
||||
def set_cuda_kernel(function):
    """
    Mark a function as a CUDA kernel. Kernels have the following requirements:

    - Metadata that marks them as a kernel.
    - Addition to the @llvm.used list, so that they will not be discarded.
    - The noinline attribute is not permitted, because this causes NVVM to emit
      a warning, which counts as failing IR verification.

    Presently it is assumed that there is one kernel per module, which holds
    for Numba-jitted functions. If this changes in future or this function is
    to be used externally, this function may need modification to add to the
    @llvm.used list rather than creating it.
    """
    mod = function.module

    # Tag the function as a kernel through an nvvm.annotations entry.
    annotation = mod.add_metadata((
        function,
        ir.MetaDataString(mod, "kernel"),
        ir.Constant(ir.IntType(32), 1),
    ))
    named_md = cgutils.get_or_insert_named_metadata(mod, 'nvvm.annotations')
    named_md.add(annotation)

    # Keep the kernel alive by placing a pointer to it in @llvm.used.
    i8p = ir.IntType(8).as_pointer()
    used_ty = ir.ArrayType(i8p, 1)

    llvm_used = ir.GlobalVariable(mod, used_ty, 'llvm.used')
    llvm_used.linkage = 'appending'
    llvm_used.section = 'llvm.metadata'
    llvm_used.initializer = ir.Constant(used_ty, [function.bitcast(i8p)])

    # Drop 'noinline' if present, as NVVM warns on noinline kernels.
    function.attributes.discard('noinline')
|
||||
|
||||
|
||||
def add_ir_version(mod):
    """Add NVVM IR version metadata to *mod*."""
    # The declared IR version must match the version the current NVVM
    # library reports.
    i32 = ir.IntType(32)
    versions = [i32(v) for v in NVVM().get_ir_version()]
    mod.add_named_metadata('nvvmir.version', mod.add_metadata(versions))
|
||||
@@ -0,0 +1,10 @@
|
||||
"""
|
||||
Declarations of the Runtime API functions.
|
||||
"""
|
||||
|
||||
from ctypes import c_int, POINTER
|
||||
|
||||
# Maps a Runtime API function name to a tuple of
# (restype, argtype_1, ..., argtype_n), consumed by Runtime.__getattr__.
API_PROTOTYPES = {
    # cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion )
    'cudaRuntimeGetVersion': (c_int, POINTER(c_int)),
}
|
||||
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
CUDA Runtime wrapper.
|
||||
|
||||
This provides a very minimal set of bindings, since the Runtime API is not
|
||||
really used in Numba except for querying the Runtime version.
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import functools
|
||||
import sys
|
||||
|
||||
from numba.core import config
|
||||
from numba.cuda.cudadrv.driver import ERROR_MAP, make_logger
|
||||
from numba.cuda.cudadrv.error import CudaSupportError, CudaRuntimeError
|
||||
from numba.cuda.cudadrv.libs import open_cudalib
|
||||
from numba.cuda.cudadrv.rtapi import API_PROTOTYPES
|
||||
from numba.cuda.cudadrv import enums
|
||||
|
||||
|
||||
class CudaRuntimeAPIError(CudaRuntimeError):
    """
    Raised when there is an error accessing a C API from the CUDA Runtime.
    """
    def __init__(self, code, msg):
        super().__init__(code, msg)
        # Keep the raw error code and message available to handlers.
        self.code = code
        self.msg = msg

    def __str__(self):
        return "[%s] %s" % (self.code, self.msg)
|
||||
|
||||
|
||||
class Runtime:
    """
    Runtime object that lazily binds runtime API functions.

    Functions named in API_PROTOTYPES are bound on first attribute access
    (see ``__getattr__``); the library itself is loaded on the first such
    call.
    """

    def __init__(self):
        # Deferred until the first API call, so importing this module does
        # not require a working CUDA installation.
        self.is_initialized = False

    def _initialize(self):
        """Load the ``cudart`` library and set up logging.

        :raises CudaSupportError: if CUDA is disabled via configuration.
        """
        # lazily initialize logger
        global _logger
        _logger = make_logger()

        if config.DISABLE_CUDA:
            msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
                   "in the environment, or because CUDA is unsupported on "
                   "32-bit systems.")
            raise CudaSupportError(msg)
        self.lib = open_cudalib('cudart')

        self.is_initialized = True

    def __getattr__(self, fname):
        """Bind and return the runtime API function *fname*.

        The bound, error-checked wrapper is cached on the instance via
        ``setattr``, so ``__getattr__`` only runs on the first access.

        :raises AttributeError: if *fname* is not a known prototype.
        """
        # First request of a runtime API function
        try:
            proto = API_PROTOTYPES[fname]
        except KeyError:
            raise AttributeError(fname)
        restype = proto[0]
        argtypes = proto[1:]

        if not self.is_initialized:
            self._initialize()

        # Find function in runtime library
        libfn = self._find_api(fname)
        libfn.restype = restype
        libfn.argtypes = argtypes

        safe_call = self._wrap_api_call(fname, libfn)
        setattr(self, fname, safe_call)
        return safe_call

    def _wrap_api_call(self, fname, libfn):
        """Return a wrapper around *libfn* that checks its return code."""
        @functools.wraps(libfn)
        def safe_cuda_api_call(*args):
            _logger.debug('call runtime api: %s', libfn.__name__)
            retcode = libfn(*args)
            self._check_error(fname, retcode)
        return safe_cuda_api_call

    def _check_error(self, fname, retcode):
        """Raise CudaRuntimeAPIError if *retcode* signals a failure."""
        if retcode != enums.CUDA_SUCCESS:
            errname = ERROR_MAP.get(retcode, "cudaErrorUnknown")
            msg = "Call to %s results in %s" % (fname, errname)
            _logger.error(msg)
            raise CudaRuntimeAPIError(retcode, msg)

    def _find_api(self, fname):
        """Look up *fname* in the loaded library.

        If the symbol is absent, a placeholder that raises on invocation is
        cached and returned, so the error surfaces at call time rather than
        at binding time.
        """
        try:
            return getattr(self.lib, fname)
        except AttributeError:
            pass

        # Not found.
        # Delay missing function error to use
        def absent_function(*args, **kws):
            msg = "runtime missing function: %s."
            raise CudaRuntimeError(msg % fname)

        setattr(self, fname, absent_function)
        return absent_function

    def get_version(self):
        """
        Returns the CUDA Runtime version as a tuple (major, minor).
        """
        rtver = ctypes.c_int()
        self.cudaRuntimeGetVersion(ctypes.byref(rtver))
        # The version is encoded as (1000 * major) + (10 * minor)
        major = rtver.value // 1000
        minor = (rtver.value - (major * 1000)) // 10
        return (major, minor)

    def is_supported_version(self):
        """
        Returns True if the CUDA Runtime is a supported version.
        """

        return self.get_version() in self.supported_versions

    @property
    def supported_versions(self):
        """A tuple of all supported CUDA toolkit versions. Versions are given in
        the form ``(major_version, minor_version)``."""
        if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64:
            # Only 64-bit Linux and Windows are supported
            return ()
        return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
                (11, 7))
|
||||
|
||||
|
||||
# Module-level singleton; the Runtime binds the library lazily, so creating
# it here performs no CUDA work at import time.
runtime = Runtime()
|
||||
|
||||
|
||||
def get_version():
    """Return the CUDA Runtime version as a ``(major, minor)`` tuple.

    Thin module-level convenience wrapper around the singleton ``runtime``
    instance.
    """
    return runtime.get_version()
|
||||
Reference in New Issue
Block a user