Videre
This commit is contained in:
@@ -0,0 +1,904 @@
|
||||
"""
|
||||
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
|
||||
on the object. If it exists and evaluate to True, it must define shape,
|
||||
strides, dtype and size attributes similar to a NumPy ndarray.
|
||||
"""
|
||||
|
||||
import math
|
||||
import functools
|
||||
import operator
|
||||
import copy
|
||||
from ctypes import c_void_p
|
||||
|
||||
import numpy as np
|
||||
|
||||
import numba
|
||||
from numba import _devicearray
|
||||
from numba.cuda.cudadrv import devices, dummyarray
|
||||
from numba.cuda.cudadrv import driver as _driver
|
||||
from numba.core import types, config
|
||||
from numba.np.unsafe.ndarray import to_fixed_tuple
|
||||
from numba.np.numpy_support import numpy_version
|
||||
from numba.np import numpy_support
|
||||
from numba.cuda.api_util import prepare_shape_strides_dtype
|
||||
from numba.core.errors import NumbaPerformanceWarning
|
||||
from warnings import warn
|
||||
|
||||
try:
|
||||
lru_cache = getattr(functools, 'lru_cache')(None)
|
||||
except AttributeError:
|
||||
# Python 3.1 or lower
|
||||
def lru_cache(func):
|
||||
return func
|
||||
|
||||
|
||||
def is_cuda_ndarray(obj):
|
||||
"Check if an object is a CUDA ndarray"
|
||||
return getattr(obj, '__cuda_ndarray__', False)
|
||||
|
||||
|
||||
def verify_cuda_ndarray_interface(obj):
|
||||
"Verify the CUDA ndarray interface for an obj"
|
||||
require_cuda_ndarray(obj)
|
||||
|
||||
def requires_attr(attr, typ):
|
||||
if not hasattr(obj, attr):
|
||||
raise AttributeError(attr)
|
||||
if not isinstance(getattr(obj, attr), typ):
|
||||
raise AttributeError('%s must be of type %s' % (attr, typ))
|
||||
|
||||
requires_attr('shape', tuple)
|
||||
requires_attr('strides', tuple)
|
||||
requires_attr('dtype', np.dtype)
|
||||
requires_attr('size', int)
|
||||
|
||||
|
||||
def require_cuda_ndarray(obj):
|
||||
"Raises ValueError is is_cuda_ndarray(obj) evaluates False"
|
||||
if not is_cuda_ndarray(obj):
|
||||
raise ValueError('require an cuda ndarray object')
|
||||
|
||||
|
||||
class DeviceNDArrayBase(_devicearray.DeviceArray):
|
||||
"""A on GPU NDArray representation
|
||||
"""
|
||||
__cuda_memory__ = True
|
||||
__cuda_ndarray__ = True # There must be gpu_data attribute
|
||||
|
||||
def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
|
||||
"""
|
||||
Args
|
||||
----
|
||||
|
||||
shape
|
||||
array shape.
|
||||
strides
|
||||
array strides.
|
||||
dtype
|
||||
data type as np.dtype coercible object.
|
||||
stream
|
||||
cuda stream.
|
||||
gpu_data
|
||||
user provided device memory for the ndarray data buffer
|
||||
"""
|
||||
if isinstance(shape, int):
|
||||
shape = (shape,)
|
||||
if isinstance(strides, int):
|
||||
strides = (strides,)
|
||||
dtype = np.dtype(dtype)
|
||||
self.ndim = len(shape)
|
||||
if len(strides) != self.ndim:
|
||||
raise ValueError('strides not match ndim')
|
||||
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
|
||||
dtype.itemsize)
|
||||
self.shape = tuple(shape)
|
||||
self.strides = tuple(strides)
|
||||
self.dtype = dtype
|
||||
self.size = int(functools.reduce(operator.mul, self.shape, 1))
|
||||
# prepare gpu memory
|
||||
if self.size > 0:
|
||||
if gpu_data is None:
|
||||
self.alloc_size = _driver.memory_size_from_info(
|
||||
self.shape, self.strides, self.dtype.itemsize)
|
||||
gpu_data = devices.get_context().memalloc(self.alloc_size)
|
||||
else:
|
||||
self.alloc_size = _driver.device_memory_size(gpu_data)
|
||||
else:
|
||||
# Make NULL pointer for empty allocation
|
||||
if _driver.USE_NV_BINDING:
|
||||
null = _driver.binding.CUdeviceptr(0)
|
||||
else:
|
||||
null = c_void_p(0)
|
||||
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
|
||||
pointer=null, size=0)
|
||||
self.alloc_size = 0
|
||||
|
||||
self.gpu_data = gpu_data
|
||||
self.stream = stream
|
||||
|
||||
@property
|
||||
def __cuda_array_interface__(self):
|
||||
if _driver.USE_NV_BINDING:
|
||||
if self.device_ctypes_pointer is not None:
|
||||
ptr = int(self.device_ctypes_pointer)
|
||||
else:
|
||||
ptr = 0
|
||||
else:
|
||||
if self.device_ctypes_pointer.value is not None:
|
||||
ptr = self.device_ctypes_pointer.value
|
||||
else:
|
||||
ptr = 0
|
||||
|
||||
return {
|
||||
'shape': tuple(self.shape),
|
||||
'strides': None if is_contiguous(self) else tuple(self.strides),
|
||||
'data': (ptr, False),
|
||||
'typestr': self.dtype.str,
|
||||
'stream': int(self.stream) if self.stream != 0 else None,
|
||||
'version': 3,
|
||||
}
|
||||
|
||||
def bind(self, stream=0):
|
||||
"""Bind a CUDA stream to this object so that all subsequent operation
|
||||
on this array defaults to the given stream.
|
||||
"""
|
||||
clone = copy.copy(self)
|
||||
clone.stream = stream
|
||||
return clone
|
||||
|
||||
@property
|
||||
def T(self):
|
||||
return self.transpose()
|
||||
|
||||
def transpose(self, axes=None):
|
||||
if axes and tuple(axes) == tuple(range(self.ndim)):
|
||||
return self
|
||||
elif self.ndim != 2:
|
||||
msg = "transposing a non-2D DeviceNDArray isn't supported"
|
||||
raise NotImplementedError(msg)
|
||||
elif axes is not None and set(axes) != set(range(self.ndim)):
|
||||
raise ValueError("invalid axes list %r" % (axes,))
|
||||
else:
|
||||
from numba.cuda.kernels.transpose import transpose
|
||||
return transpose(self)
|
||||
|
||||
def _default_stream(self, stream):
|
||||
return self.stream if not stream else stream
|
||||
|
||||
@property
|
||||
def _numba_type_(self):
|
||||
"""
|
||||
Magic attribute expected by Numba to get the numba type that
|
||||
represents this object.
|
||||
"""
|
||||
# Typing considerations:
|
||||
#
|
||||
# 1. The preference is to use 'C' or 'F' layout since this enables
|
||||
# hardcoding stride values into compiled kernels, which is more
|
||||
# efficient than storing a passed-in value in a register.
|
||||
#
|
||||
# 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
|
||||
# the more likely / common case.
|
||||
#
|
||||
# 3. If an array is broadcast then it must be typed as 'A' - using 'C'
|
||||
# or 'F' does not apply for broadcast arrays, because the strides, some
|
||||
# of which will be 0, will not match those hardcoded in for 'C' or 'F'
|
||||
# layouts.
|
||||
|
||||
broadcast = 0 in self.strides
|
||||
if self.flags['C_CONTIGUOUS'] and not broadcast:
|
||||
layout = 'C'
|
||||
elif self.flags['F_CONTIGUOUS'] and not broadcast:
|
||||
layout = 'F'
|
||||
else:
|
||||
layout = 'A'
|
||||
|
||||
dtype = numpy_support.from_dtype(self.dtype)
|
||||
return types.Array(dtype, self.ndim, layout)
|
||||
|
||||
@property
|
||||
def device_ctypes_pointer(self):
|
||||
"""Returns the ctypes pointer to the GPU data buffer
|
||||
"""
|
||||
if self.gpu_data is None:
|
||||
if _driver.USE_NV_BINDING:
|
||||
return _driver.binding.CUdeviceptr(0)
|
||||
else:
|
||||
return c_void_p(0)
|
||||
else:
|
||||
return self.gpu_data.device_ctypes_pointer
|
||||
|
||||
@devices.require_context
|
||||
def copy_to_device(self, ary, stream=0):
|
||||
"""Copy `ary` to `self`.
|
||||
|
||||
If `ary` is a CUDA memory, perform a device-to-device transfer.
|
||||
Otherwise, perform a a host-to-device transfer.
|
||||
"""
|
||||
if ary.size == 0:
|
||||
# Nothing to do
|
||||
return
|
||||
|
||||
sentry_contiguous(self)
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
self_core, ary_core = array_core(self), array_core(ary)
|
||||
if _driver.is_device_memory(ary):
|
||||
sentry_contiguous(ary)
|
||||
check_array_compatibility(self_core, ary_core)
|
||||
_driver.device_to_device(self, ary, self.alloc_size, stream=stream)
|
||||
else:
|
||||
# Ensure same contiguity. Only makes a host-side copy if necessary
|
||||
# (i.e., in order to materialize a writable strided view)
|
||||
ary_core = np.array(
|
||||
ary_core,
|
||||
order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
|
||||
subok=True,
|
||||
copy=(not ary_core.flags['WRITEABLE'])
|
||||
if numpy_version < (2, 0) else None)
|
||||
check_array_compatibility(self_core, ary_core)
|
||||
_driver.host_to_device(self, ary_core, self.alloc_size,
|
||||
stream=stream)
|
||||
|
||||
@devices.require_context
|
||||
def copy_to_host(self, ary=None, stream=0):
|
||||
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
|
||||
if ``ary`` is ``None``.
|
||||
|
||||
If a CUDA ``stream`` is given, then the transfer will be made
|
||||
asynchronously as part as the given stream. Otherwise, the transfer is
|
||||
synchronous: the function returns after the copy is finished.
|
||||
|
||||
Always returns the host array.
|
||||
|
||||
Example::
|
||||
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
|
||||
arr = np.arange(1000)
|
||||
d_arr = cuda.to_device(arr)
|
||||
|
||||
my_kernel[100, 100](d_arr)
|
||||
|
||||
result_array = d_arr.copy_to_host()
|
||||
"""
|
||||
if any(s < 0 for s in self.strides):
|
||||
msg = 'D->H copy not implemented for negative strides: {}'
|
||||
raise NotImplementedError(msg.format(self.strides))
|
||||
assert self.alloc_size >= 0, "Negative memory size"
|
||||
stream = self._default_stream(stream)
|
||||
if ary is None:
|
||||
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
|
||||
else:
|
||||
check_array_compatibility(self, ary)
|
||||
hostary = ary
|
||||
|
||||
if self.alloc_size != 0:
|
||||
_driver.device_to_host(hostary, self, self.alloc_size,
|
||||
stream=stream)
|
||||
|
||||
if ary is None:
|
||||
if self.size == 0:
|
||||
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
|
||||
buffer=hostary)
|
||||
else:
|
||||
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
|
||||
strides=self.strides, buffer=hostary)
|
||||
return hostary
|
||||
|
||||
def split(self, section, stream=0):
|
||||
"""Split the array into equal partition of the `section` size.
|
||||
If the array cannot be equally divided, the last section will be
|
||||
smaller.
|
||||
"""
|
||||
stream = self._default_stream(stream)
|
||||
if self.ndim != 1:
|
||||
raise ValueError("only support 1d array")
|
||||
if self.strides[0] != self.dtype.itemsize:
|
||||
raise ValueError("only support unit stride")
|
||||
nsect = int(math.ceil(float(self.size) / section))
|
||||
strides = self.strides
|
||||
itemsize = self.dtype.itemsize
|
||||
for i in range(nsect):
|
||||
begin = i * section
|
||||
end = min(begin + section, self.size)
|
||||
shape = (end - begin,)
|
||||
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
|
||||
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
|
||||
gpu_data=gpu_data)
|
||||
|
||||
def as_cuda_arg(self):
|
||||
"""Returns a device memory object that is used as the argument.
|
||||
"""
|
||||
return self.gpu_data
|
||||
|
||||
def get_ipc_handle(self):
|
||||
"""
|
||||
Returns a *IpcArrayHandle* object that is safe to serialize and transfer
|
||||
to another process to share the local allocation.
|
||||
|
||||
Note: this feature is only available on Linux.
|
||||
"""
|
||||
ipch = devices.get_context().get_ipc_handle(self.gpu_data)
|
||||
desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
|
||||
return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
|
||||
|
||||
def squeeze(self, axis=None, stream=0):
|
||||
"""
|
||||
Remove axes of size one from the array shape.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
axis : None or int or tuple of ints, optional
|
||||
Subset of dimensions to remove. A `ValueError` is raised if an axis
|
||||
with size greater than one is selected. If `None`, all axes with
|
||||
size one are removed.
|
||||
stream : cuda stream or 0, optional
|
||||
Default stream for the returned view of the array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DeviceNDArray
|
||||
Squeezed view into the array.
|
||||
|
||||
"""
|
||||
new_dummy, _ = self._dummy.squeeze(axis=axis)
|
||||
return DeviceNDArray(
|
||||
shape=new_dummy.shape,
|
||||
strides=new_dummy.strides,
|
||||
dtype=self.dtype,
|
||||
stream=self._default_stream(stream),
|
||||
gpu_data=self.gpu_data,
|
||||
)
|
||||
|
||||
def view(self, dtype):
|
||||
"""Returns a new object by reinterpretting the dtype without making a
|
||||
copy of the data.
|
||||
"""
|
||||
dtype = np.dtype(dtype)
|
||||
shape = list(self.shape)
|
||||
strides = list(self.strides)
|
||||
|
||||
if self.dtype.itemsize != dtype.itemsize:
|
||||
if not self.is_c_contiguous():
|
||||
raise ValueError(
|
||||
"To change to a dtype of a different size,"
|
||||
" the array must be C-contiguous"
|
||||
)
|
||||
|
||||
shape[-1], rem = divmod(
|
||||
shape[-1] * self.dtype.itemsize,
|
||||
dtype.itemsize
|
||||
)
|
||||
|
||||
if rem != 0:
|
||||
raise ValueError(
|
||||
"When changing to a larger dtype,"
|
||||
" its size must be a divisor of the total size in bytes"
|
||||
" of the last axis of the array."
|
||||
)
|
||||
|
||||
strides[-1] = dtype.itemsize
|
||||
|
||||
return DeviceNDArray(
|
||||
shape=shape,
|
||||
strides=strides,
|
||||
dtype=dtype,
|
||||
stream=self.stream,
|
||||
gpu_data=self.gpu_data,
|
||||
)
|
||||
|
||||
@property
|
||||
def nbytes(self):
|
||||
# Note: not using `alloc_size`. `alloc_size` reports memory
|
||||
# consumption of the allocation, not the size of the array
|
||||
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
|
||||
return self.dtype.itemsize * self.size
|
||||
|
||||
|
||||
class DeviceRecord(DeviceNDArrayBase):
|
||||
'''
|
||||
An on-GPU record type
|
||||
'''
|
||||
def __init__(self, dtype, stream=0, gpu_data=None):
|
||||
shape = ()
|
||||
strides = ()
|
||||
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
|
||||
gpu_data)
|
||||
|
||||
@property
|
||||
def flags(self):
|
||||
"""
|
||||
For `numpy.ndarray` compatibility. Ideally this would return a
|
||||
`np.core.multiarray.flagsobj`, but that needs to be constructed
|
||||
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
|
||||
aren't writeable).
|
||||
"""
|
||||
return dict(self._dummy.flags) # defensive copy
|
||||
|
||||
@property
|
||||
def _numba_type_(self):
|
||||
"""
|
||||
Magic attribute expected by Numba to get the numba type that
|
||||
represents this object.
|
||||
"""
|
||||
return numpy_support.from_dtype(self.dtype)
|
||||
|
||||
@devices.require_context
|
||||
def __getitem__(self, item):
|
||||
return self._do_getitem(item)
|
||||
|
||||
@devices.require_context
|
||||
def getitem(self, item, stream=0):
|
||||
"""Do `__getitem__(item)` with CUDA stream
|
||||
"""
|
||||
return self._do_getitem(item, stream)
|
||||
|
||||
def _do_getitem(self, item, stream=0):
|
||||
stream = self._default_stream(stream)
|
||||
typ, offset = self.dtype.fields[item]
|
||||
newdata = self.gpu_data.view(offset)
|
||||
|
||||
if typ.shape == ():
|
||||
if typ.names is not None:
|
||||
return DeviceRecord(dtype=typ, stream=stream,
|
||||
gpu_data=newdata)
|
||||
else:
|
||||
hostary = np.empty(1, dtype=typ)
|
||||
_driver.device_to_host(dst=hostary, src=newdata,
|
||||
size=typ.itemsize,
|
||||
stream=stream)
|
||||
return hostary[0]
|
||||
else:
|
||||
shape, strides, dtype = \
|
||||
prepare_shape_strides_dtype(typ.shape,
|
||||
None,
|
||||
typ.subdtype[0], 'C')
|
||||
return DeviceNDArray(shape=shape, strides=strides,
|
||||
dtype=dtype, gpu_data=newdata,
|
||||
stream=stream)
|
||||
|
||||
@devices.require_context
|
||||
def __setitem__(self, key, value):
|
||||
return self._do_setitem(key, value)
|
||||
|
||||
@devices.require_context
|
||||
def setitem(self, key, value, stream=0):
|
||||
"""Do `__setitem__(key, value)` with CUDA stream
|
||||
"""
|
||||
return self._do_setitem(key, value, stream=stream)
|
||||
|
||||
def _do_setitem(self, key, value, stream=0):
|
||||
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
# If the record didn't have a default stream, and the user didn't
|
||||
# provide a stream, then we will use the default stream for the
|
||||
# assignment kernel and synchronize on it.
|
||||
synchronous = not stream
|
||||
if synchronous:
|
||||
ctx = devices.get_context()
|
||||
stream = ctx.get_default_stream()
|
||||
|
||||
# (1) prepare LHS
|
||||
|
||||
typ, offset = self.dtype.fields[key]
|
||||
newdata = self.gpu_data.view(offset)
|
||||
|
||||
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
|
||||
|
||||
# (2) prepare RHS
|
||||
|
||||
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
|
||||
|
||||
# (3) do the copy
|
||||
|
||||
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
|
||||
|
||||
if synchronous:
|
||||
stream.synchronize()
|
||||
|
||||
|
||||
@lru_cache
|
||||
def _assign_kernel(ndim):
|
||||
"""
|
||||
A separate method so we don't need to compile code every assignment (!).
|
||||
|
||||
:param ndim: We need to have static array sizes for cuda.local.array, so
|
||||
bake in the number of dimensions into the kernel
|
||||
"""
|
||||
from numba import cuda # circular!
|
||||
|
||||
if ndim == 0:
|
||||
# the (2, ndim) allocation below is not yet supported, so avoid it
|
||||
@cuda.jit
|
||||
def kernel(lhs, rhs):
|
||||
lhs[()] = rhs[()]
|
||||
return kernel
|
||||
|
||||
@cuda.jit
|
||||
def kernel(lhs, rhs):
|
||||
location = cuda.grid(1)
|
||||
|
||||
n_elements = 1
|
||||
for i in range(lhs.ndim):
|
||||
n_elements *= lhs.shape[i]
|
||||
if location >= n_elements:
|
||||
# bake n_elements into the kernel, better than passing it in
|
||||
# as another argument.
|
||||
return
|
||||
|
||||
# [0, :] is the to-index (into `lhs`)
|
||||
# [1, :] is the from-index (into `rhs`)
|
||||
idx = cuda.local.array(
|
||||
shape=(2, ndim),
|
||||
dtype=types.int64)
|
||||
|
||||
for i in range(ndim - 1, -1, -1):
|
||||
idx[0, i] = location % lhs.shape[i]
|
||||
idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
|
||||
location //= lhs.shape[i]
|
||||
|
||||
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
|
||||
return kernel
|
||||
|
||||
|
||||
class DeviceNDArray(DeviceNDArrayBase):
|
||||
'''
|
||||
An on-GPU array type
|
||||
'''
|
||||
def is_f_contiguous(self):
|
||||
'''
|
||||
Return true if the array is Fortran-contiguous.
|
||||
'''
|
||||
return self._dummy.is_f_contig
|
||||
|
||||
@property
|
||||
def flags(self):
|
||||
"""
|
||||
For `numpy.ndarray` compatibility. Ideally this would return a
|
||||
`np.core.multiarray.flagsobj`, but that needs to be constructed
|
||||
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
|
||||
aren't writeable).
|
||||
"""
|
||||
return dict(self._dummy.flags) # defensive copy
|
||||
|
||||
def is_c_contiguous(self):
|
||||
'''
|
||||
Return true if the array is C-contiguous.
|
||||
'''
|
||||
return self._dummy.is_c_contig
|
||||
|
||||
def __array__(self, dtype=None):
|
||||
"""
|
||||
:return: an `numpy.ndarray`, so copies to the host.
|
||||
"""
|
||||
if dtype:
|
||||
return self.copy_to_host().__array__(dtype)
|
||||
else:
|
||||
return self.copy_to_host().__array__()
|
||||
|
||||
def __len__(self):
|
||||
return self.shape[0]
|
||||
|
||||
def reshape(self, *newshape, **kws):
|
||||
"""
|
||||
Reshape the array without changing its contents, similarly to
|
||||
:meth:`numpy.ndarray.reshape`. Example::
|
||||
|
||||
d_arr = d_arr.reshape(20, 50, order='F')
|
||||
"""
|
||||
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
|
||||
newshape = newshape[0]
|
||||
|
||||
cls = type(self)
|
||||
if newshape == self.shape:
|
||||
# nothing to do
|
||||
return cls(shape=self.shape, strides=self.strides,
|
||||
dtype=self.dtype, gpu_data=self.gpu_data)
|
||||
|
||||
newarr, extents = self._dummy.reshape(*newshape, **kws)
|
||||
|
||||
if extents == [self._dummy.extent]:
|
||||
return cls(shape=newarr.shape, strides=newarr.strides,
|
||||
dtype=self.dtype, gpu_data=self.gpu_data)
|
||||
else:
|
||||
raise NotImplementedError("operation requires copying")
|
||||
|
||||
def ravel(self, order='C', stream=0):
|
||||
'''
|
||||
Flattens a contiguous array without changing its contents, similar to
|
||||
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
|
||||
exception.
|
||||
'''
|
||||
stream = self._default_stream(stream)
|
||||
cls = type(self)
|
||||
newarr, extents = self._dummy.ravel(order=order)
|
||||
|
||||
if extents == [self._dummy.extent]:
|
||||
return cls(shape=newarr.shape, strides=newarr.strides,
|
||||
dtype=self.dtype, gpu_data=self.gpu_data,
|
||||
stream=stream)
|
||||
|
||||
else:
|
||||
raise NotImplementedError("operation requires copying")
|
||||
|
||||
@devices.require_context
|
||||
def __getitem__(self, item):
|
||||
return self._do_getitem(item)
|
||||
|
||||
@devices.require_context
|
||||
def getitem(self, item, stream=0):
|
||||
"""Do `__getitem__(item)` with CUDA stream
|
||||
"""
|
||||
return self._do_getitem(item, stream)
|
||||
|
||||
def _do_getitem(self, item, stream=0):
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
arr = self._dummy.__getitem__(item)
|
||||
extents = list(arr.iter_contiguous_extent())
|
||||
cls = type(self)
|
||||
if len(extents) == 1:
|
||||
newdata = self.gpu_data.view(*extents[0])
|
||||
|
||||
if not arr.is_array:
|
||||
# Check for structured array type (record)
|
||||
if self.dtype.names is not None:
|
||||
return DeviceRecord(dtype=self.dtype, stream=stream,
|
||||
gpu_data=newdata)
|
||||
else:
|
||||
# Element indexing
|
||||
hostary = np.empty(1, dtype=self.dtype)
|
||||
_driver.device_to_host(dst=hostary, src=newdata,
|
||||
size=self._dummy.itemsize,
|
||||
stream=stream)
|
||||
return hostary[0]
|
||||
else:
|
||||
return cls(shape=arr.shape, strides=arr.strides,
|
||||
dtype=self.dtype, gpu_data=newdata, stream=stream)
|
||||
else:
|
||||
newdata = self.gpu_data.view(*arr.extent)
|
||||
return cls(shape=arr.shape, strides=arr.strides,
|
||||
dtype=self.dtype, gpu_data=newdata, stream=stream)
|
||||
|
||||
@devices.require_context
|
||||
def __setitem__(self, key, value):
|
||||
return self._do_setitem(key, value)
|
||||
|
||||
@devices.require_context
|
||||
def setitem(self, key, value, stream=0):
|
||||
"""Do `__setitem__(key, value)` with CUDA stream
|
||||
"""
|
||||
return self._do_setitem(key, value, stream=stream)
|
||||
|
||||
def _do_setitem(self, key, value, stream=0):
|
||||
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
# If the array didn't have a default stream, and the user didn't provide
|
||||
# a stream, then we will use the default stream for the assignment
|
||||
# kernel and synchronize on it.
|
||||
synchronous = not stream
|
||||
if synchronous:
|
||||
ctx = devices.get_context()
|
||||
stream = ctx.get_default_stream()
|
||||
|
||||
# (1) prepare LHS
|
||||
|
||||
arr = self._dummy.__getitem__(key)
|
||||
newdata = self.gpu_data.view(*arr.extent)
|
||||
|
||||
if isinstance(arr, dummyarray.Element):
|
||||
# convert to a 0d array
|
||||
shape = ()
|
||||
strides = ()
|
||||
else:
|
||||
shape = arr.shape
|
||||
strides = arr.strides
|
||||
|
||||
lhs = type(self)(
|
||||
shape=shape,
|
||||
strides=strides,
|
||||
dtype=self.dtype,
|
||||
gpu_data=newdata,
|
||||
stream=stream)
|
||||
|
||||
# (2) prepare RHS
|
||||
|
||||
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
|
||||
if rhs.ndim > lhs.ndim:
|
||||
raise ValueError("Can't assign %s-D array to %s-D self" % (
|
||||
rhs.ndim,
|
||||
lhs.ndim))
|
||||
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
|
||||
# negative indices would not work if rhs.ndim == 0
|
||||
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
|
||||
rhs = rhs.reshape(*rhs_shape)
|
||||
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
|
||||
if r != 1 and l != r:
|
||||
raise ValueError("Can't copy sequence with size %d to array "
|
||||
"axis %d with dimension %d" % ( r, i, l))
|
||||
|
||||
# (3) do the copy
|
||||
|
||||
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
|
||||
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
|
||||
if synchronous:
|
||||
stream.synchronize()
|
||||
|
||||
|
||||
class IpcArrayHandle(object):
|
||||
"""
|
||||
An IPC array handle that can be serialized and transfer to another process
|
||||
in the same machine for share a GPU allocation.
|
||||
|
||||
On the destination process, use the *.open()* method to creates a new
|
||||
*DeviceNDArray* object that shares the allocation from the original process.
|
||||
To release the resources, call the *.close()* method. After that, the
|
||||
destination can no longer use the shared array object. (Note: the
|
||||
underlying weakref to the resource is now dead.)
|
||||
|
||||
This object implements the context-manager interface that calls the
|
||||
*.open()* and *.close()* method automatically::
|
||||
|
||||
with the_ipc_array_handle as ipc_array:
|
||||
# use ipc_array here as a normal gpu array object
|
||||
some_code(ipc_array)
|
||||
# ipc_array is dead at this point
|
||||
"""
|
||||
def __init__(self, ipc_handle, array_desc):
|
||||
self._array_desc = array_desc
|
||||
self._ipc_handle = ipc_handle
|
||||
|
||||
def open(self):
|
||||
"""
|
||||
Returns a new *DeviceNDArray* that shares the allocation from the
|
||||
original process. Must not be used on the original process.
|
||||
"""
|
||||
dptr = self._ipc_handle.open(devices.get_context())
|
||||
return DeviceNDArray(gpu_data=dptr, **self._array_desc)
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
Closes the IPC handle to the array.
|
||||
"""
|
||||
self._ipc_handle.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self.open()
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
self.close()
|
||||
|
||||
|
||||
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
|
||||
"""
|
||||
A host array that uses CUDA mapped memory.
|
||||
"""
|
||||
|
||||
def device_setup(self, gpu_data, stream=0):
|
||||
self.gpu_data = gpu_data
|
||||
self.stream = stream
|
||||
|
||||
|
||||
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
|
||||
"""
|
||||
A host array that uses CUDA managed memory.
|
||||
"""
|
||||
|
||||
def device_setup(self, gpu_data, stream=0):
|
||||
self.gpu_data = gpu_data
|
||||
self.stream = stream
|
||||
|
||||
|
||||
def from_array_like(ary, stream=0, gpu_data=None):
|
||||
"Create a DeviceNDArray object that is like ary."
|
||||
return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
|
||||
gpu_data=gpu_data)
|
||||
|
||||
|
||||
def from_record_like(rec, stream=0, gpu_data=None):
|
||||
"Create a DeviceRecord object that is like rec."
|
||||
return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
|
||||
|
||||
|
||||
def array_core(ary):
|
||||
"""
|
||||
Extract the repeated core of a broadcast array.
|
||||
|
||||
Broadcast arrays are by definition non-contiguous due to repeated
|
||||
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
|
||||
contiguity and copy the underlying data from such arrays, we must create
|
||||
a view without the repeated dimensions.
|
||||
|
||||
"""
|
||||
if not ary.strides or not ary.size:
|
||||
return ary
|
||||
core_index = []
|
||||
for stride in ary.strides:
|
||||
core_index.append(0 if stride == 0 else slice(None))
|
||||
return ary[tuple(core_index)]
|
||||
|
||||
|
||||
def is_contiguous(ary):
|
||||
"""
|
||||
Returns True iff `ary` is C-style contiguous while ignoring
|
||||
broadcasted and 1-sized dimensions.
|
||||
As opposed to array_core(), it does not call require_context(),
|
||||
which can be quite expensive.
|
||||
"""
|
||||
size = ary.dtype.itemsize
|
||||
for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
|
||||
if shape > 1 and stride != 0:
|
||||
if size != stride:
|
||||
return False
|
||||
size *= shape
|
||||
return True
|
||||
|
||||
|
||||
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
|
||||
"be transferred as a single memory region. Please "
|
||||
"ensure contiguous buffer with numpy "
|
||||
".ascontiguousarray()")
|
||||
|
||||
|
||||
def sentry_contiguous(ary):
|
||||
core = array_core(ary)
|
||||
if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
|
||||
raise ValueError(errmsg_contiguous_buffer)
|
||||
|
||||
|
||||
def auto_device(obj, stream=0, copy=True, user_explicit=False):
|
||||
"""
|
||||
Create a DeviceRecord or DeviceArray like obj and optionally copy data from
|
||||
host to device. If obj already represents device memory, it is returned and
|
||||
no copy is made.
|
||||
"""
|
||||
if _driver.is_device_memory(obj):
|
||||
return obj, False
|
||||
elif hasattr(obj, '__cuda_array_interface__'):
|
||||
return numba.cuda.as_cuda_array(obj), False
|
||||
else:
|
||||
if isinstance(obj, np.void):
|
||||
devobj = from_record_like(obj, stream=stream)
|
||||
else:
|
||||
# This allows you to pass non-array objects like constants and
|
||||
# objects implementing the array interface
|
||||
# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
|
||||
# into this function (with no overhead -- copies -- for `obj`s
|
||||
# that are already `ndarray`s.
|
||||
obj = np.array(
|
||||
obj,
|
||||
copy=False if numpy_version < (2, 0) else None,
|
||||
subok=True)
|
||||
sentry_contiguous(obj)
|
||||
devobj = from_array_like(obj, stream=stream)
|
||||
if copy:
|
||||
if config.CUDA_WARN_ON_IMPLICIT_COPY:
|
||||
if (
|
||||
not user_explicit and
|
||||
(not isinstance(obj, DeviceNDArray)
|
||||
and isinstance(obj, np.ndarray))
|
||||
):
|
||||
msg = ("Host array used in CUDA kernel will incur "
|
||||
"copy overhead to/from device.")
|
||||
warn(NumbaPerformanceWarning(msg))
|
||||
devobj.copy_to_device(obj, stream=stream)
|
||||
return devobj, True
|
||||
|
||||
|
||||
def check_array_compatibility(ary1, ary2):
|
||||
ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
|
||||
if ary1.dtype != ary2.dtype:
|
||||
raise TypeError('incompatible dtype: %s vs. %s' %
|
||||
(ary1.dtype, ary2.dtype))
|
||||
if ary1sq.shape != ary2sq.shape:
|
||||
raise ValueError('incompatible shape: %s vs. %s' %
|
||||
(ary1.shape, ary2.shape))
|
||||
# We check strides only if the size is nonzero, because strides are
|
||||
# irrelevant (and can differ) for zero-length copies.
|
||||
if ary1.size and ary1sq.strides != ary2sq.strides:
|
||||
raise ValueError('incompatible strides: %s vs. %s' %
|
||||
(ary1.strides, ary2.strides))
|
||||
Reference in New Issue
Block a user