Videre
This commit is contained in:
@@ -0,0 +1,38 @@
|
||||
import sys
|
||||
|
||||
from .api import *
|
||||
from .vector_types import vector_types
|
||||
from .reduction import Reduce
|
||||
from .cudadrv.devicearray import (device_array, device_array_like, pinned,
|
||||
pinned_array, pinned_array_like,
|
||||
mapped_array, to_device, auto_device)
|
||||
from .cudadrv import devicearray
|
||||
from .cudadrv.devices import require_context, gpus
|
||||
from .cudadrv.devices import get_context as current_context
|
||||
from .cudadrv.runtime import runtime
|
||||
from numba.core import config
|
||||
# Alias the reduction decorator under the same name as the real CUDA target.
reduce = Reduce


# Register simulated vector types as module level variables.
# NOTE: assumes vector_types is non-empty; `del` of the loop variables would
# raise NameError otherwise.
for name, svty in vector_types.items():
    setattr(sys.modules[__name__], name, svty)
    for alias in svty.aliases:
        setattr(sys.modules[__name__], alias, svty)
del vector_types, name, svty, alias


# Ensure that any user code attempting to import cudadrv etc. gets the
# simulator's version and not the real version if the simulator is enabled.
if config.ENABLE_CUDASIM:
    # Bug fix: removed a redundant `import sys` here -- sys is already
    # imported at the top of this module.
    from numba.cuda.simulator import cudadrv
    sys.modules['numba.cuda.cudadrv'] = cudadrv
    sys.modules['numba.cuda.cudadrv.devicearray'] = cudadrv.devicearray
    sys.modules['numba.cuda.cudadrv.devices'] = cudadrv.devices
    sys.modules['numba.cuda.cudadrv.driver'] = cudadrv.driver
    sys.modules['numba.cuda.cudadrv.runtime'] = cudadrv.runtime
    sys.modules['numba.cuda.cudadrv.drvapi'] = cudadrv.drvapi
    sys.modules['numba.cuda.cudadrv.error'] = cudadrv.error
    sys.modules['numba.cuda.cudadrv.nvvm'] = cudadrv.nvvm

    from . import compiler
    sys.modules['numba.cuda.compiler'] = compiler
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,110 @@
|
||||
'''
|
||||
Contains CUDA API functions
|
||||
'''
|
||||
|
||||
# Imports here bring together parts of the API from other modules, so some of
|
||||
# them appear unused.
|
||||
from contextlib import contextmanager
|
||||
|
||||
from .cudadrv.devices import require_context, reset, gpus # noqa: F401
|
||||
from .kernel import FakeCUDAKernel
|
||||
from numba.core.sigutils import is_signature
|
||||
from warnings import warn
|
||||
from ..args import In, Out, InOut # noqa: F401
|
||||
|
||||
|
||||
def select_device(dev=0):
    """Pretend to select a device; the simulator provides only device 0."""
    if dev != 0:
        raise AssertionError('Only a single device supported by the simulator')
|
||||
|
||||
|
||||
def is_float16_supported():
    """The simulator always supports float16 operations."""
    return True
|
||||
|
||||
|
||||
class stream(object):
    """Simulated CUDA stream.

    All simulated execution happens synchronously, so both forms of
    synchronization offered here have nothing to wait for.
    """

    @contextmanager
    def auto_synchronize(self):
        # No pending asynchronous work exists; simply pass control through.
        yield

    def synchronize(self):
        # Nothing to do: every operation has already completed.
        pass
|
||||
|
||||
|
||||
def synchronize():
    """No-op: the simulator executes all work synchronously."""
    return None
|
||||
|
||||
|
||||
def close():
    """Mark the simulated device list as closed."""
    gpus.closed = True
|
||||
|
||||
|
||||
def declare_device(*args, **kwargs):
    """Accept and ignore declarations of external device functions."""
    return None
|
||||
|
||||
|
||||
def detect():
    """Print a fixed description of the single simulated device."""
    report = [
        'Found 1 CUDA devices',
        'id %d %20s %40s' % (0, 'SIMULATOR', '[SUPPORTED]'),
        '%40s: 5.0' % 'compute capability',
    ]
    for line in report:
        print(line)
|
||||
|
||||
|
||||
def list_devices():
    """Return the global simulated device list."""
    return gpus
|
||||
|
||||
|
||||
# Events
|
||||
|
||||
class Event(object):
    """Simulated CUDA event.

    Events can be recorded and waited upon for API compatibility, but no
    timing information is collected because all simulated execution is
    synchronous.
    """

    def record(self, stream=0):
        # Execution is already complete by the time this is called.
        pass

    def wait(self, stream=0):
        # There is no asynchronous work to wait for.
        pass

    def synchronize(self):
        pass

    def elapsed_time(self, event):
        # Real timings cannot be produced; warn and report a fixed value.
        warn('Simulator timings are bogus')
        return 0.0


event = Event
|
||||
|
||||
|
||||
def jit(func_or_sig=None, device=False, debug=False, argtypes=None,
        inline=False, restype=None, fastmath=False, link=None,
        boundscheck=None, opt=True, cache=None
        ):
    """Simulated version of the ``cuda.jit`` decorator.

    Most parameters exist only for API compatibility with the real CUDA
    target and are ignored; ``device``, ``debug`` and ``fastmath`` are
    forwarded to the FakeCUDAKernel.

    Raises NotImplementedError for ``boundscheck`` and ``link``, which the
    simulator cannot honour.
    """
    # Here for API compatibility
    if boundscheck:
        raise NotImplementedError("bounds checking is not supported for CUDA")

    if link is not None:
        raise NotImplementedError('Cannot link PTX in the simulator')

    # Check for first argument specifying types - in that case the
    # decorator is not being passed a function
    if (func_or_sig is None or is_signature(func_or_sig)
            or isinstance(func_or_sig, list)):
        def jitwrapper(fn):
            return FakeCUDAKernel(fn,
                                  device=device,
                                  fastmath=fastmath,
                                  debug=debug)
        return jitwrapper
    # Bug fix: the direct-decoration path previously dropped ``fastmath``,
    # inconsistent with the signature path above.
    return FakeCUDAKernel(func_or_sig, device=device, fastmath=fastmath,
                          debug=debug)
|
||||
|
||||
|
||||
@contextmanager
def defer_cleanup():
    """No-op context manager; the simulator has no cleanup to defer."""
    yield
|
||||
@@ -0,0 +1,9 @@
|
||||
'''
The compiler is not implemented in the simulator. This module provides a stub
to allow tests to import successfully.
'''

# None placeholders standing in for the real compiler entry points; code
# that checks for their presence can import them, but they cannot be called.
compile = None
compile_for_current_device = None
compile_ptx = None
compile_ptx_for_current_device = None
|
||||
@@ -0,0 +1,2 @@
|
||||
from numba.cuda.simulator.cudadrv import (devicearray, devices, driver, drvapi,
|
||||
error, nvvm)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,436 @@
|
||||
'''
|
||||
The Device Array API is not implemented in the simulator. This module provides
|
||||
stubs to allow tests to import correctly.
|
||||
'''
|
||||
from contextlib import contextmanager
|
||||
from numba.np.numpy_support import numpy_version
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# Stubs for device-record support, which the simulator does not implement.
DeviceRecord = None
from_record_like = None


# Message raised when a transfer is attempted from/to a non-contiguous buffer.
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
                            "be transferred as a single memory region. Please "
                            "ensure contiguous buffer with numpy "
                            ".ascontiguousarray()")
|
||||
|
||||
|
||||
class FakeShape(tuple):
    """Tuple subclass for array shapes that forbids negative indexing,
    matching shape semantics in CUDA Python. (Plain NumPy shape tuples
    permit negative indices.)
    """

    def __getitem__(self, k):
        # Reject negative integer indices before delegating to tuple.
        if isinstance(k, int) and k < 0:
            raise IndexError('tuple index out of range')
        return super().__getitem__(k)
|
||||
|
||||
|
||||
class FakeWithinKernelCUDAArray(object):
    '''
    Created to emulate the behavior of arrays within kernels, where either
    array.item or array['item'] is valid (that is, give all structured
    arrays `numpy.recarray`-like semantics). This behaviour does not follow
    the semantics of Python and NumPy with non-jitted code, and will be
    deprecated and removed.
    '''

    def __init__(self, item):
        # Write through __dict__ so our __setattr__ (which forwards to the
        # wrapped array) is not triggered.
        assert isinstance(item, FakeCUDAArray)
        self.__dict__['_item'] = item

    def __wrap_if_fake(self, item):
        # Keep wrapping nested FakeCUDAArrays so in-kernel semantics are
        # preserved through attribute access and indexing.
        if isinstance(item, FakeCUDAArray):
            return FakeWithinKernelCUDAArray(item)
        else:
            return item

    def __getattr__(self, attrname):
        try:
            if attrname in dir(self._item._ary):  # For e.g. array size.
                return self.__wrap_if_fake(getattr(self._item._ary, attrname))
            else:
                return self.__wrap_if_fake(self._item.__getitem__(attrname))
        except AttributeError:
            # Bug fix: previously an AttributeError was swallowed and the
            # method implicitly returned None, making missing attributes
            # appear to exist with value None.
            raise
        except Exception as e:
            # Normalize other lookup failures (e.g. field-indexing errors)
            # to AttributeError so hasattr() behaves correctly.
            raise AttributeError(attrname) from e

    def __setattr__(self, nm, val):
        # Attribute assignment writes through as a field assignment.
        self._item.__setitem__(nm, val)

    def __getitem__(self, idx):
        return self.__wrap_if_fake(self._item.__getitem__(idx))

    def __setitem__(self, idx, val):
        self._item.__setitem__(idx, val)

    def __len__(self):
        return len(self._item)

    def __array_ufunc__(self, ufunc, method, *args, **kwargs):
        # ufuncs can only be called directly on instances of numpy.ndarray (not
        # things that implement its interfaces, like the FakeCUDAArray or
        # FakeWithinKernelCUDAArray). For other objects, __array_ufunc__ is
        # called when they are arguments to ufuncs, to provide an opportunity
        # to somehow implement the ufunc. Since the FakeWithinKernelCUDAArray
        # is just a thin wrapper over an ndarray, we can implement all ufuncs
        # by passing the underlying ndarrays to a call to the intended ufunc.
        call = getattr(ufunc, method)

        def convert_fakes(obj):
            if isinstance(obj, FakeWithinKernelCUDAArray):
                obj = obj._item._ary

            return obj

        out = kwargs.get('out')
        if out:
            kwargs['out'] = tuple(convert_fakes(o) for o in out)
        args = tuple(convert_fakes(a) for a in args)
        return call(*args, **kwargs)
|
||||
|
||||
|
||||
class FakeCUDAArray(object):
    '''
    Implements the interface of a DeviceArray/DeviceRecord, but mostly just
    wraps a NumPy array.
    '''

    __cuda_ndarray__ = True  # There must be gpu_data attribute

    def __init__(self, ary, stream=0):
        # ary: the wrapped NumPy ndarray.
        # stream: retained for API compatibility; the simulator runs
        # synchronously so it has no effect.
        self._ary = ary
        self.stream = stream

    @property
    def alloc_size(self):
        # Allocation size equals the wrapped array's byte count.
        return self._ary.nbytes

    @property
    def nbytes(self):
        # return nbytes -- FakeCUDAArray is a wrapper around NumPy
        return self._ary.nbytes

    def __getattr__(self, attrname):
        # Delegate unknown attributes to the wrapped ndarray, rewording the
        # error so the wrapping is evident.
        try:
            attr = getattr(self._ary, attrname)
            return attr
        except AttributeError as e:
            msg = "Wrapped array has no attribute '%s'" % attrname
            raise AttributeError(msg) from e

    def bind(self, stream=0):
        # Return a new wrapper over the same data, "bound" to a stream.
        return FakeCUDAArray(self._ary, stream)

    @property
    def T(self):
        return self.transpose()

    def transpose(self, axes=None):
        return FakeCUDAArray(np.transpose(self._ary, axes=axes))

    def __getitem__(self, idx):
        # Scalar results are returned as-is; ndarray/void results stay
        # wrapped so device-array semantics are preserved.
        ret = self._ary.__getitem__(idx)
        if type(ret) not in [np.ndarray, np.void]:
            return ret
        else:
            return FakeCUDAArray(ret, stream=self.stream)

    def __setitem__(self, idx, val):
        return self._ary.__setitem__(idx, val)

    def copy_to_host(self, ary=None, stream=0):
        # Copy data to `ary`, allocating a fresh host array when none given.
        if ary is None:
            ary = np.empty_like(self._ary)
        else:
            check_array_compatibility(self, ary)
        np.copyto(ary, self._ary)
        return ary

    def copy_to_device(self, ary, stream=0):
        '''
        Copy from the provided array into this array.

        This may be less forgiving than the CUDA Python implementation, which
        will copy data up to the length of the smallest of the two arrays,
        whereas this expects the size of the arrays to be equal.
        '''
        sentry_contiguous(self)
        self_core, ary_core = array_core(self), array_core(ary)
        if isinstance(ary, FakeCUDAArray):
            sentry_contiguous(ary)
            check_array_compatibility(self_core, ary_core)
        else:
            # Host source: coerce to an ndarray matching this array's
            # memory order. copy=None is required on NumPy >= 2.0 where
            # copy=False raises when a copy would be needed.
            ary_core = np.array(
                ary_core,
                order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
                subok=True,
                copy=False if numpy_version < (2, 0) else None)
            check_array_compatibility(self_core, ary_core)
        np.copyto(self_core._ary, ary_core)

    @property
    def shape(self):
        # FakeShape forbids negative indexing, like real device arrays.
        return FakeShape(self._ary.shape)

    def ravel(self, *args, **kwargs):
        return FakeCUDAArray(self._ary.ravel(*args, **kwargs))

    def reshape(self, *args, **kwargs):
        return FakeCUDAArray(self._ary.reshape(*args, **kwargs))

    def view(self, *args, **kwargs):
        return FakeCUDAArray(self._ary.view(*args, **kwargs))

    def is_c_contiguous(self):
        return self._ary.flags.c_contiguous

    def is_f_contiguous(self):
        return self._ary.flags.f_contiguous

    def __str__(self):
        return str(self._ary)

    def __repr__(self):
        return repr(self._ary)

    def __len__(self):
        return len(self._ary)

    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable — confirm no caller relies on hashing these.
    # TODO: Add inplace, bitwise, unary magic methods
    # (or maybe inherit this class from numpy)?
    def __eq__(self, other):
        return FakeCUDAArray(self._ary == other)

    def __ne__(self, other):
        return FakeCUDAArray(self._ary != other)

    def __lt__(self, other):
        return FakeCUDAArray(self._ary < other)

    def __le__(self, other):
        return FakeCUDAArray(self._ary <= other)

    def __gt__(self, other):
        return FakeCUDAArray(self._ary > other)

    def __ge__(self, other):
        return FakeCUDAArray(self._ary >= other)

    def __add__(self, other):
        return FakeCUDAArray(self._ary + other)

    def __sub__(self, other):
        return FakeCUDAArray(self._ary - other)

    def __mul__(self, other):
        return FakeCUDAArray(self._ary * other)

    def __floordiv__(self, other):
        return FakeCUDAArray(self._ary // other)

    def __truediv__(self, other):
        return FakeCUDAArray(self._ary / other)

    def __mod__(self, other):
        return FakeCUDAArray(self._ary % other)

    def __pow__(self, other):
        return FakeCUDAArray(self._ary ** other)

    def split(self, section, stream=0):
        # Split into chunks of `section` elements along the first axis.
        return [
            FakeCUDAArray(a)
            for a in np.split(self._ary, range(section, len(self), section))
        ]
|
||||
|
||||
|
||||
def array_core(ary):
    """
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.
    """
    # 0-d and empty arrays are returned unchanged.
    if not ary.strides or not ary.size:
        return ary
    # Select index 0 along every stride-0 (broadcast) dimension, keeping a
    # full slice everywhere else.
    selector = tuple(0 if stride == 0 else slice(None)
                     for stride in ary.strides)
    return ary[selector]
|
||||
|
||||
|
||||
def is_contiguous(ary):
    """
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    """
    expected = ary.dtype.itemsize
    # Walk dimensions from innermost to outermost, checking each effective
    # stride matches the running element-block size.
    for extent, stride in zip(reversed(ary.shape), reversed(ary.strides)):
        if extent > 1 and stride != 0:
            if stride != expected:
                return False
            expected *= extent
    return True
|
||||
|
||||
|
||||
def sentry_contiguous(ary):
    """Raise ValueError when `ary`'s broadcast core is neither C- nor
    F-contiguous."""
    flags = array_core(ary).flags
    if not (flags['C_CONTIGUOUS'] or flags['F_CONTIGUOUS']):
        raise ValueError(errmsg_contiguous_buffer)
|
||||
|
||||
|
||||
def check_array_compatibility(ary1, ary2):
    """Ensure two arrays agree on dtype, squeezed shape and squeezed strides.

    Raises TypeError for a dtype mismatch and ValueError for a shape or
    stride mismatch; the original (unsqueezed) properties are reported.
    """
    squeezed1, squeezed2 = ary1.squeeze(), ary2.squeeze()
    if ary1.dtype != ary2.dtype:
        raise TypeError('incompatible dtype: %s vs. %s' %
                        (ary1.dtype, ary2.dtype))
    if squeezed1.shape != squeezed2.shape:
        raise ValueError('incompatible shape: %s vs. %s' %
                         (ary1.shape, ary2.shape))
    if squeezed1.strides != squeezed2.strides:
        raise ValueError('incompatible strides: %s vs. %s' %
                         (ary1.strides, ary2.strides))
|
||||
|
||||
|
||||
def to_device(ary, stream=0, copy=True, to=None):
    # Simulated host-to-device transfer. `stream` and `copy` are accepted
    # for API compatibility; this implementation always copies into a new
    # FakeCUDAArray unless `to` is given.
    ary = np.array(ary,
                   copy=False if numpy_version < (2, 0) else None,
                   subok=True)
    sentry_contiguous(ary)
    if to is None:
        # datetime64/timedelta64 ('M'/'m') buffers cannot back np.ndarray
        # directly, so view the copied buffer as int64 for construction.
        buffer_dtype = np.int64 if ary.dtype.char in 'Mm' else ary.dtype
        return FakeCUDAArray(
            np.ndarray(
                buffer=np.copy(array_core(ary)).view(buffer_dtype),
                dtype=ary.dtype,
                shape=ary.shape,
                strides=ary.strides,
            ).view(type=type(ary)),
        )
    else:
        # NOTE(review): this branch returns None rather than `to`; callers
        # appear to rely only on the in-place copy — confirm.
        to.copy_to_device(ary, stream=stream)
|
||||
|
||||
|
||||
@contextmanager
def pinned(arg):
    """No-op context manager standing in for pinned-memory registration."""
    yield
|
||||
|
||||
|
||||
def mapped_array(*args, **kwargs):
    """Simulated mapped array: identical to a plain device array.

    The 'portable' and 'wc' keyword arguments are accepted for API
    compatibility and discarded.
    """
    kwargs.pop('portable', None)
    kwargs.pop('wc', None)
    return device_array(*args, **kwargs)
|
||||
|
||||
|
||||
def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
    """Return an ordinary ndarray; host memory pinning is not simulated."""
    return np.ndarray(shape=shape, dtype=dtype, strides=strides, order=order)
|
||||
|
||||
|
||||
def managed_array(shape, dtype=np.float64, strides=None, order='C'):
    """Return an ordinary ndarray; managed memory is not simulated."""
    return np.ndarray(shape=shape, dtype=dtype, strides=strides, order=order)
|
||||
|
||||
|
||||
def device_array(*args, **kwargs):
    """Create a FakeCUDAArray backed by an uninitialized ndarray."""
    stream = kwargs.pop('stream', 0)
    return FakeCUDAArray(np.ndarray(*args, **kwargs), stream=stream)
|
||||
|
||||
|
||||
def _contiguous_strides_like_array(ary):
    """
    Given an array, compute strides for a new contiguous array of the same
    shape.

    Returns None when the default strides are already sufficient to create
    a contiguous array.
    """
    # Contiguous (or <=1-D) inputs need no explicit strides.
    if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
        return None

    # Algorithm adapted from NumPy v1.17.4's PyArray_NewLikeArrayWithShape
    # in core/src/multiarray/ctors.c: visit axes in ascending order of
    # their current stride and lay out the new strides in that same order.
    axis_order = sorted(range(ary.ndim), key=lambda axis: ary.strides[axis])

    new_strides = [0] * ary.ndim
    stride = ary.dtype.itemsize
    for axis in axis_order:
        new_strides[axis] = stride
        stride *= ary.shape[axis]
    return tuple(new_strides)
|
||||
|
||||
|
||||
def _order_like_array(ary):
    """Return 'F' for arrays that are only Fortran-contiguous, else 'C'."""
    f_only = ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']
    return 'F' if f_only else 'C'
|
||||
|
||||
|
||||
def device_array_like(ary, stream=0):
    """Create an uninitialized device array matching `ary`'s shape, dtype
    and memory layout."""
    return device_array(shape=ary.shape, dtype=ary.dtype,
                        strides=_contiguous_strides_like_array(ary),
                        order=_order_like_array(ary))
|
||||
|
||||
|
||||
def pinned_array_like(ary):
    """Create an uninitialized pinned array matching `ary`'s shape, dtype
    and memory layout."""
    return pinned_array(shape=ary.shape, dtype=ary.dtype,
                        strides=_contiguous_strides_like_array(ary),
                        order=_order_like_array(ary))
|
||||
|
||||
|
||||
def auto_device(ary, stream=0, copy=True):
    """Transfer `ary` to the simulated device unless it already is one.

    Returns a (device_array, copied) pair mirroring the real CUDA API.
    NOTE(review): `copy` is forwarded positionally to to_device, which does
    not consult it — confirm.
    """
    if isinstance(ary, FakeCUDAArray):
        # Already a device array: no transfer needed.
        return ary, False

    if not isinstance(ary, np.void):
        copy_flag = False if numpy_version < (2, 0) else None
        ary = np.array(ary, copy=copy_flag, subok=True)
    return to_device(ary, stream, copy), True
|
||||
|
||||
|
||||
def is_cuda_ndarray(obj):
    """Return True when `obj` advertises the CUDA ndarray interface."""
    return getattr(obj, '__cuda_ndarray__', False)
|
||||
|
||||
|
||||
def verify_cuda_ndarray_interface(obj):
    """Verify that `obj` exposes the CUDA ndarray interface attributes."""
    require_cuda_ndarray(obj)

    def check_attr(attr, typ):
        # The attribute must both exist and carry the expected type.
        if not hasattr(obj, attr):
            raise AttributeError(attr)
        if not isinstance(getattr(obj, attr), typ):
            raise AttributeError('%s must be of type %s' % (attr, typ))

    check_attr('shape', tuple)
    check_attr('strides', tuple)
    check_attr('dtype', np.dtype)
    check_attr('size', int)
|
||||
|
||||
|
||||
def require_cuda_ndarray(obj):
    """Raise ValueError if is_cuda_ndarray(obj) evaluates False."""
    if not is_cuda_ndarray(obj):
        raise ValueError('require an cuda ndarray object')
|
||||
@@ -0,0 +1,117 @@
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
|
||||
# free/total pair mimicking the driver's memory-info result shape.
_MemoryInfo = namedtuple("_MemoryInfo", "free,total")

# Compute capability reported for the simulated device.
_SIMULATOR_CC = (5, 2)
|
||||
|
||||
|
||||
class FakeCUDADevice:
    """Stand-in for a CUDA device; exactly one simulated device exists."""

    def __init__(self):
        # Fixed UUID: there is only ever one simulated device.
        self.uuid = 'GPU-00000000-0000-0000-0000-000000000000'

    @property
    def compute_capability(self):
        return _SIMULATOR_CC
|
||||
|
||||
|
||||
class FakeCUDAContext:
    '''
    This stub implements functionality only for simulating a single GPU
    at the moment.
    '''
    def __init__(self, device_id):
        # device_id: integer index of the (single) simulated device.
        self._device_id = device_id
        self._device = FakeCUDADevice()

    def __enter__(self):
        # NOTE(review): returns None rather than self, so `with ctx as c:`
        # binds None — confirm callers do not rely on the bound value.
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __str__(self):
        return "<Managed Device {self.id}>".format(self=self)

    @property
    def id(self):
        # The device index this context wraps.
        return self._device_id

    @property
    def device(self):
        return self._device

    @property
    def compute_capability(self):
        return _SIMULATOR_CC

    def reset(self):
        # Nothing to tear down in the simulator.
        pass

    def get_memory_info(self):
        """
        Cross-platform free / total host memory is hard without external
        dependencies, e.g. `psutil` - so return infinite memory to maintain API
        type compatibility
        """
        return _MemoryInfo(float('inf'), float('inf'))

    def memalloc(self, sz):
        """
        Allocates memory on the simulated device
        At present, there is no division between simulated
        host memory and simulated device memory.
        """
        return np.ndarray(sz, dtype='u1')

    def memhostalloc(self, sz, mapped=False, portable=False, wc=False):
        '''Allocates memory on the host'''
        # All flags are ignored; host and device memory are the same here.
        return self.memalloc(sz)
|
||||
|
||||
|
||||
class FakeDeviceList:
    """Device list containing the single simulated GPU.

    Also tracks whether the context has been closed (e.g. by a user call
    to reset()).
    """

    def __init__(self):
        self.lst = (FakeCUDAContext(0),)
        self.closed = False

    def __getitem__(self, devnum):
        # Accessing a device implicitly re-opens the list.
        self.closed = False
        return self.lst[devnum]

    def __str__(self):
        return ', '.join(str(ctx) for ctx in self.lst)

    def __iter__(self):
        return iter(self.lst)

    def __len__(self):
        return len(self.lst)

    @property
    def current(self):
        # No device is current once the list has been closed.
        return None if self.closed else self.lst[0]
|
||||
|
||||
|
||||
# The single global device list used throughout the simulator.
gpus = FakeDeviceList()


def reset():
    """Mark the (only) simulated context as closed."""
    # NOTE(review): this sets `closed` on the FakeCUDAContext returned by
    # gpus[0], while close() in api.py sets it on the list — confirm this
    # asymmetry is intended.
    gpus[0].closed = True
|
||||
|
||||
|
||||
def get_context(devnum=0):
    """Return a fresh FakeCUDAContext for the requested device number."""
    return FakeCUDAContext(devnum)
|
||||
|
||||
|
||||
def require_context(func):
    """Return `func` unchanged; a context always exists in the simulator."""
    return func
|
||||
@@ -0,0 +1,62 @@
|
||||
'''
|
||||
Most of the driver API is unsupported in the simulator, but some stubs are
|
||||
provided to allow tests to import correctly.
|
||||
'''
|
||||
|
||||
|
||||
def device_memset(dst, val, size, stream=0):
    """Fill the first `size` bytes of `dst` with the byte value `val`."""
    # bytes([val]) validates that 0 <= val < 256 before filling.
    byte_val = bytes([val])[0]
    dst.view('u1')[:size].fill(byte_val)
|
||||
|
||||
|
||||
def host_to_device(dst, src, size, stream=0):
    """Copy the first `size` bytes of `src` into `dst`."""
    dst_bytes = dst.view('u1')
    src_bytes = src.view('u1')
    dst_bytes[:size] = src_bytes[:size]
|
||||
|
||||
|
||||
def device_to_host(dst, src, size, stream=0):
    """Byte-for-byte copy; identical to host_to_device in the simulator."""
    host_to_device(dst, src, size)
|
||||
|
||||
|
||||
def device_memory_size(obj):
    """Total bytes occupied: element size times element count."""
    return obj.size * obj.itemsize
|
||||
|
||||
|
||||
def device_to_device(dst, src, size, stream=0):
    """Byte-for-byte copy; identical to host_to_device in the simulator."""
    host_to_device(dst, src, size)
|
||||
|
||||
|
||||
class FakeDriver(object):
    """Stub driver exposing the single simulated device."""

    def get_device_count(self):
        # Exactly one simulated device exists.
        return 1


# Module-level driver instance, mirroring the real cudadrv.driver module.
driver = FakeDriver()
|
||||
|
||||
|
||||
class Linker:
    """Stub linker; linking is unsupported in the simulator."""

    @classmethod
    def new(cls, max_registers=0, lineinfo=False, cc=None):
        # Arguments are accepted for API compatibility only.
        return Linker()

    @property
    def lto(self):
        # Link-time optimization is never available here.
        return False
|
||||
|
||||
|
||||
class LinkerError(RuntimeError):
    """Raised for linker failures (stub)."""


class NvrtcError(RuntimeError):
    """Raised for NVRTC failures (stub)."""


class CudaAPIError(RuntimeError):
    """Raised for CUDA driver API failures (stub)."""
|
||||
|
||||
|
||||
def launch_kernel(*args, **kwargs):
    """Always fail: kernels cannot be launched directly in the simulator."""
    msg = 'Launching kernels directly is not supported in the simulator'
    raise RuntimeError(msg)


# The simulator never uses the NVIDIA CUDA Python bindings.
USE_NV_BINDING = False
|
||||
@@ -0,0 +1,4 @@
|
||||
'''
|
||||
drvapi is not implemented in the simulator, but this module exists to allow
|
||||
tests to import correctly.
|
||||
'''
|
||||
@@ -0,0 +1,4 @@
|
||||
# Dummy arrays are not implemented in the simulator. This file allows the dummy
# array tests to be imported, but they are skipped on the simulator.

# Placeholder for the real dummyarray.Array type.
Array = None
|
||||
@@ -0,0 +1,6 @@
|
||||
class CudaSupportError(RuntimeError):
    """Raised when CUDA support is unavailable."""


class NvrtcError(Exception):
    """Raised for NVRTC-related failures."""
|
||||
@@ -0,0 +1,2 @@
|
||||
def check_static_lib(lib):
    """Static libraries can never be located under the simulator."""
    raise FileNotFoundError('Linking libraries not supported by cudasim')
|
||||
@@ -0,0 +1,29 @@
|
||||
'''
|
||||
NVVM is not supported in the simulator, but stubs are provided to allow tests
|
||||
to import correctly.
|
||||
'''
|
||||
|
||||
|
||||
class NvvmSupportError(ImportError):
    """Raised when NVVM functionality is requested under the simulator."""
|
||||
|
||||
|
||||
class NVVM(object):
    """Stub that refuses construction: NVVM is absent from the simulator."""

    def __init__(self):
        raise NvvmSupportError('NVVM not supported in the simulator')
|
||||
|
||||
|
||||
# None placeholders covering the real NVVM module's API surface; they exist
# only so that imports of these names succeed under the simulator.
CompilationUnit = None
compile_ir = None
set_cuda_kernel = None
get_arch_option = None
LibDevice = None
NvvmError = None
|
||||
|
||||
|
||||
def is_available():
    """NVVM is never available under the simulator."""
    return False
|
||||
|
||||
|
||||
def get_supported_ccs():
    """No compute capabilities are supported by the simulated NVVM."""
    return ()
|
||||
@@ -0,0 +1,19 @@
|
||||
'''
|
||||
The runtime API is unsupported in the simulator, but some stubs are
|
||||
provided to allow tests to import correctly.
|
||||
'''
|
||||
|
||||
|
||||
class FakeRuntime(object):
    """Stub of the CUDA runtime reporting a sentinel version."""

    def get_version(self):
        # (-1, -1) signals "no real runtime present".
        return (-1, -1)

    def is_supported_version(self):
        # The sentinel version is always deemed supported.
        return True

    @property
    def supported_versions(self):
        # A one-element tuple containing only the sentinel version.
        return ((-1, -1),)


# Module-level runtime instance, mirroring the real cudadrv.runtime module.
runtime = FakeRuntime()
|
||||
@@ -0,0 +1,312 @@
|
||||
from contextlib import contextmanager
|
||||
import functools
|
||||
import sys
|
||||
import threading
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray
|
||||
from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module
|
||||
from ..errors import normalize_kernel_dimensions
|
||||
from ..args import wrap_arg, ArgHint
|
||||
|
||||
|
||||
"""
|
||||
Global variable to keep track of the current "kernel context", i.e the
|
||||
FakeCUDAModule. We only support one kernel launch at a time.
|
||||
No support for concurrent kernel launch.
|
||||
"""
|
||||
_kernel_context = None
|
||||
|
||||
|
||||
@contextmanager
def _push_kernel_context(mod):
    """
    Push the current kernel context.
    """
    global _kernel_context
    # Only one simulated kernel launch may be active at a time.
    assert _kernel_context is None, "concurrent simulated kernel not supported"
    _kernel_context = mod
    try:
        yield
    finally:
        # Always clear the context, even when the kernel raised.
        _kernel_context = None
|
||||
|
||||
|
||||
def _get_kernel_context():
    """
    Get the current kernel context. This is usually done by a device function.
    """
    # Returns None when no simulated kernel launch is in progress.
    return _kernel_context
|
||||
|
||||
|
||||
class FakeOverload:
    """Provides only the max_cooperative_grid_blocks method."""

    def max_cooperative_grid_blocks(self, blockdim):
        # A cooperative grid is limited to a single block because the
        # simulator has no mechanism for inter-block synchronization.
        return 1
|
||||
|
||||
|
||||
class FakeOverloadDict(dict):
    """Mapping that yields a FakeOverload for every signature lookup."""

    def __getitem__(self, key):
        # The simulator does not track overloads, so every signature maps
        # to a fresh fake overload.
        return FakeOverload()
|
||||
|
||||
|
||||
class FakeCUDAKernel(object):
    '''
    Wraps a @cuda.jit-ed function.
    '''

    def __init__(
            self, fn, device, fastmath=False, extensions=None, debug=False
    ):
        # fn: the Python function being simulated.
        # device: True for a device function, False for a kernel.
        if extensions is None:
            extensions = []
        self.fn = fn
        self._device = device
        self._fastmath = fastmath
        self._debug = debug
        self.extensions = list(extensions)  # defensive copy
        # Initial configuration: grid unconfigured, stream 0, no dynamic shared
        # memory.
        self.grid_dim = None
        self.block_dim = None
        self.stream = 0
        self.dynshared_size = 0
        functools.update_wrapper(self, fn)

    def __call__(self, *args):
        # Device functions execute directly within the calling kernel's
        # context, with the cuda module swapped for the fake one.
        if self._device:
            with swapped_cuda_module(self.fn, _get_kernel_context()):
                return self.fn(*args)

        # Ensure we've been given a valid grid configuration
        grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim,
                                                          self.block_dim)

        fake_cuda_module = FakeCUDAModule(grid_dim, block_dim,
                                          self.dynshared_size)
        with _push_kernel_context(fake_cuda_module):
            # fake_args substitutes all numpy arrays for FakeCUDAArrays
            # because they implement some semantics differently
            retr = []  # writeback callables, invoked after the launch

            def fake_arg(arg):
                # map the arguments using any extension you've registered
                _, arg = functools.reduce(
                    lambda ty_val, extension: extension.prepare_args(
                        *ty_val,
                        stream=0,
                        retr=retr),
                    self.extensions,
                    (None, arg)
                )

                if isinstance(arg, np.ndarray) and arg.ndim > 0:
                    ret = wrap_arg(arg).to_device(retr)
                elif isinstance(arg, ArgHint):
                    ret = arg.to_device(retr)
                elif isinstance(arg, np.void):
                    ret = FakeCUDAArray(arg)  # In case a np record comes in.
                else:
                    ret = arg
                if isinstance(ret, FakeCUDAArray):
                    return FakeWithinKernelCUDAArray(ret)
                return ret

            fake_args = [fake_arg(arg) for arg in args]
            with swapped_cuda_module(self.fn, fake_cuda_module):
                # Execute one block at a time
                for grid_point in np.ndindex(*grid_dim):
                    bm = BlockManager(self.fn, grid_dim, block_dim, self._debug)
                    bm.run(grid_point, *fake_args)

            # Write device results back to the original host arrays.
            for wb in retr:
                wb()

    def __getitem__(self, configuration):
        # Implements kernel[griddim, blockdim, (stream), (dynshared)].
        self.grid_dim, self.block_dim = \
            normalize_kernel_dimensions(*configuration[:2])

        if len(configuration) == 4:
            self.dynshared_size = configuration[3]

        return self

    def bind(self):
        # Nothing to bind in the simulator.
        pass

    def specialize(self, *args):
        # Specialization is a no-op; the same object handles all types.
        return self

    def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
        # One-task-per-thread launch helper, mirroring the real API.
        if ntasks < 0:
            raise ValueError("Can't create ForAll with negative task count: %s"
                             % ntasks)
        return self[ntasks, 1, stream, sharedmem]

    @property
    def overloads(self):
        # Any signature lookup yields a fake overload.
        return FakeOverloadDict()

    @property
    def py_func(self):
        # The wrapped Python function.
        return self.fn
|
||||
|
||||
|
||||
# Thread emulation
|
||||
|
||||
class BlockThread(threading.Thread):
|
||||
'''
|
||||
Manages the execution of a function for a single CUDA thread.
|
||||
'''
|
||||
    def __init__(self, f, manager, blockIdx, threadIdx, debug):
        # f: the kernel function run by this simulated CUDA thread.
        # manager: the BlockManager coordinating all threads in the block.
        if debug:
            def debug_wrapper(*args, **kwargs):
                # In debug mode, make division errors raise instead of warn.
                np.seterr(divide='raise')
                f(*args, **kwargs)
            target = debug_wrapper
        else:
            target = f

        super(BlockThread, self).__init__(target=target)
        self.syncthreads_event = threading.Event()
        self.syncthreads_blocked = False
        self._manager = manager
        self.blockIdx = Dim3(*blockIdx)
        self.threadIdx = Dim3(*threadIdx)
        self.exception = None  # populated by run() on failure
        self.daemon = True
        self.abort = False  # set by the manager to tear the block down
        self.debug = debug
        blockDim = Dim3(*self._manager._block_dim)
        # Linearized thread id within the block (x varies fastest).
        self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y +
                                                           blockDim.y *
                                                           self.threadIdx.z))
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
super(BlockThread, self).run()
|
||||
except Exception as e:
|
||||
tid = 'tid=%s' % list(self.threadIdx)
|
||||
ctaid = 'ctaid=%s' % list(self.blockIdx)
|
||||
if str(e) == '':
|
||||
msg = '%s %s' % (tid, ctaid)
|
||||
else:
|
||||
msg = '%s %s: %s' % (tid, ctaid, e)
|
||||
tb = sys.exc_info()[2]
|
||||
# Using `with_traceback` here would cause it to be mutated by
|
||||
# future raise statements, which may or may not matter.
|
||||
self.exception = (type(e)(msg), tb)
|
||||
|
||||
def syncthreads(self):
|
||||
|
||||
if self.abort:
|
||||
raise RuntimeError("abort flag set on syncthreads call")
|
||||
|
||||
self.syncthreads_blocked = True
|
||||
self.syncthreads_event.wait()
|
||||
self.syncthreads_event.clear()
|
||||
|
||||
if self.abort:
|
||||
raise RuntimeError("abort flag set on syncthreads clear")
|
||||
|
||||
def syncthreads_count(self, value):
|
||||
idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
|
||||
self._manager.block_state[idx] = value
|
||||
self.syncthreads()
|
||||
count = np.count_nonzero(self._manager.block_state)
|
||||
self.syncthreads()
|
||||
return count
|
||||
|
||||
def syncthreads_and(self, value):
|
||||
idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
|
||||
self._manager.block_state[idx] = value
|
||||
self.syncthreads()
|
||||
test = np.all(self._manager.block_state)
|
||||
self.syncthreads()
|
||||
return 1 if test else 0
|
||||
|
||||
def syncthreads_or(self, value):
|
||||
idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
|
||||
self._manager.block_state[idx] = value
|
||||
self.syncthreads()
|
||||
test = np.any(self._manager.block_state)
|
||||
self.syncthreads()
|
||||
return 1 if test else 0
|
||||
|
||||
def __str__(self):
|
||||
return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx)
|
||||
|
||||
|
||||
class BlockManager(object):
    '''
    Manages the execution of a thread block.

    When run() is called, all threads are started. Each thread executes
    until it hits syncthreads(), at which point it sets its own
    syncthreads_blocked to True so that the BlockManager knows it is
    blocked. It then waits on its syncthreads_event.

    The BlockManager polls threads to determine if they are blocked in
    syncthreads(). If it finds a blocked thread, it adds it to the set of
    blocked threads. When all threads are blocked, it unblocks all of them
    by setting their syncthreads_blocked back to False and setting their
    syncthreads_event.

    The polling continues until no threads are alive, when execution is
    complete.
    '''
    def __init__(self, f, grid_dim, block_dim, debug):
        self._grid_dim = grid_dim
        self._block_dim = block_dim
        self._f = f
        self._debug = debug
        # Shared per-thread state used by syncthreads_count/and/or.
        self.block_state = np.zeros(block_dim, dtype=np.bool_)

    def run(self, grid_point, *args):
        # Start one BlockThread per point of the block.
        def kernel_entry():
            self._f(*args)

        threads = set()
        live = set()
        blocked = set()
        for block_point in np.ndindex(*self._block_dim):
            t = BlockThread(kernel_entry, self, grid_point, block_point,
                            self._debug)
            t.start()
            threads.add(t)
            live.add(t)

        # Potential optimisations:
        # 1. Continue the while loop immediately after finding a blocked
        #    thread
        # 2. Don't poll already-blocked threads
        while live:
            for t in live:
                if t.syncthreads_blocked:
                    blocked.add(t)
                elif t.exception:
                    # Abort all other simulator threads on exception,
                    # do *not* join immediately to facilitate debugging.
                    for other in threads:
                        other.abort = True
                        other.syncthreads_blocked = False
                        other.syncthreads_event.set()

                    raise t.exception[0].with_traceback(t.exception[1])
            if live == blocked:
                # Every live thread has reached the barrier - release them.
                for t in blocked:
                    t.syncthreads_blocked = False
                    t.syncthreads_event.set()
                blocked = set()
            live = {t for t in live if t.is_alive()}

        # Final check for exceptions in case any were set prior to a thread
        # finishing, before we could check it
        for t in threads:
            if t.exception:
                raise t.exception[0].with_traceback(t.exception[1])
|
||||
@@ -0,0 +1,495 @@
|
||||
'''
|
||||
Implements the cuda module as called from within an executing kernel
|
||||
(@cuda.jit-decorated function).
|
||||
'''
|
||||
|
||||
from contextlib import contextmanager
|
||||
import sys
|
||||
import threading
|
||||
import traceback
|
||||
from numba.core import types
|
||||
import numpy as np
|
||||
|
||||
from numba.np import numpy_support
|
||||
|
||||
from .vector_types import vector_types
|
||||
|
||||
|
||||
class Dim3(object):
    '''
    A three-component (x, y, z) value, used to implement thread/block
    indices and dimensions.
    '''
    def __init__(self, x, y, z):
        self.x, self.y, self.z = x, y, z

    def __str__(self):
        return '(%s, %s, %s)' % tuple(self)

    def __repr__(self):
        return 'Dim3(%s, %s, %s)' % tuple(self)

    def __iter__(self):
        return iter((self.x, self.y, self.z))
|
||||
|
||||
|
||||
class GridGroup:
    '''
    Used to implement the grid group.
    '''

    def sync(self):
        # Only cooperative grids with a single block are supported, so
        # grid-wide synchronization reduces to synchronization of the
        # thread block.
        threading.current_thread().syncthreads()
|
||||
|
||||
|
||||
class FakeCUDACg:
    '''
    CUDA Cooperative Groups
    '''
    def this_grid(self):
        # A fresh GridGroup per call; all instances behave identically.
        return GridGroup()
|
||||
|
||||
|
||||
class FakeCUDALocal(object):
    '''
    CUDA Local arrays
    '''
    def array(self, shape, dtype):
        # Accept a Numba type by converting it to the equivalent NumPy
        # dtype; anything else is passed to NumPy unchanged.
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
        return np.empty(shape, dtype)
|
||||
|
||||
|
||||
class FakeCUDAConst(object):
    '''
    CUDA Const arrays
    '''
    def array_like(self, ary):
        # Constant memory is simulated by simply using the host array.
        return ary
|
||||
|
||||
|
||||
class FakeCUDAShared(object):
    '''
    CUDA Shared arrays.

    Limitations: assumes that only one call to cuda.shared.array is on a
    line, and that that line is only executed once per thread. i.e.::

        a = cuda.shared.array(...); b = cuda.shared.array(...)

    will erroneously alias a and b, and::

        for i in range(10):
            sharedarrs[i] = cuda.shared.array(...)

    will alias all arrays created at that point (though it is not certain
    that this would be supported by Numba anyway).
    '''

    def __init__(self, dynshared_size):
        # Maps (filename, lineno) of the allocation site to its array.
        self._allocations = {}
        self._dynshared_size = dynshared_size
        self._dynshared = np.zeros(dynshared_size, dtype=np.byte)

    def array(self, shape, dtype):
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
        # Dynamic shared memory is requested with size 0 - this all shares
        # the same underlying memory
        if shape == 0:
            # Count must be the maximum number of whole elements that fit
            # in the buffer (Numpy complains if the buffer is not a
            # multiple of the element size)
            count = self._dynshared_size // dtype.itemsize
            return np.frombuffer(self._dynshared.data, dtype=dtype,
                                 count=count)

        # Otherwise, identify allocations by source file and line number.
        # We pass the reference frame explicitly to work around
        # http://bugs.python.org/issue25108
        stack = traceback.extract_stack(sys._getframe())
        caller = stack[-2][0:2]
        try:
            return self._allocations[caller]
        except KeyError:
            arr = np.empty(shape, dtype)
            self._allocations[caller] = arr
            return arr
|
||||
|
||||
|
||||
# One module-level lock per atomic primitive, serialising the simulated
# atomic operations across all simulator threads.
(addlock, sublock, andlock, orlock, xorlock, maxlock, minlock,
 compare_and_swaplock, caslock, inclock, declock,
 exchlock) = (threading.Lock() for _ in range(12))


class FakeCUDAAtomic(object):
    '''
    Simulates the cuda.atomic module. Each operation atomically updates
    array[index] and returns the value held there before the update.
    '''
    def add(self, array, index, val):
        with addlock:
            old = array[index]
            array[index] += val
            return old

    def sub(self, array, index, val):
        with sublock:
            old = array[index]
            array[index] -= val
            return old

    def and_(self, array, index, val):
        with andlock:
            old = array[index]
            array[index] &= val
            return old

    def or_(self, array, index, val):
        with orlock:
            old = array[index]
            array[index] |= val
            return old

    def xor(self, array, index, val):
        with xorlock:
            old = array[index]
            array[index] ^= val
            return old

    def inc(self, array, index, val):
        # Wraps to zero once the stored value reaches `val`.
        with inclock:
            old = array[index]
            array[index] = 0 if old >= val else old + 1
            return old

    def dec(self, array, index, val):
        # Resets to `val` when the stored value is zero or exceeds `val`.
        with declock:
            old = array[index]
            array[index] = val if (old == 0) or (old > val) else old - 1
            return old

    def exch(self, array, index, val):
        with exchlock:
            old = array[index]
            array[index] = val
            return old

    def max(self, array, index, val):
        with maxlock:
            old = array[index]
            array[index] = max(old, val)
            return old

    def min(self, array, index, val):
        with minlock:
            old = array[index]
            array[index] = min(old, val)
            return old

    def nanmax(self, array, index, val):
        # NaN-ignoring variant; shares the lock with max.
        with maxlock:
            old = array[index]
            array[index] = np.nanmax([array[index], val])
            return old

    def nanmin(self, array, index, val):
        # NaN-ignoring variant; shares the lock with min.
        with minlock:
            old = array[index]
            array[index] = np.nanmin([array[index], val])
            return old

    def compare_and_swap(self, array, old, val):
        # Always operates on the first element of the array.
        with compare_and_swaplock:
            index = (0,) * array.ndim
            loaded = array[index]
            if loaded == old:
                array[index] = val
            return loaded

    def cas(self, array, index, old, val):
        with caslock:
            loaded = array[index]
            if loaded == old:
                array[index] = val
            return loaded
|
||||
|
||||
|
||||
class FakeCUDAFp16(object):
    '''
    Simulates the cuda.fp16 operations. Arithmetic and comparison helpers
    operate directly on the values they are given; the math functions pass
    an explicit np.float16 dtype to keep results in half precision.

    Bug fix: hfloor previously called np.ceil (copy-paste from hceil),
    rounding towards positive infinity instead of negative infinity; it
    now calls np.floor.
    '''
    def hadd(self, a, b):
        return a + b

    def hsub(self, a, b):
        return a - b

    def hmul(self, a, b):
        return a * b

    def hdiv(self, a, b):
        return a / b

    def hfma(self, a, b, c):
        # Fused multiply-add (computed unfused here)
        return a * b + c

    def hneg(self, a):
        return -a

    def habs(self, a):
        return abs(a)

    def hsin(self, x):
        return np.sin(x, dtype=np.float16)

    def hcos(self, x):
        return np.cos(x, dtype=np.float16)

    def hlog(self, x):
        return np.log(x, dtype=np.float16)

    def hlog2(self, x):
        return np.log2(x, dtype=np.float16)

    def hlog10(self, x):
        return np.log10(x, dtype=np.float16)

    def hexp(self, x):
        return np.exp(x, dtype=np.float16)

    def hexp2(self, x):
        return np.exp2(x, dtype=np.float16)

    def hexp10(self, x):
        # No np.exp10 ufunc exists, so compute via exponentiation
        return np.float16(10 ** x)

    def hsqrt(self, x):
        return np.sqrt(x, dtype=np.float16)

    def hrsqrt(self, x):
        # Reciprocal square root; no dedicated ufunc available
        return np.float16(x ** -0.5)

    def hceil(self, x):
        return np.ceil(x, dtype=np.float16)

    def hfloor(self, x):
        # Fixed: was np.ceil, which rounded in the wrong direction.
        return np.floor(x, dtype=np.float16)

    def hrcp(self, x):
        return np.reciprocal(x, dtype=np.float16)

    def htrunc(self, x):
        return np.trunc(x, dtype=np.float16)

    def hrint(self, x):
        return np.rint(x, dtype=np.float16)

    def heq(self, a, b):
        return a == b

    def hne(self, a, b):
        return a != b

    def hge(self, a, b):
        return a >= b

    def hgt(self, a, b):
        return a > b

    def hle(self, a, b):
        return a <= b

    def hlt(self, a, b):
        return a < b

    def hmax(self, a, b):
        return max(a, b)

    def hmin(self, a, b):
        return min(a, b)
|
||||
|
||||
|
||||
class FakeCUDAModule(object):
    '''
    An instance of this class will be injected into the __globals__ for an
    executing function in order to implement calls to cuda.*. This will
    fail to work correctly if the user code does::

        from numba import cuda as something_else

    In other words, the CUDA module must be called cuda.
    '''

    def __init__(self, grid_dim, block_dim, dynshared_size):
        self.gridDim = Dim3(*grid_dim)
        self.blockDim = Dim3(*block_dim)
        self._cg = FakeCUDACg()
        self._local = FakeCUDALocal()
        self._shared = FakeCUDAShared(dynshared_size)
        self._const = FakeCUDAConst()
        self._atomic = FakeCUDAAtomic()
        self._fp16 = FakeCUDAFp16()
        # Expose the simulated vector types (and their aliases) as
        # attributes of the fake module. This is needed in addition to the
        # module-level variables created in `simulator.__init__.py`,
        # because the test cases need to access the actual cuda module as
        # well as the fake cuda module for vector types.
        for name, svty in vector_types.items():
            setattr(self, name, svty)
            for alias in svty.aliases:
                setattr(self, alias, svty)

    @property
    def cg(self):
        return self._cg

    @property
    def local(self):
        return self._local

    @property
    def shared(self):
        return self._shared

    @property
    def const(self):
        return self._const

    @property
    def atomic(self):
        return self._atomic

    @property
    def fp16(self):
        return self._fp16

    @property
    def threadIdx(self):
        return threading.current_thread().threadIdx

    @property
    def blockIdx(self):
        return threading.current_thread().blockIdx

    @property
    def warpsize(self):
        return 32

    @property
    def laneid(self):
        return threading.current_thread().thread_id % 32

    def syncthreads(self):
        threading.current_thread().syncthreads()

    def threadfence(self):
        pass  # No-op in the simulator

    def threadfence_block(self):
        pass  # No-op in the simulator

    def threadfence_system(self):
        pass  # No-op in the simulator

    def syncthreads_count(self, val):
        return threading.current_thread().syncthreads_count(val)

    def syncthreads_and(self, val):
        return threading.current_thread().syncthreads_and(val)

    def syncthreads_or(self, val):
        return threading.current_thread().syncthreads_or(val)

    def popc(self, val):
        # Population count: number of set bits
        return bin(val).count("1")

    def fma(self, a, b, c):
        return a * b + c

    def cbrt(self, a):
        return a ** (1 / 3)

    def brev(self, val):
        # Reverse the bits of a 32-bit value
        return int('{:032b}'.format(val)[::-1], 2)

    def clz(self, val):
        # Count leading zeros of a 32-bit value
        bits = '{:032b}'.format(val)
        return len(bits) - len(bits.lstrip('0'))

    def ffs(self, val):
        # Position (1-based) of the least significant set bit, or 0 when
        # no bit is set. Counting 32 trailing zeros yields 33 after the
        # +1 adjustment, which the modulus maps back to 0.
        bits = '{:032b}'.format(val)
        return (len(bits) - len(bits.rstrip('0')) + 1) % 33

    def selp(self, a, b, c):
        return b if a else c

    def grid(self, n):
        # Global thread index in each dimension
        dims = [b * d + t for b, d, t in
                zip(self.blockIdx, self.blockDim, self.threadIdx)]
        if n == 1:
            return dims[0]
        if n in (2, 3):
            return tuple(dims[:n])

        raise RuntimeError("Global ID has 1-3 dimensions. %d requested" % n)

    def gridsize(self, n):
        # Total thread count in each dimension
        dims = [b * g for b, g in zip(self.blockDim, self.gridDim)]
        if n == 1:
            return dims[0]
        if n in (2, 3):
            return tuple(dims[:n])

        raise RuntimeError("Global grid has 1-3 dimensions. %d requested"
                           % n)
|
||||
|
||||
|
||||
@contextmanager
def swapped_cuda_module(fn, fake_cuda_module):
    '''
    Context manager that temporarily rebinds every global name in `fn`
    that refers to the real cuda module to `fake_cuda_module`, restoring
    the original bindings on exit.
    '''
    from numba import cuda

    fn_globs = fn.__globals__
    # All global names currently bound to the real "cuda" module
    orig = {k: v for k, v in fn_globs.items() if v is cuda}
    # Rebind each of them to the fake module
    fn_globs.update({k: fake_cuda_module for k in orig})
    try:
        yield
    finally:
        # Restore the original bindings
        fn_globs.update(orig)
|
||||
@@ -0,0 +1,15 @@
|
||||
from functools import reduce as pyreduce
|
||||
|
||||
|
||||
def Reduce(func):
    '''
    Create a simulated reduction kernel from the binary function `func`.

    The returned callable folds `func` over `seq`, starting from `init`.
    When `res` is supplied, the result is stored in res[0] and None is
    returned; otherwise the result is returned directly.
    '''
    def reduce_wrapper(seq, res=None, init=0):
        r = pyreduce(func, seq, init)
        if res is None:
            return r
        res[0] = r
        return None
    return reduce_wrapper


# Alias matching the real CUDA target's API
reduce = Reduce
|
||||
@@ -0,0 +1,60 @@
|
||||
from numba import types
|
||||
from numba.cuda.stubs import _vector_type_stubs
|
||||
|
||||
|
||||
class SimulatedVectorType:
    '''
    Base class for simulated CUDA vector types. Subclasses provide
    `num_elements` and `name`; components are exposed as the attributes
    x, y, z and w (up to num_elements of them).
    '''
    attributes = ['x', 'y', 'z', 'w']

    def __init__(self, *args):
        # Flatten nested vector arguments, e.g. float4(float2(...), a, b)
        args_flattened = []
        for arg in args:
            if isinstance(arg, SimulatedVectorType):
                args_flattened.extend(arg.as_list())
            else:
                args_flattened.append(arg)
        self._attrs = self.attributes[:len(args_flattened)]
        if len(args_flattened) != self.num_elements:
            raise TypeError(
                f"{self.name} expects {self.num_elements}"
                f" elements, got {len(args_flattened)}"
            )

        for value, attr in zip(args_flattened, self._attrs):
            setattr(self, attr, value)

    @property
    def name(self):
        # Subclasses must override with the vector type's name
        raise NotImplementedError()

    @property
    def num_elements(self):
        # Subclasses must override with the component count
        raise NotImplementedError()

    def as_list(self):
        # Components in declaration order
        return [getattr(self, attr) for attr in self._attrs]
|
||||
|
||||
|
||||
def make_simulated_vector_type(num_elements, name):
    '''
    Build a SimulatedVectorType subclass named `name` holding
    `num_elements` float32 components.
    '''
    cls = type(name, (SimulatedVectorType,), {
        "num_elements": num_elements,
        "base_type": types.float32,
        "name": name,
    })
    # The simulated type serves as its own user-facing object
    cls.user_facing_object = cls
    return cls
|
||||
|
||||
|
||||
def _initialize():
    '''
    Build the mapping from vector type names (e.g. "float2") to their
    simulated types, copying each stub's aliases onto the simulated type.
    '''
    simulated = {}
    for stub in _vector_type_stubs:
        # Stub names end with their element count, e.g. "float4" -> 4
        num_elements = int(stub.__name__[-1])
        svty = make_simulated_vector_type(num_elements, stub.__name__)
        svty.aliases = stub.aliases
        simulated[stub.__name__] = svty
    return simulated


vector_types = _initialize()
|
||||
Reference in New Issue
Block a user