@@ -0,0 +1,22 @@
from numba import runtests
from numba.core import config

if config.ENABLE_CUDASIM:
    from .simulator_init import *
else:
    from .device_init import *
    from .device_init import _auto_device

from numba.cuda.compiler import (compile, compile_for_current_device,
                                 compile_ptx, compile_ptx_for_current_device)

# Are we the numba.cuda built into upstream Numba, or the out-of-tree
# NVIDIA-maintained target?
implementation = "Built-in"


def test(*args, **kwargs):
    if not is_available():
        raise cuda_error()

    return runtests.main("numba.cuda.tests", *args, **kwargs)
Binary file not shown.
@@ -0,0 +1,525 @@
"""
APIs that are reported to numba.cuda
"""


import contextlib
import os

import numpy as np

from .cudadrv import devicearray, devices, driver
from numba.core import config
from numba.cuda.api_util import prepare_shape_strides_dtype

# NDarray device helper

require_context = devices.require_context
current_context = devices.get_context
gpus = devices.gpus


@require_context
def from_cuda_array_interface(desc, owner=None, sync=True):
    """Create a DeviceNDArray from a cuda-array-interface description.
    The ``owner`` is the owner of the underlying memory.
    The resulting DeviceNDArray will acquire a reference from it.

    If ``sync`` is ``True``, then the imported stream (if present) will be
    synchronized.
    """
    version = desc.get('version')
    # Mask introduced in version 1
    if 1 <= version:
        mask = desc.get('mask')
        # Would ideally be better to detect if the mask is all valid
        if mask is not None:
            raise NotImplementedError('Masked arrays are not supported')

    shape = desc['shape']
    strides = desc.get('strides')
    dtype = np.dtype(desc['typestr'])

    shape, strides, dtype = prepare_shape_strides_dtype(
        shape, strides, dtype, order='C')
    size = driver.memory_size_from_info(shape, strides, dtype.itemsize)

    devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
    data = driver.MemoryPointer(
        current_context(), devptr, size=size, owner=owner)
    stream_ptr = desc.get('stream', None)
    if stream_ptr is not None:
        stream = external_stream(stream_ptr)
        if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
            stream.synchronize()
    else:
        stream = 0  # No "Numba default stream", not the CUDA default stream
    da = devicearray.DeviceNDArray(shape=shape, strides=strides,
                                   dtype=dtype, gpu_data=data,
                                   stream=stream)
    return da


def as_cuda_array(obj, sync=True):
    """Create a DeviceNDArray from any object that implements
    the :ref:`cuda array interface <cuda-array-interface>`.

    A view of the underlying GPU buffer is created. No copying of the data
    is done. The resulting DeviceNDArray will acquire a reference from `obj`.

    If ``sync`` is ``True``, then the imported stream (if present) will be
    synchronized.
    """
    if not is_cuda_array(obj):
        raise TypeError("*obj* doesn't implement the cuda array interface.")
    else:
        return from_cuda_array_interface(obj.__cuda_array_interface__,
                                         owner=obj, sync=sync)
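
# Usage sketch (CuPy is assumed here, purely as one example producer of
# objects exposing ``__cuda_array_interface__``):
#
#     import cupy as cp
#     from numba import cuda
#
#     c_ary = cp.zeros(8, dtype=cp.float32)  # owns the GPU buffer
#     assert cuda.is_cuda_array(c_ary)
#     d_ary = cuda.as_cuda_array(c_ary)      # zero-copy DeviceNDArray view
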

def is_cuda_array(obj):
    """Test if the object has defined the `__cuda_array_interface__` attribute.

    Does not verify the validity of the interface.
    """
    return hasattr(obj, '__cuda_array_interface__')


def is_float16_supported():
    """Whether 16-bit floats are supported.

    float16 is always supported in current versions of Numba - returns True.
    """
    return True


@require_context
def to_device(obj, stream=0, copy=True, to=None):
    """to_device(obj, stream=0, copy=True, to=None)

    Allocate and transfer a numpy ndarray or structured scalar to the device.

    To copy host->device a numpy array::

        ary = np.arange(10)
        d_ary = cuda.to_device(ary)

    To enqueue the transfer to a stream::

        stream = cuda.stream()
        d_ary = cuda.to_device(ary, stream=stream)

    The resulting ``d_ary`` is a ``DeviceNDArray``.

    To copy device->host::

        hary = d_ary.copy_to_host()

    To copy device->host to an existing array::

        ary = np.empty(shape=d_ary.shape, dtype=d_ary.dtype)
        d_ary.copy_to_host(ary)

    To enqueue the transfer to a stream::

        hary = d_ary.copy_to_host(stream=stream)
    """
    if to is None:
        to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
                                          user_explicit=True)
        return to
    if copy:
        to.copy_to_device(obj, stream=stream)
    return to


@require_context
def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
    """device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)

    Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
    """
    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
                                                        order)
    return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
                                     stream=stream)
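
# Usage sketch: like np.empty, the array contents are uninitialized, so a
# kernel typically fills them before they are copied back.
#
#     d_ary = cuda.device_array((2, 3), dtype=np.float32)
#     # ... launch a kernel that writes to d_ary ...
#     h_ary = d_ary.copy_to_host()
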

@require_context
def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
                  attach_global=True):
    """managed_array(shape, dtype=np.float64, strides=None, order='C',
                     stream=0, attach_global=True)

    Allocate a np.ndarray with a buffer that is managed.
    Similar to np.empty().

    Managed memory is supported on Linux / x86 and PowerPC, and is considered
    experimental on Windows and Linux / AArch64.

    :param attach_global: A flag indicating whether to attach globally. Global
                          attachment implies that the memory is accessible from
                          any stream on any device. If ``False``, attachment is
                          *host*, and memory is only accessible by devices
                          with Compute Capability 6.0 and later.
    """
    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
                                                        order)
    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
    buffer = current_context().memallocmanaged(bytesize,
                                               attach_global=attach_global)
    npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
                       buffer=buffer)
    managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
    managedview.device_setup(buffer, stream=stream)
    return managedview


@require_context
def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
    """pinned_array(shape, dtype=np.float64, strides=None, order='C')

    Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
    (pagelocked). Similar to :func:`np.empty() <numpy.empty>`.
    """
    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
                                                        order)
    bytesize = driver.memory_size_from_info(shape, strides,
                                            dtype.itemsize)
    buffer = current_context().memhostalloc(bytesize)
    return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
                      buffer=buffer)


@require_context
def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
                 portable=False, wc=False):
    """mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
                    portable=False, wc=False)

    Allocate a mapped ndarray with a buffer that is pinned and mapped onto
    the device. Similar to np.empty().

    :param portable: a boolean flag to allow the allocated device memory to be
                     usable in multiple devices.
    :param wc: a boolean flag to enable writecombined allocation which is
               faster to write by the host and to read by the device, but
               slower to read by the host and slower to write by the device.
    """
    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
                                                        order)
    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
    buffer = current_context().memhostalloc(bytesize, mapped=True)
    npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
                       buffer=buffer)
    mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
    mappedview.device_setup(buffer, stream=stream)
    return mappedview


@contextlib.contextmanager
@require_context
def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
    """
    A context manager that opens an IPC *handle* (*CUipcMemHandle*) that is
    represented as a sequence of bytes (e.g. *bytes*, tuple of int)
    and represents it as an array of the given *shape*, *strides* and *dtype*.
    The *strides* can be omitted. In that case, it is assumed to be a 1D
    C contiguous array.

    Yields a device array.

    The IPC handle is closed automatically when the context manager exits.
    """
    dtype = np.dtype(dtype)
    # compute size
    size = np.prod(shape) * dtype.itemsize
    # manually recreate the IPC mem handle
    if driver.USE_NV_BINDING:
        driver_handle = driver.binding.CUipcMemHandle()
        driver_handle.reserved = handle
    else:
        driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle)
    # use *IpcHandle* to open the IPC memory
    ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
    yield ipchandle.open_array(current_context(), shape=shape,
                               strides=strides, dtype=dtype)
    ipchandle.close()


def synchronize():
    "Synchronize the current context."
    return current_context().synchronize()


def _contiguous_strides_like_array(ary):
    """
    Given an array, compute strides for a new contiguous array of the same
    shape.
    """
    # Don't recompute strides if the default strides will be sufficient to
    # create a contiguous array.
    if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
        return None

    # Otherwise, we need to compute new strides using an algorithm adapted from
    # NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
    # core/src/multiarray/ctors.c. We permute the strides in ascending order
    # then compute the stride for the dimensions with the same permutation.

    # Stride permutation. E.g. a stride array (4, -2, 12) becomes
    # [(1, -2), (0, 4), (2, 12)]
    strideperm = [x for x in enumerate(ary.strides)]
    strideperm.sort(key=lambda x: x[1])

    # Compute new strides using permutation
    strides = [0] * len(ary.strides)
    stride = ary.dtype.itemsize
    for i_perm, _ in strideperm:
        strides[i_perm] = stride
        stride *= ary.shape[i_perm]
    return tuple(strides)
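
# Worked example for the permutation above (illustrative):
#
#     a = np.asfortranarray(np.empty((3, 4, 5)))[::2]  # shape (2, 4, 5)
#     a.strides                            # (16, 24, 96) - not contiguous
#     _contiguous_strides_like_array(a)    # (8, 16, 64) - F-like strides,
#                                          # so the layout ordering is kept
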

def _order_like_array(ary):
    if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
        return 'F'
    else:
        return 'C'


def device_array_like(ary, stream=0):
    """
    Call :func:`device_array() <numba.cuda.device_array>` with information from
    the array.
    """
    strides = _contiguous_strides_like_array(ary)
    order = _order_like_array(ary)
    return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
                        order=order, stream=stream)


def mapped_array_like(ary, stream=0, portable=False, wc=False):
    """
    Call :func:`mapped_array() <numba.cuda.mapped_array>` with the information
    from the array.
    """
    strides = _contiguous_strides_like_array(ary)
    order = _order_like_array(ary)
    return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
                        order=order, stream=stream, portable=portable, wc=wc)


def pinned_array_like(ary):
    """
    Call :func:`pinned_array() <numba.cuda.pinned_array>` with the information
    from the array.
    """
    strides = _contiguous_strides_like_array(ary)
    order = _order_like_array(ary)
    return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
                        order=order)


# Stream helper
@require_context
def stream():
    """
    Create a CUDA stream that represents a command queue for the device.
    """
    return current_context().create_stream()
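
# Usage sketch: transfers and kernel launches can be enqueued on a stream and
# run asynchronously with respect to the host.
#
#     s = cuda.stream()
#     d_ary = cuda.to_device(np.arange(10), stream=s)
#     # ... launch kernels with kernel[griddim, blockdim, s](...) ...
#     s.synchronize()  # wait for all work queued on the stream
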

@require_context
def default_stream():
    """
    Get the default CUDA stream. CUDA semantics in general are that the default
    stream is either the legacy default stream or the per-thread default stream
    depending on which CUDA APIs are in use. In Numba, the APIs for the legacy
    default stream are always the ones in use, but an option to use APIs for
    the per-thread default stream may be provided in future.
    """
    return current_context().get_default_stream()


@require_context
def legacy_default_stream():
    """
    Get the legacy default CUDA stream.
    """
    return current_context().get_legacy_default_stream()


@require_context
def per_thread_default_stream():
    """
    Get the per-thread default CUDA stream.
    """
    return current_context().get_per_thread_default_stream()


@require_context
def external_stream(ptr):
    """Create a Numba stream object for a stream allocated outside Numba.

    :param ptr: Pointer to the external stream to wrap in a Numba Stream
    :type ptr: int
    """
    return current_context().create_external_stream(ptr)
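
# Usage sketch (assumes CuPy, as one example source of a raw stream handle):
#
#     import cupy as cp
#
#     cp_stream = cp.cuda.Stream()
#     nb_stream = cuda.external_stream(cp_stream.ptr)
#     d_ary = cuda.to_device(np.arange(10), stream=nb_stream)
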

# Page lock
@require_context
@contextlib.contextmanager
def pinned(*arylist):
    """A context manager for temporarily pinning a sequence of host ndarrays.
    """
    pmlist = []
    for ary in arylist:
        pm = current_context().mempin(ary, driver.host_pointer(ary),
                                      driver.host_memory_size(ary),
                                      mapped=False)
        pmlist.append(pm)
    yield
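
# Usage sketch: pin a host array for the duration of an asynchronous copy.
#
#     ary = np.arange(10)
#     s = cuda.stream()
#     with cuda.pinned(ary):
#         d_ary = cuda.to_device(ary, stream=s)  # async while ary is pinned
#         s.synchronize()
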

@require_context
@contextlib.contextmanager
def mapped(*arylist, **kws):
    """A context manager for temporarily mapping a sequence of host ndarrays.
    """
    assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
    stream = kws.get('stream', 0)
    pmlist = []
    devarylist = []
    for ary in arylist:
        pm = current_context().mempin(ary, driver.host_pointer(ary),
                                      driver.host_memory_size(ary),
                                      mapped=True)
        pmlist.append(pm)
        devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
        devarylist.append(devary)
    try:
        if len(devarylist) == 1:
            yield devarylist[0]
        else:
            yield devarylist
    finally:
        # When exiting from `with cuda.mapped(*arrs) as mapped_arrs:`, the name
        # `mapped_arrs` stays in scope, blocking automatic unmapping based on
        # reference count. We therefore invoke the finalizer manually.
        for pm in pmlist:
            pm.free()


def event(timing=True):
    """
    Create a CUDA event. Timing data is only recorded by the event if it is
    created with ``timing=True``.
    """
    evt = current_context().create_event(timing=timing)
    return evt


event_elapsed_time = driver.event_elapsed_time
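
# Usage sketch: timing device work with a pair of events.
#
#     start = cuda.event(timing=True)
#     end = cuda.event(timing=True)
#     start.record()
#     # ... launch kernel ...
#     end.record()
#     end.synchronize()
#     ms = cuda.event_elapsed_time(start, end)  # elapsed time in milliseconds
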

# Device selection

def select_device(device_id):
    """
    Make the context associated with device *device_id* the current context.

    Returns a Device instance.

    Raises an exception on error.
    """
    context = devices.get_context(device_id)
    return context.device


def get_current_device():
    "Get the current device associated with the current thread"
    return current_context().device


def list_devices():
    "Return a list of all detected devices"
    return devices.gpus


def close():
    """
    Explicitly clears all contexts in the current thread, and destroys all
    contexts if the current thread is the main thread.
    """
    devices.reset()


def _auto_device(ary, stream=0, copy=True):
    return devicearray.auto_device(ary, stream=stream, copy=copy)


def detect():
    """
    Detect supported CUDA hardware and print a summary of the detected hardware.

    Returns a boolean indicating whether any supported devices were detected.
    """
    devlist = list_devices()
    print('Found %d CUDA devices' % len(devlist))
    supported_count = 0
    for dev in devlist:
        attrs = []
        cc = dev.compute_capability
        kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
        tcc = dev.TCC_DRIVER
        fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
        attrs += [('Compute Capability', '%d.%d' % cc)]
        attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
        attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
        attrs += [('UUID', dev.uuid)]
        attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
        if os.name == "nt":
            attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
        attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
        if cc < (3, 5):
            support = '[NOT SUPPORTED: CC < 3.5]'
        elif cc < (5, 0):
            support = '[SUPPORTED (DEPRECATED)]'
            supported_count += 1
        else:
            support = '[SUPPORTED]'
            supported_count += 1

        print('id %d %20s %40s' % (dev.id, dev.name, support))
        for key, val in attrs:
            print('%40s: %s' % (key, val))

    print('Summary:')
    print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
    return supported_count > 0


@contextlib.contextmanager
def defer_cleanup():
    """
    Temporarily disable memory deallocation.
    Use this to prevent resource deallocation from breaking asynchronous
    execution.

    For example::

        with defer_cleanup():
            # all cleanup is deferred in here
            do_speed_critical_code()
        # cleanup can occur here

    Note: this context manager can be nested.
    """
    with current_context().defer_cleanup():
        yield


profiling = require_context(driver.profiling)
profile_start = require_context(driver.profile_start)
profile_stop = require_context(driver.profile_stop)
@@ -0,0 +1,30 @@
import numpy as np


def prepare_shape_strides_dtype(shape, strides, dtype, order):
    dtype = np.dtype(dtype)
    if isinstance(shape, int):
        shape = (shape,)
    if isinstance(strides, int):
        strides = (strides,)
    else:
        strides = strides or _fill_stride_by_order(shape, dtype, order)
    return shape, strides, dtype


def _fill_stride_by_order(shape, dtype, order):
    nd = len(shape)
    if nd == 0:
        return ()
    strides = [0] * nd
    if order == 'C':
        strides[-1] = dtype.itemsize
        for d in reversed(range(nd - 1)):
            strides[d] = strides[d + 1] * shape[d + 1]
    elif order == 'F':
        strides[0] = dtype.itemsize
        for d in range(1, nd):
            strides[d] = strides[d - 1] * shape[d - 1]
    else:
        raise ValueError('must be either C/F order')
    return tuple(strides)
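
# Worked example: for shape (2, 3, 4) and float64 (itemsize 8),
#
#     _fill_stride_by_order((2, 3, 4), np.dtype(np.float64), 'C')
#     # -> (96, 32, 8): the last axis varies fastest
#     _fill_stride_by_order((2, 3, 4), np.dtype(np.float64), 'F')
#     # -> (8, 16, 48): the first axis varies fastest
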
@@ -0,0 +1,77 @@
"""
Hints to wrap Kernel arguments to indicate how to manage host-device
memory transfers before & after the kernel call.
"""
import abc

from numba.core.typing.typeof import typeof, Purpose


class ArgHint(metaclass=abc.ABCMeta):
    def __init__(self, value):
        self.value = value

    @abc.abstractmethod
    def to_device(self, retr, stream=0):
        """
        :param stream: a stream to use when copying data
        :param retr:
            a list of clean-up work to do after the kernel's been run.
            Append 0-arg lambdas to it!
        :return: a value (usually a `DeviceNDArray`) to be passed to
                 the kernel
        """
        pass

    @property
    def _numba_type_(self):
        return typeof(self.value, Purpose.argument)


class In(ArgHint):
    def to_device(self, retr, stream=0):
        from .cudadrv.devicearray import auto_device
        devary, _ = auto_device(
            self.value,
            stream=stream)
        # A dummy writeback functor to keep devary alive until the kernel
        # is called.
        retr.append(lambda: devary)
        return devary


class Out(ArgHint):
    def to_device(self, retr, stream=0):
        from .cudadrv.devicearray import auto_device
        devary, conv = auto_device(
            self.value,
            copy=False,
            stream=stream)
        if conv:
            retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
        return devary


class InOut(ArgHint):
    def to_device(self, retr, stream=0):
        from .cudadrv.devicearray import auto_device
        devary, conv = auto_device(
            self.value,
            stream=stream)
        if conv:
            retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
        return devary


def wrap_arg(value, default=InOut):
    return value if isinstance(value, ArgHint) else default(value)


__all__ = [
    'In',
    'Out',
    'InOut',

    'ArgHint',
    'wrap_arg',
]
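
# Usage sketch: hints control transfers at kernel launch. ``In`` copies
# host->device only, ``Out`` copies back device->host after the call, and
# ``InOut`` (the default applied by ``wrap_arg``) does both.
#
#     from numba import cuda
#     import numpy as np
#
#     @cuda.jit
#     def double(inp, out):
#         i = cuda.grid(1)
#         if i < inp.size:
#             out[i] = inp[i] * 2
#
#     a = np.arange(10)
#     b = np.zeros_like(a)
#     double[1, 32](cuda.In(a), cuda.Out(b))  # only b is copied back
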
@@ -0,0 +1,62 @@
from numba.core import types
from numba.core.extending import overload, overload_method
from numba.core.typing import signature
from numba.cuda import nvvmutils
from numba.cuda.extending import intrinsic
from numba.cuda.types import grid_group, GridGroup as GridGroupClass


class GridGroup:
    """A cooperative group representing the entire grid"""

    def sync() -> None:
        """Synchronize this grid group"""


def this_grid() -> GridGroup:
    """Get the current grid group."""
    return GridGroup()


@intrinsic
def _this_grid(typingctx):
    sig = signature(grid_group)

    def codegen(context, builder, sig, args):
        one = context.get_constant(types.int32, 1)
        mod = builder.module
        return builder.call(
            nvvmutils.declare_cudaCGGetIntrinsicHandle(mod),
            (one,))

    return sig, codegen


@overload(this_grid, target='cuda')
def _ol_this_grid():
    def impl():
        return _this_grid()

    return impl


@intrinsic
def _grid_group_sync(typingctx, group):
    sig = signature(types.int32, group)

    def codegen(context, builder, sig, args):
        flags = context.get_constant(types.int32, 0)
        mod = builder.module
        return builder.call(
            nvvmutils.declare_cudaCGSynchronize(mod),
            (*args, flags))

    return sig, codegen


@overload_method(GridGroupClass, 'sync', target='cuda')
def _ol_grid_group_sync(group):
    def impl(group):
        return _grid_group_sync(group)

    return impl
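
# Usage sketch: grid-wide synchronization from within a kernel. A cooperative
# launch is required, so the grid must be sized such that all blocks can be
# resident simultaneously (device and toolkit support permitting).
#
#     from numba import cuda
#
#     @cuda.jit
#     def phased(data):
#         grid = cuda.cg.this_grid()
#         # ... phase 1: write data ...
#         grid.sync()  # every thread in the whole grid reaches this point
#         # ... phase 2: read data written by other blocks ...
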
@@ -0,0 +1,378 @@
from llvmlite import ir

from numba.core import config, serialize
from numba.core.codegen import Codegen, CodeLibrary
from .cudadrv import devices, driver, nvvm, runtime
from numba.cuda.cudadrv.libs import get_cudalib

import os
import subprocess
import tempfile


CUDA_TRIPLE = 'nvptx64-nvidia-cuda'


def run_nvdisasm(cubin, flags):
    # nvdisasm only accepts input from a file, so we need to write out to a
    # temp file and clean up afterwards.
    fd = None
    fname = None
    try:
        fd, fname = tempfile.mkstemp()
        with open(fname, 'wb') as f:
            f.write(cubin)

        try:
            cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        except FileNotFoundError as e:
            msg = ("nvdisasm has not been found. You may need "
                   "to install the CUDA toolkit and ensure that "
                   "it is available on your PATH.\n")
            raise RuntimeError(msg) from e
        return cp.stdout.decode('utf-8')
    finally:
        if fd is not None:
            os.close(fd)
        if fname is not None:
            os.unlink(fname)


def disassemble_cubin(cubin):
    # Request lineinfo in disassembly
    flags = ['-gi']
    return run_nvdisasm(cubin, flags)


def disassemble_cubin_for_cfg(cubin):
    # Request control flow graph in disassembly
    flags = ['-cfg']
    return run_nvdisasm(cubin, flags)


class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
    """
    The CUDACodeLibrary generates PTX, SASS, and cubins for multiple different
    compute capabilities. It also loads cubins to multiple devices (via
    get_cufunc), which may be of different compute capabilities.
    """

    def __init__(self, codegen, name, entry_name=None, max_registers=None,
                 nvvm_options=None):
        """
        codegen:
            Codegen object.
        name:
            Name of the function in the source.
        entry_name:
            Name of the kernel function in the binary, if this is a global
            kernel and not a device function.
        max_registers:
            The maximum register usage to aim for when linking.
        nvvm_options:
            Dict of options to pass to NVVM.
        """
        super().__init__(codegen, name)

        # The llvmlite module for this library.
        self._module = None
        # CodeLibrary objects that will be "linked" into this library. The
        # modules within them are compiled from NVVM IR to PTX along with the
        # IR from this module - in that sense they are "linked" by NVVM at PTX
        # generation time, rather than at link time.
        self._linking_libraries = set()
        # Files to link with the generated PTX. These are linked using the
        # Driver API at link time.
        self._linking_files = set()
        # Should we link libcudadevrt?
        self.needs_cudadevrt = False

        # Cache the LLVM IR string
        self._llvm_strs = None
        # Maps CC -> PTX string
        self._ptx_cache = {}
        # Maps CC -> LTO-IR
        self._ltoir_cache = {}
        # Maps CC -> cubin
        self._cubin_cache = {}
        # Maps CC -> linker info output for cubin
        self._linkerinfo_cache = {}
        # Maps Device numeric ID -> cufunc
        self._cufunc_cache = {}

        self._max_registers = max_registers
        if nvvm_options is None:
            nvvm_options = {}
        self._nvvm_options = nvvm_options
        self._entry_name = entry_name

    @property
    def llvm_strs(self):
        if self._llvm_strs is None:
            self._llvm_strs = [str(mod) for mod in self.modules]
        return self._llvm_strs

    def get_llvm_str(self):
        return "\n\n".join(self.llvm_strs)

    def _ensure_cc(self, cc):
        if cc is not None:
            return cc

        device = devices.get_context().device
        return device.compute_capability

    def get_asm_str(self, cc=None):
        cc = self._ensure_cc(cc)

        ptxes = self._ptx_cache.get(cc, None)
        if ptxes:
            return ptxes

        arch = nvvm.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options['arch'] = arch

        irs = self.llvm_strs

        ptx = nvvm.compile_ir(irs, **options)

        # Sometimes the result from NVVM contains trailing whitespace and
        # nulls, which we strip so that the assembly dump looks a little
        # tidier.
        ptx = ptx.decode().strip('\x00').strip()

        if config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % self._name).center(80, '-'))
            print(ptx)
            print('=' * 80)

        self._ptx_cache[cc] = ptx

        return ptx

    def get_ltoir(self, cc=None):
        cc = self._ensure_cc(cc)

        ltoir = self._ltoir_cache.get(cc, None)
        if ltoir is not None:
            return ltoir

        arch = nvvm.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options['arch'] = arch
        options['gen-lto'] = None

        irs = self.llvm_strs
        ltoir = nvvm.compile_ir(irs, **options)
        self._ltoir_cache[cc] = ltoir

        return ltoir

    def get_cubin(self, cc=None):
        cc = self._ensure_cc(cc)

        cubin = self._cubin_cache.get(cc, None)
        if cubin:
            return cubin

        linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)

        if linker.lto:
            ltoir = self.get_ltoir(cc=cc)
            linker.add_ltoir(ltoir)
        else:
            ptx = self.get_asm_str(cc=cc)
            linker.add_ptx(ptx.encode())

        for path in self._linking_files:
            linker.add_file_guess_ext(path)
        if self.needs_cudadevrt:
            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))

        cubin = linker.complete()
        self._cubin_cache[cc] = cubin
        self._linkerinfo_cache[cc] = linker.info_log

        return cubin

    def get_cufunc(self):
        if self._entry_name is None:
            msg = "Missing entry_name - are you trying to get the cufunc " \
                  "for a device function?"
            raise RuntimeError(msg)

        ctx = devices.get_context()
        device = ctx.device

        cufunc = self._cufunc_cache.get(device.id, None)
        if cufunc:
            return cufunc

        cubin = self.get_cubin(cc=device.compute_capability)
        module = ctx.create_module_image(cubin)

        # Load
        cufunc = module.get_function(self._entry_name)

        # Populate caches
        self._cufunc_cache[device.id] = cufunc

        return cufunc

    def get_linkerinfo(self, cc):
        try:
            return self._linkerinfo_cache[cc]
        except KeyError:
            raise KeyError(f'No linkerinfo for CC {cc}')

    def get_sass(self, cc=None):
        return disassemble_cubin(self.get_cubin(cc=cc))

    def get_sass_cfg(self, cc=None):
        return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))

    def add_ir_module(self, mod):
        self._raise_if_finalized()
        if self._module is not None:
            raise RuntimeError('CUDACodeLibrary only supports one module')
        self._module = mod

    def add_linking_library(self, library):
        library._ensure_finalized()

        # We don't want to allow linking more libraries in after finalization
        # because our linked libraries are modified by the finalization, and we
        # won't be able to finalize again after adding new ones
        self._raise_if_finalized()

        self._linking_libraries.add(library)

    def add_linking_file(self, filepath):
        self._linking_files.add(filepath)

    def get_function(self, name):
        for fn in self._module.functions:
            if fn.name == name:
                return fn
        raise KeyError(f'Function {name} not found')

    @property
    def modules(self):
        return [self._module] + [mod for lib in self._linking_libraries
                                 for mod in lib.modules]

    @property
    def linking_libraries(self):
        # Libraries we link to may link to other libraries, so we recursively
        # traverse the linking libraries property to build up a list of all
        # linked libraries.
        libs = []
        for lib in self._linking_libraries:
            libs.extend(lib.linking_libraries)
            libs.append(lib)
        return libs

    def finalize(self):
        # Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
        # we only adjust the linkage of functions. Global kernels (with
        # external linkage) have their linkage untouched. Device functions are
        # set linkonce_odr to prevent them appearing in the PTX.

        self._raise_if_finalized()

        # Note in-place modification of the linkage of functions in linked
        # libraries. This presently causes no issues as only device functions
        # are shared across code libraries, so they would always need their
        # linkage set to linkonce_odr. If in a future scenario some code
        # libraries require linkonce_odr linkage of functions in linked
        # modules, and another code library requires another linkage, each code
        # library will need to take its own private copy of its linked modules.
        #
        # See also discussion on PR #890:
        # https://github.com/numba/numba/pull/890
        for library in self._linking_libraries:
            for mod in library.modules:
                for fn in mod.functions:
                    if not fn.is_declaration:
                        fn.linkage = 'linkonce_odr'

        self._finalized = True

    def _reduce_states(self):
        """
        Reduce the instance for serialization. We retain the PTX and cubins,
        but loaded functions are discarded. They are recreated when needed
        after deserialization.
        """
        if self._linking_files:
            msg = 'Cannot pickle CUDACodeLibrary with linking files'
            raise RuntimeError(msg)
        if not self._finalized:
            raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
        return dict(
            codegen=None,
            name=self.name,
            entry_name=self._entry_name,
            llvm_strs=self.llvm_strs,
            ptx_cache=self._ptx_cache,
            cubin_cache=self._cubin_cache,
            linkerinfo_cache=self._linkerinfo_cache,
            max_registers=self._max_registers,
            nvvm_options=self._nvvm_options,
            needs_cudadevrt=self.needs_cudadevrt
        )

    @classmethod
    def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
                 cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
                 needs_cudadevrt):
        """
        Rebuild an instance.
        """
        instance = cls(codegen, name, entry_name=entry_name)

        instance._llvm_strs = llvm_strs
        instance._ptx_cache = ptx_cache
        instance._cubin_cache = cubin_cache
        instance._linkerinfo_cache = linkerinfo_cache

        instance._max_registers = max_registers
        instance._nvvm_options = nvvm_options
        instance.needs_cudadevrt = needs_cudadevrt

        instance._finalized = True

        return instance


class JITCUDACodegen(Codegen):
    """
    This codegen implementation for CUDA only generates optimized LLVM IR.
    Generation of PTX code is done separately (see numba.cuda.compiler).
    """

    _library_class = CUDACodeLibrary

    def __init__(self, module_name):
        pass

    def _create_empty_module(self, name):
        ir_module = ir.Module(name)
        ir_module.triple = CUDA_TRIPLE
        ir_module.data_layout = nvvm.NVVM().data_layout
        nvvm.add_ir_version(ir_module)
        return ir_module

    def _add_module(self, module):
        pass

    def magic_tuple(self):
        """
        Return a tuple unambiguously describing the codegen behaviour.
        """
        ctx = devices.get_context()
        cc = ctx.device.compute_capability
        return (runtime.runtime.get_version(), cc)
@@ -0,0 +1,422 @@
from llvmlite import ir
from numba.core.typing.templates import ConcreteTemplate
from numba.core import types, typing, funcdesc, config, compiler, sigutils
from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
                                 DefaultPassBuilder, Flags, Option,
                                 CompileResult)
from numba.core.compiler_lock import global_compiler_lock
from numba.core.compiler_machinery import (LoweringPass,
                                           PassManager, register_pass)
from numba.core.errors import NumbaInvalidConfigWarning
from numba.core.typed_passes import (IRLegalization, NativeLowering,
                                     AnnotateTypes)
from warnings import warn
from numba.cuda.api import get_current_device
from numba.cuda.target import CUDACABICallConv


def _nvvm_options_type(x):
    if x is None:
        return None
    else:
        assert isinstance(x, dict)
        return x


class CUDAFlags(Flags):
    nvvm_options = Option(
        type=_nvvm_options_type,
        default=None,
        doc="NVVM options",
    )
    compute_capability = Option(
        type=tuple,
        default=None,
        doc="Compute Capability",
    )


# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
# id. This is because the entry point is used as a key into a dict of
# overloads by the base dispatcher. The id of the CCR is the only small and
# unique property of a CompileResult in the CUDA target (cf. the CPU target,
# which uses its entry_point, which is a pointer value).
#
# This does feel a little hackish, and there are two ways in which this could
# be improved:
#
# 1. We could change the core of Numba so that each CompileResult has its own
#    unique ID that can be used as a key - e.g. a count, similar to the way in
#    which types have unique counts.
# 2. At some future time when kernel launch uses a compiled function, the entry
#    point will no longer need to be a synthetic value, but will instead be a
#    pointer to the compiled function as in the CPU target.

class CUDACompileResult(CompileResult):
    @property
    def entry_point(self):
        return id(self)


def cuda_compile_result(**entries):
    entries = sanitize_compile_result_entries(entries)
    return CUDACompileResult(**entries)


@register_pass(mutates_CFG=True, analysis_only=False)
class CUDABackend(LoweringPass):

    _name = "cuda_backend"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """
        Back-end: Packages lowering output in a compile result
        """
        lowered = state['cr']
        signature = typing.signature(state.return_type, *state.args)

        state.cr = cuda_compile_result(
            typing_context=state.typingctx,
            target_context=state.targetctx,
            typing_error=state.status.fail_reason,
            type_annotation=state.type_annotation,
            library=state.library,
            call_helper=lowered.call_helper,
            signature=signature,
            fndesc=lowered.fndesc,
        )
        return True


@register_pass(mutates_CFG=False, analysis_only=False)
class CreateLibrary(LoweringPass):
    """
    Create a CUDACodeLibrary for the NativeLowering pass to populate. The
    NativeLowering pass will create a code library if none exists, but we need
    to set it up with nvvm_options from the flags if they are present.
    """

    _name = "create_library"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        codegen = state.targetctx.codegen()
        name = state.func_id.func_qualname
        nvvm_options = state.flags.nvvm_options
        state.library = codegen.create_library(name, nvvm_options=nvvm_options)
        # Enable object caching upfront so that the library can be serialized.
        state.library.enable_object_caching()

        return True


class CUDACompiler(CompilerBase):
    def define_pipelines(self):
        dpb = DefaultPassBuilder
        pm = PassManager('cuda')

        untyped_passes = dpb.define_untyped_pipeline(self.state)
        pm.passes.extend(untyped_passes.passes)

        typed_passes = dpb.define_typed_pipeline(self.state)
        pm.passes.extend(typed_passes.passes)

        lowering_passes = self.define_cuda_lowering_pipeline(self.state)
        pm.passes.extend(lowering_passes.passes)

        pm.finalize()
        return [pm]

    def define_cuda_lowering_pipeline(self, state):
        pm = PassManager('cuda_lowering')
        # legalise
        pm.add_pass(IRLegalization,
                    "ensure IR is legal prior to lowering")
        pm.add_pass(AnnotateTypes, "annotate types")

        # lower
        pm.add_pass(CreateLibrary, "create library")
        pm.add_pass(NativeLowering, "native lowering")
        pm.add_pass(CUDABackend, "cuda backend")

        pm.finalize()
        return pm


@global_compiler_lock
def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                 inline=False, fastmath=False, nvvm_options=None,
                 cc=None):
    if cc is None:
        raise ValueError('Compute Capability must be supplied')

    from .descriptor import cuda_target
    typingctx = cuda_target.typing_context
    targetctx = cuda_target.target_context

    flags = CUDAFlags()
    # Do not compile (generate native code), just lower (to LLVM)
    flags.no_compile = True
    flags.no_cpython_wrapper = True
    flags.no_cfunc_wrapper = True

    # Both debug and lineinfo turn on debug information in the compiled code,
    # but we keep them separate arguments in case we later want to overload
    # some other behavior on the debug flag. In particular, -opt=3 is not
    # supported with debug enabled, and enabling only lineinfo should not
    # affect the error model.
    if debug or lineinfo:
        flags.debuginfo = True

    if lineinfo:
        flags.dbg_directives_only = True

    if debug:
        flags.error_model = 'python'
    else:
        flags.error_model = 'numpy'

    if inline:
        flags.forceinline = True
    if fastmath:
        flags.fastmath = True
    if nvvm_options:
        flags.nvvm_options = nvvm_options
    flags.compute_capability = cc

    # Run compilation pipeline
    from numba.core.target_extension import target_override
    with target_override('cuda'):
        cres = compiler.compile_extra(typingctx=typingctx,
                                      targetctx=targetctx,
                                      func=pyfunc,
                                      args=args,
                                      return_type=return_type,
                                      flags=flags,
                                      locals={},
                                      pipeline_class=CUDACompiler)

    library = cres.library
    library.finalize()

    return cres


def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
                       nvvm_options):
    """
    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.

    The C ABI wrapper will have the same name as the source Python function.
    """
    # The wrapper will be contained in a new library that links to the wrapped
    # function's library
    library = lib.codegen.create_library(f'{lib.name}_function_',
                                         entry_name=wrapper_function_name,
                                         nvvm_options=nvvm_options)
    library.add_linking_library(lib)

    # Determine the caller (C ABI) and wrapper (Numba ABI) function types
    argtypes = fndesc.argtypes
    restype = fndesc.restype
    c_call_conv = CUDACABICallConv(context)
    wrapfnty = c_call_conv.get_function_type(restype, argtypes)
    fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)

    # Create a new module and declare the callee
    wrapper_module = context.create_module("cuda.cabi.wrapper")
    func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)

    # Define the caller - populate it with a call to the callee and return
    # its return value

    wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
    builder = ir.IRBuilder(wrapfn.append_basic_block(''))

    arginfo = context.get_arg_packer(argtypes)
    callargs = arginfo.from_arguments(builder, wrapfn.args)
    # We get (status, return_value), but we ignore the status since we
    # can't propagate it through the C ABI anyway
    _, return_value = context.call_conv.call_function(
        builder, func, restype, argtypes, callargs)
    builder.ret(return_value)

    library.add_ir_module(wrapper_module)
    library.finalize()
    return library


@global_compiler_lock
def compile(pyfunc, sig, debug=False, lineinfo=False, device=True,
            fastmath=False, cc=None, opt=True, abi="c", abi_info=None,
            output='ptx'):
    """Compile a Python function to PTX or LTO-IR for a given set of argument
    types.

    :param pyfunc: The Python function to compile.
    :param sig: The signature representing the function's input and output
                types. If this is a tuple of argument types without a return
                type, the inferred return type is returned by this function. If
                a signature including a return type is passed, the compiled
                code will include a cast from the inferred return type to the
                specified return type, and this function will return the
                specified return type.
    :param debug: Whether to include debug info in the compiled code.
    :type debug: bool
    :param lineinfo: Whether to include a line mapping from the compiled code
                     to the source code. Usually this is used with optimized
                     code (since debug mode would automatically include this),
                     so we want debug info in the LLVM IR but only the line
                     mapping in the final output.
    :type lineinfo: bool
    :param device: Whether to compile a device function.
    :type device: bool
    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
                     prec_div=0, and fma=1)
    :type fastmath: bool
    :param cc: Compute capability to compile for, as a tuple
               ``(MAJOR, MINOR)``. Defaults to ``(5, 0)``.
    :type cc: tuple
    :param opt: Enable optimizations. Defaults to ``True``.
    :type opt: bool
    :param abi: The ABI for a compiled function - either ``"numba"`` or
                ``"c"``. Note that the Numba ABI is not considered stable.
                The C ABI is only supported for device functions at present.
    :type abi: str
    :param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
                     one option, ``"abi_name"``, for providing the wrapper
                     function's name. The ``"numba"`` ABI has no options.
    :type abi_info: dict
    :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
    :type output: str
    :return: (code, resty): The compiled code and inferred return type
    :rtype: tuple
    """
    if abi not in ("numba", "c"):
        raise NotImplementedError(f'Unsupported ABI: {abi}')

    if abi == 'c' and not device:
        raise NotImplementedError('The C ABI is not supported for kernels')

    if output not in ("ptx", "ltoir"):
        raise NotImplementedError(f'Unsupported output type: {output}')

    if debug and opt:
        msg = ("debug=True with opt=True (the default) "
               "is not supported by CUDA. This may result in a crash"
               " - set debug=False or opt=False.")
        warn(NumbaInvalidConfigWarning(msg))

    lto = (output == 'ltoir')
    abi_info = abi_info or dict()

    nvvm_options = {
        'fastmath': fastmath,
        'opt': 3 if opt else 0
    }

    if lto:
        nvvm_options['gen-lto'] = None

    args, return_type = sigutils.normalize_signature(sig)

    cc = cc or config.CUDA_DEFAULT_PTX_CC
    cres = compile_cuda(pyfunc, return_type, args, debug=debug,
                        lineinfo=lineinfo, fastmath=fastmath,
                        nvvm_options=nvvm_options, cc=cc)
    resty = cres.signature.return_type

    if resty and not device and resty != types.void:
        raise TypeError("CUDA kernel must have void return type.")

    tgt = cres.target_context

    if device:
        lib = cres.library
        if abi == "c":
            wrapper_name = abi_info.get('abi_name', pyfunc.__name__)
            lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
                                     nvvm_options)
    else:
        code = pyfunc.__code__
        filename = code.co_filename
        linenum = code.co_firstlineno

        lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
                                              lineinfo, nvvm_options, filename,
                                              linenum)

    if lto:
        code = lib.get_ltoir(cc=cc)
    else:
        code = lib.get_asm_str(cc=cc)
    return code, resty
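
# Usage sketch for the compilation APIs here: compile a kernel to PTX with
# ``compile_ptx`` (defined below), which defaults to the Numba ABI and a
# kernel (device=False).
#
#     from numba import cuda, float32
#
#     def axpy(r, a, x, y):
#         i = cuda.grid(1)
#         if i < r.size:
#             r[i] = a * x[i] + y[i]
#
#     sig = (float32[:], float32, float32[:], float32[:])
#     ptx, resty = compile_ptx(axpy, sig, cc=(7, 5))
#     # ptx is the PTX source text; resty is the inferred return type
#     # (void for a kernel)
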

def compile_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
                               device=True, fastmath=False, opt=True,
                               abi="c", abi_info=None, output='ptx'):
    """Compile a Python function to PTX or LTO-IR for a given signature for the
    current device's compute capability. This calls :func:`compile` with an
    appropriate ``cc`` value for the current device."""
    cc = get_current_device().compute_capability
    return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
                   fastmath=fastmath, cc=cc, opt=opt, abi=abi,
                   abi_info=abi_info, output=output)


def compile_ptx(pyfunc, sig, debug=False, lineinfo=False, device=False,
                fastmath=False, cc=None, opt=True, abi="numba", abi_info=None):
    """Compile a Python function to PTX for a given signature. See
    :func:`compile`. The defaults for this function are to compile a kernel
    with the Numba ABI, rather than :func:`compile`'s default of compiling a
    device function with the C ABI."""
    return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
                   fastmath=fastmath, cc=cc, opt=opt, abi=abi,
                   abi_info=abi_info, output='ptx')


def compile_ptx_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
                                   device=False, fastmath=False, opt=True,
                                   abi="numba", abi_info=None):
    """Compile a Python function to PTX for a given signature for the current
    device's compute capability. See :func:`compile_ptx`."""
    cc = get_current_device().compute_capability
    return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo,
                       device=device, fastmath=fastmath, cc=cc, opt=opt,
                       abi=abi, abi_info=abi_info)


def declare_device_function(name, restype, argtypes):
    return declare_device_function_template(name, restype, argtypes).key


def declare_device_function_template(name, restype, argtypes):
    from .descriptor import cuda_target
    typingctx = cuda_target.typing_context
    targetctx = cuda_target.target_context
    sig = typing.signature(restype, *argtypes)
    extfn = ExternFunction(name, sig)

    class device_function_template(ConcreteTemplate):
        key = extfn
        cases = [sig]

    fndesc = funcdesc.ExternalFunctionDescriptor(
        name=name, restype=restype, argtypes=argtypes)
    typingctx.insert_user_function(extfn, device_function_template)
    targetctx.insert_user_function(extfn, fndesc)

    return device_function_template


class ExternFunction(object):
    def __init__(self, name, sig):
        self.name = name
        self.sig = sig
@@ -0,0 +1,47 @@
#include "cuda_fp16.h"

#define FNDEF(fname) __numba_wrapper_ ## fname

#define UNARY_FUNCTION(fname) extern "C" __device__ int \
FNDEF(fname)( \
    short* return_value, \
    short x \
) \
{ \
    __half retval = fname(__short_as_half (x)); \
 \
    *return_value = __half_as_short (retval); \
    /* Signal that no Python exception occurred */ \
    return 0; \
} \

extern "C" __device__ int
FNDEF(hdiv)(
    short* return_value,
    short x,
    short y
)
{
    __half retval = __hdiv(__short_as_half (x), __short_as_half (y));

    *return_value = __half_as_short (retval);
    // Signal that no Python exception occurred
    return 0;
}

UNARY_FUNCTION(hsin)
UNARY_FUNCTION(hcos)
UNARY_FUNCTION(hlog)
UNARY_FUNCTION(hlog10)
UNARY_FUNCTION(hlog2)
UNARY_FUNCTION(hexp)
UNARY_FUNCTION(hexp10)
UNARY_FUNCTION(hexp2)
UNARY_FUNCTION(hsqrt)
UNARY_FUNCTION(hrsqrt)
UNARY_FUNCTION(hfloor)
UNARY_FUNCTION(hceil)
UNARY_FUNCTION(hrcp)
UNARY_FUNCTION(hrint)
UNARY_FUNCTION(htrunc)
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,258 @@
import sys
import re
import os
from collections import namedtuple

from numba.core.config import IS_WIN32
from numba.misc.findlib import find_lib, find_file


_env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])


def _find_valid_path(options):
    """Find a valid path from *options*, which is a list of 2-tuples of
    (name, path). Return the first pair where *path* is not None.
    If no valid path is found, return ``('<unknown>', None)``.
    """
    for by, data in options:
        if data is not None:
            return by, data
    else:
        return '<unknown>', None
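
# Example: the first option with a non-None path wins (paths hypothetical).
#
#     _find_valid_path([('Conda environment', None),
#                       ('CUDA_HOME', '/opt/cuda/nvvm/lib64')])
#     # -> ('CUDA_HOME', '/opt/cuda/nvvm/lib64')
#     _find_valid_path([])
#     # -> ('<unknown>', None)
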
|
||||
|
||||
def _get_libdevice_path_decision():
|
||||
options = [
|
||||
('Conda environment', get_conda_ctk()),
|
||||
('Conda environment (NVIDIA package)', get_nvidia_libdevice_ctk()),
|
||||
('CUDA_HOME', get_cuda_home('nvvm', 'libdevice')),
|
||||
('System', get_system_ctk('nvvm', 'libdevice')),
|
||||
('Debian package', get_debian_pkg_libdevice()),
|
||||
]
|
||||
by, libdir = _find_valid_path(options)
|
||||
return by, libdir
|
||||
|
||||
|
||||
def _nvvm_lib_dir():
|
||||
if IS_WIN32:
|
||||
return 'nvvm', 'bin'
|
||||
else:
|
||||
return 'nvvm', 'lib64'
|
||||
|
||||
|
||||
def _get_nvvm_path_decision():
|
||||
options = [
|
||||
('Conda environment', get_conda_ctk()),
|
||||
('Conda environment (NVIDIA package)', get_nvidia_nvvm_ctk()),
|
||||
('CUDA_HOME', get_cuda_home(*_nvvm_lib_dir())),
|
||||
('System', get_system_ctk(*_nvvm_lib_dir())),
|
||||
]
|
||||
by, path = _find_valid_path(options)
|
||||
return by, path
|
||||
|
||||
|
||||
def _get_libdevice_paths():
|
||||
by, libdir = _get_libdevice_path_decision()
|
||||
# Search for pattern
|
||||
pat = r'libdevice(\.\d+)*\.bc$'
|
||||
candidates = find_file(re.compile(pat), libdir)
|
||||
# Keep only the max (most recent version) of the bitcode files.
|
||||
out = max(candidates, default=None)
|
||||
return _env_path_tuple(by, out)
|
||||
|
||||
|
||||


def _cudalib_path():
    if IS_WIN32:
        return 'bin'
    else:
        return 'lib64'


def _cuda_home_static_cudalib_path():
    if IS_WIN32:
        return ('lib', 'x64')
    else:
        return ('lib64',)


def _get_cudalib_dir_path_decision():
    options = [
        ('Conda environment', get_conda_ctk()),
        ('Conda environment (NVIDIA package)', get_nvidia_cudalib_ctk()),
        ('CUDA_HOME', get_cuda_home(_cudalib_path())),
        ('System', get_system_ctk(_cudalib_path())),
    ]
    by, libdir = _find_valid_path(options)
    return by, libdir


def _get_static_cudalib_dir_path_decision():
    options = [
        ('Conda environment', get_conda_ctk()),
        ('Conda environment (NVIDIA package)', get_nvidia_static_cudalib_ctk()),
        ('CUDA_HOME', get_cuda_home(*_cuda_home_static_cudalib_path())),
        ('System', get_system_ctk(_cudalib_path())),
    ]
    by, libdir = _find_valid_path(options)
    return by, libdir


def _get_cudalib_dir():
    by, libdir = _get_cudalib_dir_path_decision()
    return _env_path_tuple(by, libdir)


def _get_static_cudalib_dir():
    by, libdir = _get_static_cudalib_dir_path_decision()
    return _env_path_tuple(by, libdir)


def get_system_ctk(*subdirs):
    """Return path to the system-wide cudatoolkit, or None if it doesn't
    exist.
    """
    # Linux?
    if sys.platform.startswith('linux'):
        # Is cuda alias to /usr/local/cuda?
        # We are intentionally not getting versioned cuda installation.
        base = '/usr/local/cuda'
        if os.path.exists(base):
            return os.path.join(base, *subdirs)


def get_conda_ctk():
    """Return path to directory containing the shared libraries of cudatoolkit.
    """
    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
    if not is_conda_env:
        return
    # Assume the existence of NVVM to imply cudatoolkit installed
    paths = find_lib('nvvm')
    if not paths:
        return
    # Use the directory name of the max path
    return os.path.dirname(max(paths))


def get_nvidia_nvvm_ctk():
    """Return path to directory containing the NVVM shared library.
    """
    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
    if not is_conda_env:
        return

    # Assume the existence of NVVM in the conda env implies that a CUDA toolkit
    # conda package is installed.

    # First, try the location used on Linux and the Windows 11.x packages
    libdir = os.path.join(sys.prefix, 'nvvm', _cudalib_path())
    if not os.path.exists(libdir) or not os.path.isdir(libdir):
        # If that fails, try the location used for Windows 12.x packages
        libdir = os.path.join(sys.prefix, 'Library', 'nvvm', _cudalib_path())
        if not os.path.exists(libdir) or not os.path.isdir(libdir):
            # If that doesn't exist either, assume we don't have the NVIDIA
            # conda package
            return

    paths = find_lib('nvvm', libdir=libdir)
    if not paths:
        return
    # Use the directory name of the max path
    return os.path.dirname(max(paths))


def get_nvidia_libdevice_ctk():
    """Return path to directory containing the libdevice library.
    """
    nvvm_ctk = get_nvidia_nvvm_ctk()
    if not nvvm_ctk:
        return
    nvvm_dir = os.path.dirname(nvvm_ctk)
    return os.path.join(nvvm_dir, 'libdevice')


def get_nvidia_cudalib_ctk():
    """Return path to directory containing the shared libraries of cudatoolkit.
    """
    nvvm_ctk = get_nvidia_nvvm_ctk()
    if not nvvm_ctk:
        return
    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
    subdir = 'bin' if IS_WIN32 else 'lib'
    return os.path.join(env_dir, subdir)


def get_nvidia_static_cudalib_ctk():
    """Return path to directory containing the static libraries of cudatoolkit.
    """
    nvvm_ctk = get_nvidia_nvvm_ctk()
    if not nvvm_ctk:
        return

    if IS_WIN32 and ("Library" not in nvvm_ctk):
        # Location specific to CUDA 11.x packages on Windows
        dirs = ('Lib', 'x64')
    else:
        # Linux, or Windows with CUDA 12.x packages
        dirs = ('lib',)

    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
    return os.path.join(env_dir, *dirs)


def get_cuda_home(*subdirs):
    """Get the path of CUDA_HOME, with any *subdirs* appended to the
    resulting path.
    """
    cuda_home = os.environ.get('CUDA_HOME')
    if cuda_home is None:
        # Try Windows CUDA installation without Anaconda
        cuda_home = os.environ.get('CUDA_PATH')
    if cuda_home is not None:
        return os.path.join(cuda_home, *subdirs)


def _get_nvvm_path():
    by, path = _get_nvvm_path_decision()
    candidates = find_lib('nvvm', path)
    path = max(candidates) if candidates else None
    return _env_path_tuple(by, path)


def get_cuda_paths():
    """Returns a dictionary mapping component names to a 2-tuple
    of (source_variable, info).

    The returned dictionary will have the following keys and infos:
    - "nvvm": file_path
    - "libdevice": List[Tuple[arch, file_path]]
    - "cudalib_dir": directory_path
    - "static_cudalib_dir": directory_path

    Note: The result of the function is cached.
    """
    # Check cache
    if hasattr(get_cuda_paths, '_cached_result'):
        return get_cuda_paths._cached_result
    else:
        # Not in cache
        d = {
            'nvvm': _get_nvvm_path(),
            'libdevice': _get_libdevice_paths(),
            'cudalib_dir': _get_cudalib_dir(),
            'static_cudalib_dir': _get_static_cudalib_dir(),
        }
        # Cache result
        get_cuda_paths._cached_result = d
        return d
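A minimal usage sketch (the printed values are illustrative and depend on the local installation):

    >>> from numba.cuda.cuda_paths import get_cuda_paths
    >>> d = get_cuda_paths()
    >>> d['nvvm'].by, d['nvvm'].info            # doctest: +SKIP
    ('Conda environment', '/opt/conda/lib/libnvvm.so.4.0.0')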


def get_debian_pkg_libdevice():
    """
    Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
    exists.
    """
    pkg_libdevice_location = '/usr/lib/nvidia-cuda-toolkit/libdevice'
    if not os.path.exists(pkg_libdevice_location):
        return None
    return pkg_libdevice_location
@@ -0,0 +1,806 @@
import operator

from numba.core import types
from numba.core.typing.npydecl import (parse_dtype, parse_shape,
                                       register_number_classes,
                                       register_numpy_ufunc,
                                       trigonometric_functions,
                                       comparison_functions,
                                       math_operations,
                                       bit_twiddling_functions)
from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
                                         AbstractTemplate, CallableTemplate,
                                         signature, Registry)
from numba.cuda.types import dim3
from numba.core.typeconv import Conversion
from numba import cuda
from numba.cuda.compiler import declare_device_function_template

registry = Registry()
register = registry.register
register_attr = registry.register_attr
register_global = registry.register_global

register_number_classes(register_global)


class Cuda_array_decl(CallableTemplate):
    def generic(self):
        def typer(shape, dtype):

            # Only integer literals and tuples of integer literals are valid
            # shapes
            if isinstance(shape, types.Integer):
                if not isinstance(shape, types.IntegerLiteral):
                    return None
            elif isinstance(shape, (types.Tuple, types.UniTuple)):
                if any([not isinstance(s, types.IntegerLiteral)
                        for s in shape]):
                    return None
            else:
                return None

            ndim = parse_shape(shape)
            nb_dtype = parse_dtype(dtype)
            if nb_dtype is not None and ndim is not None:
                return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')

        return typer


@register
class Cuda_shared_array(Cuda_array_decl):
    key = cuda.shared.array


@register
class Cuda_local_array(Cuda_array_decl):
    key = cuda.local.array
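Because the typer above only accepts integer-literal shapes, dynamically computed shapes are rejected at compile time. A minimal sketch (assumes a CUDA-capable device; the kernel name is illustrative, launch with block size 32):

    from numba import cuda, float32

    @cuda.jit
    def copy_via_shared(x, out):
        # The shape must be a compile-time literal, per Cuda_array_decl.
        buf = cuda.shared.array(shape=32, dtype=float32)
        i = cuda.grid(1)
        if i < x.size:
            buf[cuda.threadIdx.x] = x[i]
            cuda.syncthreads()
            out[i] = buf[cuda.threadIdx.x]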


@register
class Cuda_const_array_like(CallableTemplate):
    key = cuda.const.array_like

    def generic(self):
        def typer(ndarray):
            return ndarray
        return typer


@register
class Cuda_threadfence_device(ConcreteTemplate):
    key = cuda.threadfence
    cases = [signature(types.none)]


@register
class Cuda_threadfence_block(ConcreteTemplate):
    key = cuda.threadfence_block
    cases = [signature(types.none)]


@register
class Cuda_threadfence_system(ConcreteTemplate):
    key = cuda.threadfence_system
    cases = [signature(types.none)]


@register
class Cuda_syncwarp(ConcreteTemplate):
    key = cuda.syncwarp
    cases = [signature(types.none), signature(types.none, types.i4)]


@register
class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
    key = cuda.shfl_sync_intrinsic
    cases = [
        signature(types.Tuple((types.i4, types.b1)),
                  types.i4, types.i4, types.i4, types.i4, types.i4),
        signature(types.Tuple((types.i8, types.b1)),
                  types.i4, types.i4, types.i8, types.i4, types.i4),
        signature(types.Tuple((types.f4, types.b1)),
                  types.i4, types.i4, types.f4, types.i4, types.i4),
        signature(types.Tuple((types.f8, types.b1)),
                  types.i4, types.i4, types.f8, types.i4, types.i4),
    ]


@register
class Cuda_vote_sync_intrinsic(ConcreteTemplate):
    key = cuda.vote_sync_intrinsic
    cases = [signature(types.Tuple((types.i4, types.b1)),
                       types.i4, types.i4, types.b1)]


@register
class Cuda_match_any_sync(ConcreteTemplate):
    key = cuda.match_any_sync
    cases = [
        signature(types.i4, types.i4, types.i4),
        signature(types.i4, types.i4, types.i8),
        signature(types.i4, types.i4, types.f4),
        signature(types.i4, types.i4, types.f8),
    ]


@register
class Cuda_match_all_sync(ConcreteTemplate):
    key = cuda.match_all_sync
    cases = [
        signature(types.Tuple((types.i4, types.b1)), types.i4, types.i4),
        signature(types.Tuple((types.i4, types.b1)), types.i4, types.i8),
        signature(types.Tuple((types.i4, types.b1)), types.i4, types.f4),
        signature(types.Tuple((types.i4, types.b1)), types.i4, types.f8),
    ]


@register
class Cuda_activemask(ConcreteTemplate):
    key = cuda.activemask
    cases = [signature(types.uint32)]


@register
class Cuda_lanemask_lt(ConcreteTemplate):
    key = cuda.lanemask_lt
    cases = [signature(types.uint32)]


@register
class Cuda_popc(ConcreteTemplate):
    """
    Supported types from `llvm.popc`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """
    key = cuda.popc
    cases = [
        signature(types.int8, types.int8),
        signature(types.int16, types.int16),
        signature(types.int32, types.int32),
        signature(types.int64, types.int64),
        signature(types.uint8, types.uint8),
        signature(types.uint16, types.uint16),
        signature(types.uint32, types.uint32),
        signature(types.uint64, types.uint64),
    ]


@register
class Cuda_fma(ConcreteTemplate):
    """
    Supported types from `llvm.fma`
    [here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics)
    """
    key = cuda.fma
    cases = [
        signature(types.float32, types.float32, types.float32, types.float32),
        signature(types.float64, types.float64, types.float64, types.float64),
    ]


@register
class Cuda_hfma(ConcreteTemplate):
    key = cuda.fp16.hfma
    cases = [
        signature(types.float16, types.float16, types.float16, types.float16)
    ]


@register
class Cuda_cbrt(ConcreteTemplate):
    key = cuda.cbrt
    cases = [
        signature(types.float32, types.float32),
        signature(types.float64, types.float64),
    ]


@register
class Cuda_brev(ConcreteTemplate):
    key = cuda.brev
    cases = [
        signature(types.uint32, types.uint32),
        signature(types.uint64, types.uint64),
    ]


@register
class Cuda_clz(ConcreteTemplate):
    """
    Supported types from `llvm.ctlz`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """
    key = cuda.clz
    cases = [
        signature(types.int8, types.int8),
        signature(types.int16, types.int16),
        signature(types.int32, types.int32),
        signature(types.int64, types.int64),
        signature(types.uint8, types.uint8),
        signature(types.uint16, types.uint16),
        signature(types.uint32, types.uint32),
        signature(types.uint64, types.uint64),
    ]


@register
class Cuda_ffs(ConcreteTemplate):
    """
    Supported types from `llvm.cttz`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """
    key = cuda.ffs
    cases = [
        signature(types.uint32, types.int8),
        signature(types.uint32, types.int16),
        signature(types.uint32, types.int32),
        signature(types.uint32, types.int64),
        signature(types.uint32, types.uint8),
        signature(types.uint32, types.uint16),
        signature(types.uint32, types.uint32),
        signature(types.uint32, types.uint64),
    ]


@register
class Cuda_selp(AbstractTemplate):
    key = cuda.selp

    def generic(self, args, kws):
        assert not kws
        test, a, b = args

        # per docs
        # http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
        supported_types = (types.float64, types.float32,
                           types.int16, types.uint16,
                           types.int32, types.uint32,
                           types.int64, types.uint64)

        if a != b or a not in supported_types:
            return

        return signature(a, test, a, a)


def _genfp16_unary(l_key):
    @register
    class Cuda_fp16_unary(ConcreteTemplate):
        key = l_key
        cases = [signature(types.float16, types.float16)]

    return Cuda_fp16_unary


def _genfp16_unary_operator(l_key):
    @register_global(l_key)
    class Cuda_fp16_unary(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws
            if len(args) == 1 and args[0] == types.float16:
                return signature(types.float16, types.float16)

    return Cuda_fp16_unary


def _genfp16_binary(l_key):
    @register
    class Cuda_fp16_binary(ConcreteTemplate):
        key = l_key
        cases = [signature(types.float16, types.float16, types.float16)]

    return Cuda_fp16_binary


@register_global(float)
class Float(AbstractTemplate):

    def generic(self, args, kws):
        assert not kws

        [arg] = args

        if arg == types.float16:
            return signature(arg, arg)


def _genfp16_binary_comparison(l_key):
    @register
    class Cuda_fp16_cmp(ConcreteTemplate):
        key = l_key

        cases = [
            signature(types.b1, types.float16, types.float16)
        ]
    return Cuda_fp16_cmp


# If multiple ConcreteTemplates provide typing for a single function, then
# function resolution will pick the first compatible typing it finds, even if
# it involves inserting a cast that would be considered undesirable (in this
# specific case, float16s could be cast to float32s for comparisons).
#
# To work around this, we instead use an AbstractTemplate that implements
# exactly the casting logic that we desire. The AbstractTemplate gets
# considered in preference to ConcreteTemplates during typing.
#
# This is tracked as Issue #7863 (https://github.com/numba/numba/issues/7863) -
# once this is resolved it should be possible to replace this AbstractTemplate
# with a ConcreteTemplate to simplify the logic.


def _fp16_binary_operator(l_key, retty):
    @register_global(l_key)
    class Cuda_fp16_operator(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws

            if len(args) == 2 and \
                    (args[0] == types.float16 or args[1] == types.float16):
                if args[0] == types.float16:
                    convertible = self.context.can_convert(args[1], args[0])
                else:
                    convertible = self.context.can_convert(args[0], args[1])

                # We allow three cases here:
                #
                # 1. fp16 to fp16 - Conversion.exact
                # 2. fp16 to other types fp16 can be promoted to
                #    - Conversion.promote
                # 3. fp16 to int8 (safe conversion)
                #    - Conversion.safe

                if (convertible == Conversion.exact) or \
                        (convertible == Conversion.promote) or \
                        (convertible == Conversion.safe):
                    return signature(retty, types.float16, types.float16)

    return Cuda_fp16_operator


def _genfp16_comparison_operator(op):
    return _fp16_binary_operator(op, types.b1)


def _genfp16_binary_operator(op):
    return _fp16_binary_operator(op, types.float16)
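As a concrete illustration of the rule above (a hedged sketch; assumes a CUDA-capable device and an illustrative kernel name), adding two float16 values types the operation as float16, with no intermediate cast to float32:

    import numpy as np
    from numba import cuda

    @cuda.jit
    def halves_sum(a, b, out):
        i = cuda.grid(1)
        if i < a.size:
            # Typed by Cuda_fp16_operator: float16 + float16 -> float16.
            out[i] = a[i] + b[i]

    a = cuda.to_device(np.ones(64, dtype=np.float16))
    b = cuda.to_device(np.ones(64, dtype=np.float16))
    out = cuda.device_array_like(a)
    halves_sum[1, 64](a, b, out)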


Cuda_hadd = _genfp16_binary(cuda.fp16.hadd)
Cuda_add = _genfp16_binary_operator(operator.add)
Cuda_iadd = _genfp16_binary_operator(operator.iadd)
Cuda_hsub = _genfp16_binary(cuda.fp16.hsub)
Cuda_sub = _genfp16_binary_operator(operator.sub)
Cuda_isub = _genfp16_binary_operator(operator.isub)
Cuda_hmul = _genfp16_binary(cuda.fp16.hmul)
Cuda_mul = _genfp16_binary_operator(operator.mul)
Cuda_imul = _genfp16_binary_operator(operator.imul)
Cuda_hmax = _genfp16_binary(cuda.fp16.hmax)
Cuda_hmin = _genfp16_binary(cuda.fp16.hmin)
Cuda_hneg = _genfp16_unary(cuda.fp16.hneg)
Cuda_neg = _genfp16_unary_operator(operator.neg)
Cuda_habs = _genfp16_unary(cuda.fp16.habs)
Cuda_abs = _genfp16_unary_operator(abs)
Cuda_heq = _genfp16_binary_comparison(cuda.fp16.heq)
_genfp16_comparison_operator(operator.eq)
Cuda_hne = _genfp16_binary_comparison(cuda.fp16.hne)
_genfp16_comparison_operator(operator.ne)
Cuda_hge = _genfp16_binary_comparison(cuda.fp16.hge)
_genfp16_comparison_operator(operator.ge)
Cuda_hgt = _genfp16_binary_comparison(cuda.fp16.hgt)
_genfp16_comparison_operator(operator.gt)
Cuda_hle = _genfp16_binary_comparison(cuda.fp16.hle)
_genfp16_comparison_operator(operator.le)
Cuda_hlt = _genfp16_binary_comparison(cuda.fp16.hlt)
_genfp16_comparison_operator(operator.lt)
_genfp16_binary_operator(operator.truediv)
_genfp16_binary_operator(operator.itruediv)


def _resolve_wrapped_unary(fname):
    decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                            types.float16,
                                            (types.float16,))
    return types.Function(decl)


def _resolve_wrapped_binary(fname):
    decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                            types.float16,
                                            (types.float16, types.float16,))
    return types.Function(decl)


hsin_device = _resolve_wrapped_unary('hsin')
hcos_device = _resolve_wrapped_unary('hcos')
hlog_device = _resolve_wrapped_unary('hlog')
hlog10_device = _resolve_wrapped_unary('hlog10')
hlog2_device = _resolve_wrapped_unary('hlog2')
hexp_device = _resolve_wrapped_unary('hexp')
hexp10_device = _resolve_wrapped_unary('hexp10')
hexp2_device = _resolve_wrapped_unary('hexp2')
hsqrt_device = _resolve_wrapped_unary('hsqrt')
hrsqrt_device = _resolve_wrapped_unary('hrsqrt')
hfloor_device = _resolve_wrapped_unary('hfloor')
hceil_device = _resolve_wrapped_unary('hceil')
hrcp_device = _resolve_wrapped_unary('hrcp')
hrint_device = _resolve_wrapped_unary('hrint')
htrunc_device = _resolve_wrapped_unary('htrunc')
hdiv_device = _resolve_wrapped_binary('hdiv')


# generate atomic operations
def _gen(l_key, supported_types):
    @register
    class Cuda_atomic(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws
            ary, idx, val = args

            if ary.dtype not in supported_types:
                return

            if ary.ndim == 1:
                return signature(ary.dtype, ary, types.intp, ary.dtype)
            elif ary.ndim > 1:
                return signature(ary.dtype, ary, idx, ary.dtype)
    return Cuda_atomic


all_numba_types = (types.float64, types.float32,
                   types.int32, types.uint32,
                   types.int64, types.uint64)

integer_numba_types = (types.int32, types.uint32,
                       types.int64, types.uint64)

unsigned_int_numba_types = (types.uint32, types.uint64)

Cuda_atomic_add = _gen(cuda.atomic.add, all_numba_types)
Cuda_atomic_sub = _gen(cuda.atomic.sub, all_numba_types)
Cuda_atomic_max = _gen(cuda.atomic.max, all_numba_types)
Cuda_atomic_min = _gen(cuda.atomic.min, all_numba_types)
Cuda_atomic_nanmax = _gen(cuda.atomic.nanmax, all_numba_types)
Cuda_atomic_nanmin = _gen(cuda.atomic.nanmin, all_numba_types)
Cuda_atomic_and = _gen(cuda.atomic.and_, integer_numba_types)
Cuda_atomic_or = _gen(cuda.atomic.or_, integer_numba_types)
Cuda_atomic_xor = _gen(cuda.atomic.xor, integer_numba_types)
Cuda_atomic_inc = _gen(cuda.atomic.inc, unsigned_int_numba_types)
Cuda_atomic_dec = _gen(cuda.atomic.dec, unsigned_int_numba_types)
Cuda_atomic_exch = _gen(cuda.atomic.exch, integer_numba_types)
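The `_gen` template types every atomic as returning the old value at the target location. A minimal usage sketch (assumes a CUDA-capable device; the kernel name is illustrative):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def histogram(values, bins):
        i = cuda.grid(1)
        if i < values.size:
            # Typed by Cuda_atomic_add; the call returns the old bin count.
            cuda.atomic.add(bins, values[i], 1)

    values = cuda.to_device(np.random.randint(0, 16, 1024))
    bins = cuda.to_device(np.zeros(16, dtype=np.int64))
    histogram[4, 256](values, bins)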


@register
class Cuda_atomic_compare_and_swap(AbstractTemplate):
    key = cuda.atomic.compare_and_swap

    def generic(self, args, kws):
        assert not kws
        ary, old, val = args
        dty = ary.dtype

        if dty in integer_numba_types and ary.ndim == 1:
            return signature(dty, ary, dty, dty)


@register
class Cuda_atomic_cas(AbstractTemplate):
    key = cuda.atomic.cas

    def generic(self, args, kws):
        assert not kws
        ary, idx, old, val = args
        dty = ary.dtype

        if dty not in integer_numba_types:
            return

        if ary.ndim == 1:
            return signature(dty, ary, types.intp, dty, dty)
        elif ary.ndim > 1:
            return signature(dty, ary, idx, dty, dty)


@register
class Cuda_nanosleep(ConcreteTemplate):
    key = cuda.nanosleep
    cases = [signature(types.void, types.uint32)]


@register_attr
class Dim3_attrs(AttributeTemplate):
    key = dim3

    def resolve_x(self, mod):
        return types.int32

    def resolve_y(self, mod):
        return types.int32

    def resolve_z(self, mod):
        return types.int32


@register_attr
class CudaSharedModuleTemplate(AttributeTemplate):
    key = types.Module(cuda.shared)

    def resolve_array(self, mod):
        return types.Function(Cuda_shared_array)


@register_attr
class CudaConstModuleTemplate(AttributeTemplate):
    key = types.Module(cuda.const)

    def resolve_array_like(self, mod):
        return types.Function(Cuda_const_array_like)


@register_attr
class CudaLocalModuleTemplate(AttributeTemplate):
    key = types.Module(cuda.local)

    def resolve_array(self, mod):
        return types.Function(Cuda_local_array)


@register_attr
class CudaAtomicTemplate(AttributeTemplate):
    key = types.Module(cuda.atomic)

    def resolve_add(self, mod):
        return types.Function(Cuda_atomic_add)

    def resolve_sub(self, mod):
        return types.Function(Cuda_atomic_sub)

    def resolve_and_(self, mod):
        return types.Function(Cuda_atomic_and)

    def resolve_or_(self, mod):
        return types.Function(Cuda_atomic_or)

    def resolve_xor(self, mod):
        return types.Function(Cuda_atomic_xor)

    def resolve_inc(self, mod):
        return types.Function(Cuda_atomic_inc)

    def resolve_dec(self, mod):
        return types.Function(Cuda_atomic_dec)

    def resolve_exch(self, mod):
        return types.Function(Cuda_atomic_exch)

    def resolve_max(self, mod):
        return types.Function(Cuda_atomic_max)

    def resolve_min(self, mod):
        return types.Function(Cuda_atomic_min)

    def resolve_nanmin(self, mod):
        return types.Function(Cuda_atomic_nanmin)

    def resolve_nanmax(self, mod):
        return types.Function(Cuda_atomic_nanmax)

    def resolve_compare_and_swap(self, mod):
        return types.Function(Cuda_atomic_compare_and_swap)

    def resolve_cas(self, mod):
        return types.Function(Cuda_atomic_cas)


@register_attr
class CudaFp16Template(AttributeTemplate):
    key = types.Module(cuda.fp16)

    def resolve_hadd(self, mod):
        return types.Function(Cuda_hadd)

    def resolve_hsub(self, mod):
        return types.Function(Cuda_hsub)

    def resolve_hmul(self, mod):
        return types.Function(Cuda_hmul)

    def resolve_hdiv(self, mod):
        return hdiv_device

    def resolve_hneg(self, mod):
        return types.Function(Cuda_hneg)

    def resolve_habs(self, mod):
        return types.Function(Cuda_habs)

    def resolve_hfma(self, mod):
        return types.Function(Cuda_hfma)

    def resolve_hsin(self, mod):
        return hsin_device

    def resolve_hcos(self, mod):
        return hcos_device

    def resolve_hlog(self, mod):
        return hlog_device

    def resolve_hlog10(self, mod):
        return hlog10_device

    def resolve_hlog2(self, mod):
        return hlog2_device

    def resolve_hexp(self, mod):
        return hexp_device

    def resolve_hexp10(self, mod):
        return hexp10_device

    def resolve_hexp2(self, mod):
        return hexp2_device

    def resolve_hfloor(self, mod):
        return hfloor_device

    def resolve_hceil(self, mod):
        return hceil_device

    def resolve_hsqrt(self, mod):
        return hsqrt_device

    def resolve_hrsqrt(self, mod):
        return hrsqrt_device

    def resolve_hrcp(self, mod):
        return hrcp_device

    def resolve_hrint(self, mod):
        return hrint_device

    def resolve_htrunc(self, mod):
        return htrunc_device

    def resolve_heq(self, mod):
        return types.Function(Cuda_heq)

    def resolve_hne(self, mod):
        return types.Function(Cuda_hne)

    def resolve_hge(self, mod):
        return types.Function(Cuda_hge)

    def resolve_hgt(self, mod):
        return types.Function(Cuda_hgt)

    def resolve_hle(self, mod):
        return types.Function(Cuda_hle)

    def resolve_hlt(self, mod):
        return types.Function(Cuda_hlt)

    def resolve_hmax(self, mod):
        return types.Function(Cuda_hmax)

    def resolve_hmin(self, mod):
        return types.Function(Cuda_hmin)


@register_attr
class CudaModuleTemplate(AttributeTemplate):
    key = types.Module(cuda)

    def resolve_cg(self, mod):
        return types.Module(cuda.cg)

    def resolve_threadIdx(self, mod):
        return dim3

    def resolve_blockIdx(self, mod):
        return dim3

    def resolve_blockDim(self, mod):
        return dim3

    def resolve_gridDim(self, mod):
        return dim3

    def resolve_laneid(self, mod):
        return types.int32

    def resolve_shared(self, mod):
        return types.Module(cuda.shared)

    def resolve_popc(self, mod):
        return types.Function(Cuda_popc)

    def resolve_brev(self, mod):
        return types.Function(Cuda_brev)

    def resolve_clz(self, mod):
        return types.Function(Cuda_clz)

    def resolve_ffs(self, mod):
        return types.Function(Cuda_ffs)

    def resolve_fma(self, mod):
        return types.Function(Cuda_fma)

    def resolve_cbrt(self, mod):
        return types.Function(Cuda_cbrt)

    def resolve_threadfence(self, mod):
        return types.Function(Cuda_threadfence_device)

    def resolve_threadfence_block(self, mod):
        return types.Function(Cuda_threadfence_block)

    def resolve_threadfence_system(self, mod):
        return types.Function(Cuda_threadfence_system)

    def resolve_syncwarp(self, mod):
        return types.Function(Cuda_syncwarp)

    def resolve_shfl_sync_intrinsic(self, mod):
        return types.Function(Cuda_shfl_sync_intrinsic)

    def resolve_vote_sync_intrinsic(self, mod):
        return types.Function(Cuda_vote_sync_intrinsic)

    def resolve_match_any_sync(self, mod):
        return types.Function(Cuda_match_any_sync)

    def resolve_match_all_sync(self, mod):
        return types.Function(Cuda_match_all_sync)

    def resolve_activemask(self, mod):
        return types.Function(Cuda_activemask)

    def resolve_lanemask_lt(self, mod):
        return types.Function(Cuda_lanemask_lt)

    def resolve_selp(self, mod):
        return types.Function(Cuda_selp)

    def resolve_nanosleep(self, mod):
        return types.Function(Cuda_nanosleep)

    def resolve_atomic(self, mod):
        return types.Module(cuda.atomic)

    def resolve_fp16(self, mod):
        return types.Module(cuda.fp16)

    def resolve_const(self, mod):
        return types.Module(cuda.const)

    def resolve_local(self, mod):
        return types.Module(cuda.local)


register_global(cuda, types.Module(cuda))


# NumPy

for func in trigonometric_functions:
    register_numpy_ufunc(func, register_global)

for func in comparison_functions:
    register_numpy_ufunc(func, register_global)

for func in bit_twiddling_functions:
    register_numpy_ufunc(func, register_global)

for func in math_operations:
    if func in ('log', 'log2', 'log10'):
        register_numpy_ufunc(func, register_global)
@@ -0,0 +1,9 @@
"""CUDA Driver

- Driver API binding
- NVVM API binding
- Device array implementation

"""
from numba.core import config
assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator'
Binary file not shown.
@@ -0,0 +1,904 @@
"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object. If it exists and evaluates to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
"""

import math
import functools
import operator
import copy
from ctypes import c_void_p

import numpy as np

import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices, dummyarray
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.np.numpy_support import numpy_version
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn

try:
    lru_cache = getattr(functools, 'lru_cache')(None)
except AttributeError:
    # Python 3.1 or lower
    def lru_cache(func):
        return func


def is_cuda_ndarray(obj):
    "Check if an object is a CUDA ndarray"
    return getattr(obj, '__cuda_ndarray__', False)


def verify_cuda_ndarray_interface(obj):
    "Verify the CUDA ndarray interface for an obj"
    require_cuda_ndarray(obj)

    def requires_attr(attr, typ):
        if not hasattr(obj, attr):
            raise AttributeError(attr)
        if not isinstance(getattr(obj, attr), typ):
            raise AttributeError('%s must be of type %s' % (attr, typ))

    requires_attr('shape', tuple)
    requires_attr('strides', tuple)
    requires_attr('dtype', np.dtype)
    requires_attr('size', int)


def require_cuda_ndarray(obj):
    "Raises ValueError if is_cuda_ndarray(obj) evaluates to False"
    if not is_cuda_ndarray(obj):
        raise ValueError('require a CUDA ndarray object')


class DeviceNDArrayBase(_devicearray.DeviceArray):
    """An on-GPU NDArray representation
    """
    __cuda_memory__ = True
    __cuda_ndarray__ = True  # There must be a gpu_data attribute

    def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
        """
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        """
        if isinstance(shape, int):
            shape = (shape,)
        if isinstance(strides, int):
            strides = (strides,)
        dtype = np.dtype(dtype)
        self.ndim = len(shape)
        if len(strides) != self.ndim:
            raise ValueError('length of strides does not match ndim')
        self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                                 dtype.itemsize)
        self.shape = tuple(shape)
        self.strides = tuple(strides)
        self.dtype = dtype
        self.size = int(functools.reduce(operator.mul, self.shape, 1))
        # prepare gpu memory
        if self.size > 0:
            if gpu_data is None:
                self.alloc_size = _driver.memory_size_from_info(
                    self.shape, self.strides, self.dtype.itemsize)
                gpu_data = devices.get_context().memalloc(self.alloc_size)
            else:
                self.alloc_size = _driver.device_memory_size(gpu_data)
        else:
            # Make NULL pointer for empty allocation
            if _driver.USE_NV_BINDING:
                null = _driver.binding.CUdeviceptr(0)
            else:
                null = c_void_p(0)
            gpu_data = _driver.MemoryPointer(context=devices.get_context(),
                                             pointer=null, size=0)
            self.alloc_size = 0

        self.gpu_data = gpu_data
        self.stream = stream

    @property
    def __cuda_array_interface__(self):
        if _driver.USE_NV_BINDING:
            if self.device_ctypes_pointer is not None:
                ptr = int(self.device_ctypes_pointer)
            else:
                ptr = 0
        else:
            if self.device_ctypes_pointer.value is not None:
                ptr = self.device_ctypes_pointer.value
            else:
                ptr = 0

        return {
            'shape': tuple(self.shape),
            'strides': None if is_contiguous(self) else tuple(self.strides),
            'data': (ptr, False),
            'typestr': self.dtype.str,
            'stream': int(self.stream) if self.stream != 0 else None,
            'version': 3,
        }
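For instance, a contiguous 2x2 float32 device array exports an interface like the following (the pointer value is illustrative):

    >>> import numpy as np
    >>> from numba import cuda
    >>> d = cuda.to_device(np.zeros((2, 2), dtype=np.float32))
    >>> d.__cuda_array_interface__              # doctest: +SKIP
    {'shape': (2, 2), 'strides': None, 'data': (139872315244544, False),
     'typestr': '<f4', 'stream': None, 'version': 3}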

    def bind(self, stream=0):
        """Bind a CUDA stream to this object so that all subsequent operations
        on this array default to the given stream.
        """
        clone = copy.copy(self)
        clone.stream = stream
        return clone

    @property
    def T(self):
        return self.transpose()

    def transpose(self, axes=None):
        if axes and tuple(axes) == tuple(range(self.ndim)):
            return self
        elif self.ndim != 2:
            msg = "transposing a non-2D DeviceNDArray isn't supported"
            raise NotImplementedError(msg)
        elif axes is not None and set(axes) != set(range(self.ndim)):
            raise ValueError("invalid axes list %r" % (axes,))
        else:
            from numba.cuda.kernels.transpose import transpose
            return transpose(self)

    def _default_stream(self, stream):
        return self.stream if not stream else stream

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        # Typing considerations:
        #
        # 1. The preference is to use 'C' or 'F' layout since this enables
        # hardcoding stride values into compiled kernels, which is more
        # efficient than storing a passed-in value in a register.
        #
        # 2. If an array is both C- and F-contiguous, prefer 'C' layout as
        # it's the more likely / common case.
        #
        # 3. If an array is broadcast then it must be typed as 'A' - using 'C'
        # or 'F' does not apply for broadcast arrays, because the strides,
        # some of which will be 0, will not match those hardcoded in for 'C'
        # or 'F' layouts.

        broadcast = 0 in self.strides
        if self.flags['C_CONTIGUOUS'] and not broadcast:
            layout = 'C'
        elif self.flags['F_CONTIGUOUS'] and not broadcast:
            layout = 'F'
        else:
            layout = 'A'

        dtype = numpy_support.from_dtype(self.dtype)
        return types.Array(dtype, self.ndim, layout)

    @property
    def device_ctypes_pointer(self):
        """Returns the ctypes pointer to the GPU data buffer
        """
        if self.gpu_data is None:
            if _driver.USE_NV_BINDING:
                return _driver.binding.CUdeviceptr(0)
            else:
                return c_void_p(0)
        else:
            return self.gpu_data.device_ctypes_pointer

    @devices.require_context
    def copy_to_device(self, ary, stream=0):
        """Copy `ary` to `self`.

        If `ary` is CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a host-to-device transfer.
        """
        if ary.size == 0:
            # Nothing to do
            return

        sentry_contiguous(self)
        stream = self._default_stream(stream)

        self_core, ary_core = array_core(self), array_core(ary)
        if _driver.is_device_memory(ary):
            sentry_contiguous(ary)
            check_array_compatibility(self_core, ary_core)
            _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
        else:
            # Ensure same contiguity. Only makes a host-side copy if necessary
            # (i.e., in order to materialize a writable strided view)
            ary_core = np.array(
                ary_core,
                order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
                subok=True,
                copy=(not ary_core.flags['WRITEABLE'])
                if numpy_version < (2, 0) else None)
            check_array_compatibility(self_core, ary_core)
            _driver.host_to_device(self, ary_core, self.alloc_size,
                                   stream=stream)

    @devices.require_context
    def copy_to_host(self, ary=None, stream=0):
        """Copy ``self`` to ``ary`` or create a new NumPy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part of the given stream. Otherwise, the transfer
        is synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if any(s < 0 for s in self.strides):
            msg = 'D->H copy not implemented for negative strides: {}'
            raise NotImplementedError(msg.format(self.strides))
        assert self.alloc_size >= 0, "Negative memory size"
        stream = self._default_stream(stream)
        if ary is None:
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:
            check_array_compatibility(self, ary)
            hostary = ary

        if self.alloc_size != 0:
            _driver.device_to_host(hostary, self, self.alloc_size,
                                   stream=stream)

        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     buffer=hostary)
            else:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     strides=self.strides, buffer=hostary)
        return hostary

    def split(self, section, stream=0):
        """Split the array into equal partitions of `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.
        """
        stream = self._default_stream(stream)
        if self.ndim != 1:
            raise ValueError("only support 1d array")
        if self.strides[0] != self.dtype.itemsize:
            raise ValueError("only support unit stride")
        nsect = int(math.ceil(float(self.size) / section))
        strides = self.strides
        itemsize = self.dtype.itemsize
        for i in range(nsect):
            begin = i * section
            end = min(begin + section, self.size)
            shape = (end - begin,)
            gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
            yield DeviceNDArray(shape, strides, dtype=self.dtype,
                                stream=stream, gpu_data=gpu_data)
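A short usage sketch of `split` (assumes a CUDA-capable device):

    >>> import numpy as np
    >>> from numba import cuda
    >>> d = cuda.to_device(np.arange(10))
    >>> [part.copy_to_host() for part in d.split(4)]    # doctest: +SKIP
    [array([0, 1, 2, 3]), array([4, 5, 6, 7]), array([8, 9])]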

    def as_cuda_arg(self):
        """Returns a device memory object that is used as the argument.
        """
        return self.gpu_data

    def get_ipc_handle(self):
        """
        Returns an *IpcArrayHandle* object that is safe to serialize and
        transfer to another process to share the local allocation.

        Note: this feature is only available on Linux.
        """
        ipch = devices.get_context().get_ipc_handle(self.gpu_data)
        desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
        return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)

    def squeeze(self, axis=None, stream=0):
        """
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an
            axis with size greater than one is selected. If `None`, all axes
            with size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        """
        new_dummy, _ = self._dummy.squeeze(axis=axis)
        return DeviceNDArray(
            shape=new_dummy.shape,
            strides=new_dummy.strides,
            dtype=self.dtype,
            stream=self._default_stream(stream),
            gpu_data=self.gpu_data,
        )

    def view(self, dtype):
        """Returns a new object by reinterpreting the dtype without making a
        copy of the data.
        """
        dtype = np.dtype(dtype)
        shape = list(self.shape)
        strides = list(self.strides)

        if self.dtype.itemsize != dtype.itemsize:
            if not self.is_c_contiguous():
                raise ValueError(
                    "To change to a dtype of a different size,"
                    " the array must be C-contiguous"
                )

            shape[-1], rem = divmod(
                shape[-1] * self.dtype.itemsize,
                dtype.itemsize
            )

            if rem != 0:
                raise ValueError(
                    "When changing to a larger dtype,"
                    " its size must be a divisor of the total size in bytes"
                    " of the last axis of the array."
                )

            strides[-1] = dtype.itemsize

        return DeviceNDArray(
            shape=shape,
            strides=strides,
            dtype=dtype,
            stream=self.stream,
            gpu_data=self.gpu_data,
        )
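For example, viewing a float32 array as uint8 quadruples the last axis, since only the last dimension is rescaled (a sketch; assumes a CUDA-capable device):

    >>> import numpy as np
    >>> from numba import cuda
    >>> d = cuda.to_device(np.zeros((2, 4), dtype=np.float32))
    >>> d.view(np.uint8).shape                  # doctest: +SKIP
    (2, 16)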
@property
|
||||
def nbytes(self):
|
||||
# Note: not using `alloc_size`. `alloc_size` reports memory
|
||||
# consumption of the allocation, not the size of the array
|
||||
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
|
||||
return self.dtype.itemsize * self.size
|
||||
|
||||
|
||||
class DeviceRecord(DeviceNDArrayBase):
|
||||
'''
|
||||
An on-GPU record type
|
||||
'''
|
||||
def __init__(self, dtype, stream=0, gpu_data=None):
|
||||
shape = ()
|
||||
strides = ()
|
||||
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
|
||||
gpu_data)
|
||||
|
||||
@property
|
||||
def flags(self):
|
||||
"""
|
||||
For `numpy.ndarray` compatibility. Ideally this would return a
|
||||
`np.core.multiarray.flagsobj`, but that needs to be constructed
|
||||
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
|
||||
aren't writeable).
|
||||
"""
|
||||
return dict(self._dummy.flags) # defensive copy
|
||||
|
||||
@property
|
||||
def _numba_type_(self):
|
||||
"""
|
||||
Magic attribute expected by Numba to get the numba type that
|
||||
represents this object.
|
||||
"""
|
||||
return numpy_support.from_dtype(self.dtype)
|
||||
|
||||
@devices.require_context
|
||||
def __getitem__(self, item):
|
||||
return self._do_getitem(item)
|
||||
|
||||
@devices.require_context
|
||||
def getitem(self, item, stream=0):
|
||||
"""Do `__getitem__(item)` with CUDA stream
|
||||
"""
|
||||
return self._do_getitem(item, stream)
|
||||
|
||||
def _do_getitem(self, item, stream=0):
|
||||
stream = self._default_stream(stream)
|
||||
typ, offset = self.dtype.fields[item]
|
||||
newdata = self.gpu_data.view(offset)
|
||||
|
||||
if typ.shape == ():
|
||||
if typ.names is not None:
|
||||
return DeviceRecord(dtype=typ, stream=stream,
|
||||
gpu_data=newdata)
|
||||
else:
|
||||
hostary = np.empty(1, dtype=typ)
|
||||
_driver.device_to_host(dst=hostary, src=newdata,
|
||||
size=typ.itemsize,
|
||||
stream=stream)
|
||||
return hostary[0]
|
||||
else:
|
||||
shape, strides, dtype = \
|
||||
prepare_shape_strides_dtype(typ.shape,
|
||||
None,
|
||||
typ.subdtype[0], 'C')
|
||||
return DeviceNDArray(shape=shape, strides=strides,
|
||||
dtype=dtype, gpu_data=newdata,
|
||||
stream=stream)
|
||||
|
||||
@devices.require_context
|
||||
def __setitem__(self, key, value):
|
||||
return self._do_setitem(key, value)
|
||||
|
||||
@devices.require_context
|
||||
def setitem(self, key, value, stream=0):
|
||||
"""Do `__setitem__(key, value)` with CUDA stream
|
||||
"""
|
||||
return self._do_setitem(key, value, stream=stream)
|
||||
|
||||
def _do_setitem(self, key, value, stream=0):
|
||||
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
# If the record didn't have a default stream, and the user didn't
|
||||
# provide a stream, then we will use the default stream for the
|
||||
# assignment kernel and synchronize on it.
|
||||
synchronous = not stream
|
||||
if synchronous:
|
||||
ctx = devices.get_context()
|
||||
stream = ctx.get_default_stream()
|
||||
|
||||
# (1) prepare LHS
|
||||
|
||||
typ, offset = self.dtype.fields[key]
|
||||
newdata = self.gpu_data.view(offset)
|
||||
|
||||
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
|
||||
|
||||
# (2) prepare RHS
|
||||
|
||||
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
|
||||
|
||||
# (3) do the copy
|
||||
|
||||
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
|
||||
|
||||
if synchronous:
|
||||
stream.synchronize()
|
||||
|
||||
|
||||
@lru_cache
|
||||
def _assign_kernel(ndim):
|
||||
"""
|
||||
A separate method so we don't need to compile code every assignment (!).
|
||||
|
||||
:param ndim: We need to have static array sizes for cuda.local.array, so
|
||||
bake in the number of dimensions into the kernel
|
||||
"""
|
||||
from numba import cuda # circular!
|
||||
|
||||
if ndim == 0:
|
||||
# the (2, ndim) allocation below is not yet supported, so avoid it
|
||||
@cuda.jit
|
||||
def kernel(lhs, rhs):
|
||||
lhs[()] = rhs[()]
|
||||
return kernel
|
||||
|
||||
@cuda.jit
|
||||
def kernel(lhs, rhs):
|
||||
location = cuda.grid(1)
|
||||
|
||||
n_elements = 1
|
||||
for i in range(lhs.ndim):
|
||||
n_elements *= lhs.shape[i]
|
||||
if location >= n_elements:
|
||||
# bake n_elements into the kernel, better than passing it in
|
||||
# as another argument.
|
||||
return
|
||||
|
||||
# [0, :] is the to-index (into `lhs`)
|
||||
# [1, :] is the from-index (into `rhs`)
|
||||
idx = cuda.local.array(
|
||||
shape=(2, ndim),
|
||||
dtype=types.int64)
|
||||
|
||||
for i in range(ndim - 1, -1, -1):
|
||||
idx[0, i] = location % lhs.shape[i]
|
||||
idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
|
||||
location //= lhs.shape[i]
|
||||
|
||||
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
|
||||
return kernel
|
||||
|
||||
|
||||
class DeviceNDArray(DeviceNDArrayBase):
|
||||
'''
|
||||
An on-GPU array type
|
||||
'''
|
||||
def is_f_contiguous(self):
|
||||
'''
|
||||
Return true if the array is Fortran-contiguous.
|
||||
'''
|
||||
return self._dummy.is_f_contig
|
||||
|
||||
@property
|
||||
def flags(self):
|
||||
"""
|
||||
For `numpy.ndarray` compatibility. Ideally this would return a
|
||||
`np.core.multiarray.flagsobj`, but that needs to be constructed
|
||||
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
|
||||
aren't writeable).
|
||||
"""
|
||||
return dict(self._dummy.flags) # defensive copy
|
||||
|
||||
def is_c_contiguous(self):
|
||||
'''
|
||||
Return true if the array is C-contiguous.
|
||||
'''
|
||||
return self._dummy.is_c_contig
|
||||
|
||||
def __array__(self, dtype=None):
|
||||
"""
|
||||
:return: an `numpy.ndarray`, so copies to the host.
|
||||
"""
|
||||
if dtype:
|
||||
return self.copy_to_host().__array__(dtype)
|
||||
else:
|
||||
return self.copy_to_host().__array__()
|
||||
|
||||
def __len__(self):
|
||||
return self.shape[0]
|
||||
|
||||
def reshape(self, *newshape, **kws):
|
||||
"""
|
||||
Reshape the array without changing its contents, similarly to
|
||||
:meth:`numpy.ndarray.reshape`. Example::
|
||||
|
||||
d_arr = d_arr.reshape(20, 50, order='F')
|
||||
"""
|
||||
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
|
||||
newshape = newshape[0]
|
||||
|
||||
cls = type(self)
|
||||
if newshape == self.shape:
|
||||
# nothing to do
|
||||
return cls(shape=self.shape, strides=self.strides,
|
||||
dtype=self.dtype, gpu_data=self.gpu_data)
|
||||
|
||||
newarr, extents = self._dummy.reshape(*newshape, **kws)
|
||||
|
||||
if extents == [self._dummy.extent]:
|
||||
return cls(shape=newarr.shape, strides=newarr.strides,
|
||||
dtype=self.dtype, gpu_data=self.gpu_data)
|
||||
else:
|
||||
raise NotImplementedError("operation requires copying")
|
||||
|
||||
def ravel(self, order='C', stream=0):
|
||||
'''
|
||||
Flattens a contiguous array without changing its contents, similar to
|
||||
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
|
||||
exception.
|
||||
'''
|
||||
stream = self._default_stream(stream)
|
||||
cls = type(self)
|
||||
newarr, extents = self._dummy.ravel(order=order)
|
||||
|
||||
if extents == [self._dummy.extent]:
|
||||
return cls(shape=newarr.shape, strides=newarr.strides,
|
||||
dtype=self.dtype, gpu_data=self.gpu_data,
|
||||
stream=stream)
|
||||
|
||||
else:
|
||||
raise NotImplementedError("operation requires copying")
|
||||
|
||||
@devices.require_context
|
||||
def __getitem__(self, item):
|
||||
return self._do_getitem(item)
|
||||
|
||||
@devices.require_context
|
||||
def getitem(self, item, stream=0):
|
||||
"""Do `__getitem__(item)` with CUDA stream
|
||||
"""
|
||||
return self._do_getitem(item, stream)
|
||||
|
||||
def _do_getitem(self, item, stream=0):
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
arr = self._dummy.__getitem__(item)
|
||||
extents = list(arr.iter_contiguous_extent())
|
||||
cls = type(self)
|
||||
if len(extents) == 1:
|
||||
newdata = self.gpu_data.view(*extents[0])
|
||||
|
||||
if not arr.is_array:
|
||||
# Check for structured array type (record)
|
||||
if self.dtype.names is not None:
|
||||
return DeviceRecord(dtype=self.dtype, stream=stream,
|
||||
gpu_data=newdata)
|
||||
else:
|
||||
# Element indexing
|
||||
hostary = np.empty(1, dtype=self.dtype)
|
||||
_driver.device_to_host(dst=hostary, src=newdata,
|
||||
size=self._dummy.itemsize,
|
||||
stream=stream)
|
||||
return hostary[0]
|
||||
else:
|
||||
return cls(shape=arr.shape, strides=arr.strides,
|
||||
dtype=self.dtype, gpu_data=newdata, stream=stream)
|
||||
else:
|
||||
newdata = self.gpu_data.view(*arr.extent)
|
||||
return cls(shape=arr.shape, strides=arr.strides,
|
||||
dtype=self.dtype, gpu_data=newdata, stream=stream)
|
||||
|
||||
@devices.require_context
|
||||
def __setitem__(self, key, value):
|
||||
return self._do_setitem(key, value)
|
||||
|
||||
@devices.require_context
|
||||
def setitem(self, key, value, stream=0):
|
||||
"""Do `__setitem__(key, value)` with CUDA stream
|
||||
"""
|
||||
return self._do_setitem(key, value, stream=stream)
|
||||
|
||||
def _do_setitem(self, key, value, stream=0):
|
||||
|
||||
stream = self._default_stream(stream)
|
||||
|
||||
# If the array didn't have a default stream, and the user didn't provide
|
||||
# a stream, then we will use the default stream for the assignment
|
||||
# kernel and synchronize on it.
|
||||
synchronous = not stream
|
||||
if synchronous:
|
||||
ctx = devices.get_context()
|
||||
stream = ctx.get_default_stream()
|
||||
|
||||
# (1) prepare LHS
|
||||
|
||||
arr = self._dummy.__getitem__(key)
|
||||
newdata = self.gpu_data.view(*arr.extent)
|
||||
|
||||
if isinstance(arr, dummyarray.Element):
|
||||
# convert to a 0d array
|
||||
shape = ()
|
||||
strides = ()
|
||||
else:
|
||||
shape = arr.shape
|
||||
strides = arr.strides
|
||||
|
||||
lhs = type(self)(
|
||||
shape=shape,
|
||||
strides=strides,
|
||||
dtype=self.dtype,
|
||||
gpu_data=newdata,
|
||||
stream=stream)
|
||||
|
||||
# (2) prepare RHS
|
||||
|
||||
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
|
||||
if rhs.ndim > lhs.ndim:
|
||||
raise ValueError("Can't assign %s-D array to %s-D self" % (
|
||||
rhs.ndim,
|
||||
lhs.ndim))
|
||||
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
|
||||
# negative indices would not work if rhs.ndim == 0
|
||||
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
|
||||
rhs = rhs.reshape(*rhs_shape)
|
||||
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
|
||||
if r != 1 and l != r:
|
||||
raise ValueError("Can't copy sequence with size %d to array "
|
||||
"axis %d with dimension %d" % ( r, i, l))
|
||||
|
||||
# (3) do the copy
|
||||
|
||||
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
|
||||
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
|
||||
if synchronous:
|
||||
stream.synchronize()
|
||||
|
||||
|
||||
class IpcArrayHandle(object):
    """
    An IPC array handle that can be serialized and transferred to another
    process on the same machine to share a GPU allocation.

    On the destination process, use the *.open()* method to create a new
    *DeviceNDArray* object that shares the allocation from the original
    process. To release the resources, call the *.close()* method. After
    that, the destination can no longer use the shared array object. (Note:
    the underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    """
    def __init__(self, ipc_handle, array_desc):
        self._array_desc = array_desc
        self._ipc_handle = ipc_handle

    def open(self):
        """
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process. Must not be used on the original process.
        """
        dptr = self._ipc_handle.open(devices.get_context())
        return DeviceNDArray(gpu_data=dptr, **self._array_desc)

    def close(self):
        """
        Closes the IPC handle to the array.
        """
        self._ipc_handle.close()

    def __enter__(self):
        return self.open()

    def __exit__(self, type, value, traceback):
        self.close()


class MappedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA mapped memory.
    """

    def device_setup(self, gpu_data, stream=0):
        self.gpu_data = gpu_data
        self.stream = stream


class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA managed memory.
    """

    def device_setup(self, gpu_data, stream=0):
        self.gpu_data = gpu_data
        self.stream = stream


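# Illustrative sketch: these classes back the arrays returned by the public
# allocators, e.g.
#
#     from numba import cuda
#     m = cuda.mapped_array(10, dtype='float32')    # MappedNDArray
#     u = cuda.managed_array(10, dtype='float32')   # ManagedNDArray

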
def from_array_like(ary, stream=0, gpu_data=None):
    "Create a DeviceNDArray object that is like ary."
    return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
                         gpu_data=gpu_data)


def from_record_like(rec, stream=0, gpu_data=None):
    "Create a DeviceRecord object that is like rec."
    return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)


def array_core(ary):
    """
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    """
    if not ary.strides or not ary.size:
        return ary
    core_index = []
    for stride in ary.strides:
        core_index.append(0 if stride == 0 else slice(None))
    return ary[tuple(core_index)]


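# Illustrative sketch: a broadcast dimension has stride 0, so array_core()
# selects index 0 there and keeps a full slice elsewhere.
#
#     import numpy as np
#     a = np.broadcast_to(np.arange(3), (4, 3))   # strides (0, 8) for int64
#     core = array_core(a)                        # shape (3,), contiguous

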
def is_contiguous(ary):
    """
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    """
    size = ary.dtype.itemsize
    for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
        if shape > 1 and stride != 0:
            if size != stride:
                return False
            size *= shape
    return True


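# Illustrative sketch: broadcast (stride-0) and length-1 axes are skipped, so
# a broadcast row vector still counts as contiguous here.
#
#     import numpy as np
#     assert is_contiguous(np.broadcast_to(np.arange(4), (3, 4)))
#     assert not is_contiguous(np.arange(8)[::2])   # stride is 2 * itemsize

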
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
                            "be transferred as a single memory region. Please "
                            "ensure contiguous buffer with numpy "
                            ".ascontiguousarray()")


def sentry_contiguous(ary):
    core = array_core(ary)
    if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
        raise ValueError(errmsg_contiguous_buffer)


def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data
    from host to device. If obj already represents device memory, it is
    returned and no copy is made.
    """
    if _driver.is_device_memory(obj):
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no copy overhead for `obj`s that are
            # already `ndarray`s).
            obj = np.array(
                obj,
                copy=False if numpy_version < (2, 0) else None,
                subok=True)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if config.CUDA_WARN_ON_IMPLICIT_COPY:
                if (
                    not user_explicit and
                    (not isinstance(obj, DeviceNDArray)
                     and isinstance(obj, np.ndarray))
                ):
                    msg = ("Host array used in CUDA kernel will incur "
                           "copy overhead to/from device.")
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True


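# Illustrative sketch: auto_device() is the common entry point for moving
# arbitrary host objects to the device.
#
#     import numpy as np
#     devobj, copied = auto_device(np.arange(4))   # fresh device copy
#     same, copied2 = auto_device(devobj)          # already device memory
#     assert copied and not copied2

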
def check_array_compatibility(ary1, ary2):
    ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
    if ary1.dtype != ary2.dtype:
        raise TypeError('incompatible dtype: %s vs. %s' %
                        (ary1.dtype, ary2.dtype))
    if ary1sq.shape != ary2sq.shape:
        raise ValueError('incompatible shape: %s vs. %s' %
                         (ary1.shape, ary2.shape))
    # We check strides only if the size is nonzero, because strides are
    # irrelevant (and can differ) for zero-length copies.
    if ary1.size and ary1sq.strides != ary2sq.strides:
        raise ValueError('incompatible strides: %s vs. %s' %
                         (ary1.strides, ary2.strides))
@@ -0,0 +1,248 @@
"""
Expose each GPU device directly.

This module implements an API like the "CUDA runtime" context manager for
managing the CUDA context stack and clean-up. It relies on thread-local
globals to separate the context stack management of each thread. Contexts
are also shareable among threads. Only the main thread can destroy Contexts.

Note:
- This module must be imported by the main thread.

"""
import functools
import threading
from contextlib import contextmanager

from .driver import driver, USE_NV_BINDING


class _DeviceList(object):
    def __getattr__(self, attr):
        # First time looking at "lst" attribute.
        if attr == "lst":
            # Device list is not initialized.
            # Query all CUDA devices.
            numdev = driver.get_device_count()
            gpus = [_DeviceContextManager(driver.get_device(devid))
                    for devid in range(numdev)]
            # Define "lst" to avoid re-initialization
            self.lst = gpus
            return gpus

        # Other attributes
        return super(_DeviceList, self).__getattr__(attr)

    def __getitem__(self, devnum):
        '''
        Returns the context manager for device *devnum*.
        '''
        return self.lst[devnum]

    def __str__(self):
        return ', '.join([str(d) for d in self.lst])

    def __iter__(self):
        return iter(self.lst)

    def __len__(self):
        return len(self.lst)

    @property
    def current(self):
        """Returns the active device or None if there's no active device
        """
        with driver.get_active_context() as ac:
            devnum = ac.devnum
            if devnum is not None:
                return self[devnum]


class _DeviceContextManager(object):
    """
    Provides a context manager for executing in the context of the chosen
    device. The normal use of instances of this type is from
    ``numba.cuda.gpus``. For example, to execute on device 2::

        with numba.cuda.gpus[2]:
            d_a = numba.cuda.to_device(a)

    to copy the array *a* onto device 2, referred to by *d_a*.
    """

    def __init__(self, device):
        self._device = device

    def __getattr__(self, item):
        return getattr(self._device, item)

    def __enter__(self):
        _runtime.get_or_create_context(self._device.id)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # this will verify that we are popping the right device context.
        self._device.get_primary_context().pop()

    def __str__(self):
        return "<Managed Device {self.id}>".format(self=self)


class _Runtime(object):
    """Emulate the CUDA runtime context management.

    It owns all Devices and Contexts.
    Keeps at most one Context per Device.
    """

    def __init__(self):
        self.gpus = _DeviceList()

        # For caching the attached CUDA Context
        self._tls = threading.local()

        # Remember the main thread
        # Only the main thread can *actually* destroy
        self._mainthread = threading.current_thread()

        # Avoid mutation of runtime state in multithreaded programs
        self._lock = threading.RLock()

    @contextmanager
    def ensure_context(self):
        """Ensure a CUDA context is available inside the context.

        On entrance, queries the CUDA driver for an active CUDA context and
        attaches it in TLS for subsequent calls so they do not need to query
        the CUDA driver again. On exit, detaches the CUDA context from the
        TLS.

        This allows us to pick up a third-party activated CUDA context in
        any top-level Numba CUDA API.
        """
        with driver.get_active_context():
            oldctx = self._get_attached_context()
            newctx = self.get_or_create_context(None)
            self._set_attached_context(newctx)
            try:
                yield
            finally:
                self._set_attached_context(oldctx)

    def get_or_create_context(self, devnum):
        """Returns the primary context for *devnum*, pushing and creating it
        if needed. If *devnum* is None, use the active CUDA context (must
        be primary) or create a new one with ``devnum=0``.
        """
        if devnum is None:
            attached_ctx = self._get_attached_context()
            if attached_ctx is None:
                return self._get_or_create_context_uncached(devnum)
            else:
                return attached_ctx
        else:
            if USE_NV_BINDING:
                devnum = int(devnum)
            return self._activate_context_for(devnum)

    def _get_or_create_context_uncached(self, devnum):
        """See also ``get_or_create_context(devnum)``.
        This version does not read the cache.
        """
        with self._lock:
            # Try to get the active context in the CUDA stack or
            # activate GPU-0 with the primary context
            with driver.get_active_context() as ac:
                if not ac:
                    return self._activate_context_for(0)
                else:
                    # Get primary context for the active device
                    ctx = self.gpus[ac.devnum].get_primary_context()
                    # Is active context the primary context?
                    if USE_NV_BINDING:
                        ctx_handle = int(ctx.handle)
                        ac_ctx_handle = int(ac.context_handle)
                    else:
                        ctx_handle = ctx.handle.value
                        ac_ctx_handle = ac.context_handle.value
                    if ctx_handle != ac_ctx_handle:
                        msg = ('Numba cannot operate on non-primary'
                               ' CUDA context {:x}')
                        raise RuntimeError(msg.format(ac_ctx_handle))
                    # Ensure the context is ready
                    ctx.prepare_for_use()
                return ctx

    def _activate_context_for(self, devnum):
        with self._lock:
            gpu = self.gpus[devnum]
            newctx = gpu.get_primary_context()
            # Detect unexpected context switch
            cached_ctx = self._get_attached_context()
            if cached_ctx is not None and cached_ctx is not newctx:
                raise RuntimeError('Cannot switch CUDA-context.')
            newctx.push()
            return newctx

    def _get_attached_context(self):
        return getattr(self._tls, 'attached_context', None)

    def _set_attached_context(self, ctx):
        self._tls.attached_context = ctx

    def reset(self):
        """Clear all contexts in the thread. Destroy the context if and only
        if we are in the main thread.
        """
        # Pop all active contexts.
        while driver.pop_active_context() is not None:
            pass

        # If it is the main thread
        if threading.current_thread() == self._mainthread:
            self._destroy_all_contexts()

    def _destroy_all_contexts(self):
        # Reset all devices
        for gpu in self.gpus:
            gpu.reset()


_runtime = _Runtime()

# ================================ PUBLIC API ================================

gpus = _runtime.gpus


def get_context(devnum=None):
    """Get the current device or use a device by device number, and
    return the CUDA context.
    """
    return _runtime.get_or_create_context(devnum)


def require_context(fn):
    """
    A decorator that ensures a CUDA context is available when *fn* is
    executed.

    Note: The function *fn* cannot switch CUDA-context.
    """
    @functools.wraps(fn)
    def _require_cuda_context(*args, **kws):
        with _runtime.ensure_context():
            return fn(*args, **kws)

    return _require_cuda_context


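# Illustrative sketch: require_context is typically applied to driver-facing
# helpers so a context exists before they run. `device_memory_info` below is
# a hypothetical example name, not part of this module.
#
#     @require_context
#     def device_memory_info():
#         return get_context().get_memory_info()

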
def reset():
    """Reset the CUDA subsystem for the current thread.

    In the main thread:
    This removes all CUDA contexts. Only use this at shutdown or for
    cleaning up between tests.

    In non-main threads:
    This clears the CUDA context stack only.

    """
    _runtime.reset()
File diff suppressed because it is too large
@@ -0,0 +1,394 @@
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
                    c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)

from numba.cuda.cudadrv import _extras

cu_device = c_int
cu_device_attribute = c_int  # enum
cu_context = c_void_p  # an opaque handle
cu_module = c_void_p  # an opaque handle
cu_jit_option = c_int  # enum
cu_jit_input_type = c_int  # enum
cu_function = c_void_p  # an opaque handle
cu_device_ptr = c_size_t  # defined as unsigned long long
cu_stream = c_void_p  # an opaque handle
cu_event = c_void_p
cu_link_state = c_void_p
cu_function_attribute = c_int
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE)  # 64 bytes wide
cu_uuid = (c_byte * 16)  # Device UUID

cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)

cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)

# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
CU_STREAM_DEFAULT = 0
CU_STREAM_LEGACY = 1
CU_STREAM_PER_THREAD = 2

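# Illustrative sketch: each entry below maps a driver function name to ctypes
# types as (restype, *argtypes). A loader could apply an entry like this,
# assuming `libcuda` is a ctypes.CDLL handle for the driver library:
#
#     proto = API_PROTOTYPES['cuDriverGetVersion']
#     fn = getattr(libcuda, 'cuDriverGetVersion')
#     fn.restype, fn.argtypes = proto[0], proto[1:]
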
API_PROTOTYPES = {
    # CUresult cuInit(unsigned int Flags);
    'cuInit': (c_int, c_uint),

    # CUresult cuDriverGetVersion (int* driverVersion )
    'cuDriverGetVersion': (c_int, POINTER(c_int)),

    # CUresult cuDeviceGetCount(int *count);
    'cuDeviceGetCount': (c_int, POINTER(c_int)),

    # CUresult cuDeviceGet(CUdevice *device, int ordinal);
    'cuDeviceGet': (c_int, POINTER(cu_device), c_int),

    # CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
    'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),

    # CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
    #                               CUdevice dev);
    'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
                             cu_device),

    # CUresult cuDeviceComputeCapability(int *major, int *minor,
    #                                    CUdevice dev);
    'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
                                  cu_device),

    # CUresult cuDevicePrimaryCtxGetState(
    #              CUdevice dev,
    #              unsigned int* flags,
    #              int* active)
    'cuDevicePrimaryCtxGetState': (c_int,
                                   cu_device, POINTER(c_uint), POINTER(c_int)),

    # CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
    'cuDevicePrimaryCtxRelease': (c_int, cu_device),

    # CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
    'cuDevicePrimaryCtxReset': (c_int, cu_device),

    # CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
    'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),

    # CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
    'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),

    # CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
    #                      CUdevice dev);
    'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),

    # CUresult cuCtxGetDevice ( CUdevice * device )
    'cuCtxGetDevice': (c_int, POINTER(cu_device)),

    # CUresult cuCtxGetCurrent (CUcontext *pctx);
    'cuCtxGetCurrent': (c_int, POINTER(cu_context)),

    # CUresult cuCtxPushCurrent (CUcontext pctx);
    'cuCtxPushCurrent': (c_int, cu_context),

    # CUresult cuCtxPopCurrent (CUcontext *pctx);
    'cuCtxPopCurrent': (c_int, POINTER(cu_context)),

    # CUresult cuCtxDestroy(CUcontext pctx);
    'cuCtxDestroy': (c_int, cu_context),

    # CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
    #                             unsigned int numOptions,
    #                             CUjit_option *options,
    #                             void **optionValues);
    'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
                           POINTER(cu_jit_option), POINTER(c_void_p)),

    # CUresult cuModuleUnload(CUmodule hmod);
    'cuModuleUnload': (c_int, cu_module),

    # CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
    #                              const char *name);
    'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),

    # CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
    #                              hmod, const char* name )
    'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
                          cu_module, c_char_p),

    # CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
    #                                       CUfunc_cache config);
    'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),

    # CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
    'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),

    # CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
    #                            unsigned int flags);
    'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),

    # CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
    'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),

    # CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
    #                          size_t N, CUstream hStream);
    'cuMemsetD8Async': (c_int,
                        cu_device_ptr, c_uint8, c_size_t, cu_stream),

    # CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
    #                       size_t ByteCount);
    'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),

    # CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
    #                            size_t ByteCount, CUstream hStream);
    'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
                          cu_stream),

    # CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
    #                       size_t ByteCount);
    'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),

    # CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
    #                            size_t ByteCount, CUstream hStream);
    'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
                          cu_stream),

    # CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
    #                       size_t ByteCount);
    'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),

    # CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
    #                            size_t ByteCount, CUstream hStream);
    'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
                          cu_stream),

    # CUresult cuMemFree(CUdeviceptr dptr);
    'cuMemFree': (c_int, cu_device_ptr),

    # CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
    'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),

    # CUresult cuStreamDestroy(CUstream hStream);
    'cuStreamDestroy': (c_int, cu_stream),

    # CUresult cuStreamSynchronize(CUstream hStream);
    'cuStreamSynchronize': (c_int, cu_stream),

    # CUresult cuStreamAddCallback(
    #              CUstream hStream,
    #              CUstreamCallback callback,
    #              void* userData,
    #              unsigned int flags)
    'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
                            py_object, c_uint),

    # CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
    #                         unsigned int gridDimY,
    #                         unsigned int gridDimZ,
    #                         unsigned int blockDimX,
    #                         unsigned int blockDimY,
    #                         unsigned int blockDimZ,
    #                         unsigned int sharedMemBytes,
    #                         CUstream hStream, void **kernelParams,
    #                         void ** extra)
    'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
                       c_uint, c_uint, c_uint, c_uint, cu_stream,
                       POINTER(c_void_p), POINTER(c_void_p)),

    # CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
    #                                    unsigned int gridDimY,
    #                                    unsigned int gridDimZ,
    #                                    unsigned int blockDimX,
    #                                    unsigned int blockDimY,
    #                                    unsigned int blockDimZ,
    #                                    unsigned int sharedMemBytes,
    #                                    CUstream hStream, void **kernelParams)
    'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
                                  c_uint, c_uint, c_uint, c_uint, cu_stream,
                                  POINTER(c_void_p)),

    # CUresult cuMemHostAlloc ( void ** pp,
    #                           size_t bytesize,
    #                           unsigned int Flags
    #                         )
    'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),

    # CUresult cuMemFreeHost ( void * p )
    'cuMemFreeHost': (c_int, c_void_p),

    # CUresult cuMemHostRegister(void * p,
    #                            size_t bytesize,
    #                            unsigned int Flags)
    'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),

    # CUresult cuMemHostUnregister(void * p)
    'cuMemHostUnregister': (c_int, c_void_p),

    # CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
    #                                    void * p,
    #                                    unsigned int Flags)
    'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
                                  c_void_p, c_uint),

    # CUresult cuMemGetInfo(size_t * free, size_t * total)
    'cuMemGetInfo': (c_int, POINTER(c_size_t), POINTER(c_size_t)),

    # CUresult cuEventCreate ( CUevent * phEvent,
    #                          unsigned int Flags )
    'cuEventCreate': (c_int, POINTER(cu_event), c_uint),

    # CUresult cuEventDestroy ( CUevent hEvent )
    'cuEventDestroy': (c_int, cu_event),

    # CUresult cuEventElapsedTime ( float * pMilliseconds,
    #                               CUevent hStart,
    #                               CUevent hEnd )
    'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),

    # CUresult cuEventQuery ( CUevent hEvent )
    'cuEventQuery': (c_int, cu_event),

    # CUresult cuEventRecord ( CUevent hEvent,
    #                          CUstream hStream )
    'cuEventRecord': (c_int, cu_event, cu_stream),

    # CUresult cuEventSynchronize ( CUevent hEvent )
    'cuEventSynchronize': (c_int, cu_event),

    # CUresult cuStreamWaitEvent ( CUstream hStream,
    #                              CUevent hEvent,
    #                              unsigned int Flags )
    'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),

    # CUresult cuPointerGetAttribute (
    #              void *data,
    #              CUpointer_attribute attribute,
    #              CUdeviceptr ptr)
    'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),

    # CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
    #                                 size_t * psize,
    #                                 CUdeviceptr dptr
    #                               )
    'cuMemGetAddressRange': (c_int,
                             POINTER(cu_device_ptr),
                             POINTER(c_size_t),
                             cu_device_ptr),

    # CUresult cuMemHostGetFlags ( unsigned int * pFlags,
    #                              void * p )
    'cuMemHostGetFlags': (c_int,
                          POINTER(c_uint),
                          c_void_p),

    # CUresult cuCtxSynchronize ( void )
    'cuCtxSynchronize': (c_int,),

    # CUresult
    # cuLinkCreate(unsigned int numOptions, CUjit_option *options,
    #              void **optionValues, CUlinkState *stateOut);
    'cuLinkCreate': (c_int,
                     c_uint, POINTER(cu_jit_option),
                     POINTER(c_void_p), POINTER(cu_link_state)),

    # CUresult
    # cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
    #               size_t size, const char *name, unsigned
    #               int numOptions, CUjit_option *options,
    #               void **optionValues);
    'cuLinkAddData': (c_int,
                      cu_link_state, cu_jit_input_type, c_void_p,
                      c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
                      POINTER(c_void_p)),

    # CUresult
    # cuLinkAddFile(CUlinkState state, CUjitInputType type,
    #               const char *path, unsigned int numOptions,
    #               CUjit_option *options, void **optionValues);
    'cuLinkAddFile': (c_int,
                      cu_link_state, cu_jit_input_type, c_char_p, c_uint,
                      POINTER(cu_jit_option), POINTER(c_void_p)),

    # CUresult CUDAAPI
    # cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
    'cuLinkComplete': (c_int,
                       cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),

    # CUresult CUDAAPI
    # cuLinkDestroy(CUlinkState state)
    'cuLinkDestroy': (c_int, cu_link_state),

    # cuProfilerStart ( void )
    'cuProfilerStart': (c_int,),

    # cuProfilerStop ( void )
    'cuProfilerStop': (c_int,),

    # CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
    #                               CUfunction hfunc )
    'cuFuncGetAttribute': (c_int,
                           POINTER(c_int), cu_function_attribute, cu_function),

    # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
    #              int *numBlocks,
    #              CUfunction func,
    #              int blockSize,
    #              size_t dynamicSMemSize);
    'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
                                                    cu_function, c_size_t,
                                                    c_uint),

    # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
    #              int *numBlocks,
    #              CUfunction func,
    #              int blockSize,
    #              size_t dynamicSMemSize,
    #              unsigned int flags);
    'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
                                                             POINTER(c_int),
                                                             cu_function,
                                                             c_size_t, c_uint),

    # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
    #              int *minGridSize, int *blockSize,
    #              CUfunction func,
    #              CUoccupancyB2DSize blockSizeToDynamicSMemSize,
    #              size_t dynamicSMemSize, int blockSizeLimit);
    'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
                                         cu_function, cu_occupancy_b2d_size,
                                         c_size_t, c_int),

    # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
    #              int *minGridSize, int *blockSize,
    #              CUfunction func,
    #              CUoccupancyB2DSize blockSizeToDynamicSMemSize,
    #              size_t dynamicSMemSize, int blockSizeLimit,
    #              unsigned int flags);
    'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
                                                  POINTER(c_int), cu_function,
                                                  cu_occupancy_b2d_size,
                                                  c_size_t, c_int, c_uint),

    # CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
    'cuIpcGetMemHandle': (c_int,
                          POINTER(cu_ipc_mem_handle), cu_device_ptr),

    # CUresult cuIpcOpenMemHandle(
    #              CUdeviceptr* pdptr,
    #              CUipcMemHandle handle,
    #              unsigned int Flags)
    'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
                           c_uint),

    # CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
    'cuIpcCloseMemHandle': (c_int, cu_device_ptr),

    # CUresult cuCtxEnablePeerAccess (CUcontext peerContext,
    #                                 unsigned int Flags)
    'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),

    # CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
    #                                  CUdevice dev, CUdevice peerDev )
    'cuDeviceCanAccessPeer': (c_int,
                              POINTER(c_int), cu_device, cu_device),

    # CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
    'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
}
@@ -0,0 +1,452 @@
from collections import namedtuple
import itertools
import functools
import operator
import ctypes

import numpy as np

from numba import _helperlib

Extent = namedtuple("Extent", ["begin", "end"])

attempt_nocopy_reshape = ctypes.CFUNCTYPE(
    ctypes.c_int,
    ctypes.c_long,  # nd
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # dims
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # strides
    ctypes.c_long,  # newnd
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # newdims
    np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1),  # newstrides
    ctypes.c_long,  # itemsize
    ctypes.c_int,  # is_f_order
)(_helperlib.c_helpers['attempt_nocopy_reshape'])


class Dim(object):
    """A single dimension of the array

    Attributes
    ----------
    start:
        start offset
    stop:
        stop offset
    size:
        number of items
    stride:
        item stride
    """
    __slots__ = 'start', 'stop', 'size', 'stride', 'single'

    def __init__(self, start, stop, size, stride, single):
        self.start = start
        self.stop = stop
        self.size = size
        self.stride = stride
        self.single = single
        assert not single or size == 1

    def __getitem__(self, item):
        if isinstance(item, slice):
            start, stop, step = item.indices(self.size)
            stride = step * self.stride
            start = self.start + start * abs(self.stride)
            stop = self.start + stop * abs(self.stride)
            if stride == 0:
                size = 1
            else:
                size = _compute_size(start, stop, stride)
            ret = Dim(
                start=start,
                stop=stop,
                size=size,
                stride=stride,
                single=False
            )
            return ret
        else:
            sliced = self[item:item + 1] if item != -1 else self[-1:]
            if sliced.size != 1:
                raise IndexError
            return Dim(
                start=sliced.start,
                stop=sliced.stop,
                size=sliced.size,
                stride=sliced.stride,
                single=True,
            )

    def get_offset(self, idx):
        return self.start + idx * self.stride

    def __repr__(self):
        strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)"
        return strfmt % (self.start, self.stop, self.size, self.stride)

    def normalize(self, base):
        return Dim(start=self.start - base, stop=self.stop - base,
                   size=self.size, stride=self.stride, single=self.single)

    def copy(self, start=None, stop=None, size=None, stride=None, single=None):
        if start is None:
            start = self.start
        if stop is None:
            stop = self.stop
        if size is None:
            size = self.size
        if stride is None:
            stride = self.stride
        if single is None:
            single = self.single
        return Dim(start, stop, size, stride, single)

    def is_contiguous(self, itemsize):
        return self.stride == itemsize


def compute_index(indices, dims):
    return sum(d.get_offset(i) for i, d in zip(indices, dims))


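# Illustrative sketch: for a C-ordered 2-D layout with strides (24, 8) and
# itemsize 8, element (1, 2) sits at byte offset 1 * 24 + 2 * 8 == 40.
#
#     arr = Array.from_desc(0, shape=(2, 3), strides=(24, 8), itemsize=8)
#     assert compute_index((1, 2), arr.dims) == 40

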
class Element(object):
    is_array = False

    def __init__(self, extent):
        self.extent = extent

    def iter_contiguous_extent(self):
        yield self.extent


class Array(object):
    """A dummy numpy array-like object. Consider it an array without the
    actual data, but offset from the base data pointer.

    Attributes
    ----------
    dims: tuple of Dim
        describing each dimension of the array

    ndim: int
        number of dimensions

    shape: tuple of int
        size of each dimension

    strides: tuple of int
        stride of each dimension

    itemsize: int
        itemsize

    extent: (start, end)
        start and end offset containing the memory region
    """
    is_array = True

    @classmethod
    def from_desc(cls, offset, shape, strides, itemsize):
        dims = []
        for ashape, astride in zip(shape, strides):
            dim = Dim(offset, offset + ashape * astride, ashape, astride,
                      single=False)
            dims.append(dim)
            offset = 0  # offset only applies to first dimension
        return cls(dims, itemsize)

    def __init__(self, dims, itemsize):
        self.dims = tuple(dims)
        self.ndim = len(self.dims)
        self.shape = tuple(dim.size for dim in self.dims)
        self.strides = tuple(dim.stride for dim in self.dims)
        self.itemsize = itemsize
        self.size = functools.reduce(operator.mul, self.shape, 1)
        self.extent = self._compute_extent()
        self.flags = self._compute_layout()

    def _compute_layout(self):
        # The logic here is based on that in _UpdateContiguousFlags from
        # numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit
        # 13661ac70).
        # https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191

        # Records have no dims, and we can treat them as contiguous
        if not self.dims:
            return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}

        # If this is a broadcast array then it is not contiguous
        if any([dim.stride == 0 for dim in self.dims]):
            return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False}

        flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}

        # Check C contiguity
        sd = self.itemsize
        for dim in reversed(self.dims):
            if dim.size == 0:
                # Contiguous by definition
                return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
            if dim.size != 1:
                if dim.stride != sd:
                    flags['C_CONTIGUOUS'] = False
                sd *= dim.size

        # Check F contiguity
        sd = self.itemsize
        for dim in self.dims:
            if dim.size != 1:
                if dim.stride != sd:
                    flags['F_CONTIGUOUS'] = False
                    return flags
                sd *= dim.size

        return flags

    def _compute_extent(self):
        firstidx = [0] * self.ndim
        lastidx = [s - 1 for s in self.shape]
        start = compute_index(firstidx, self.dims)
        stop = compute_index(lastidx, self.dims) + self.itemsize
        stop = max(stop, start)  # ensure positive extent
        return Extent(start, stop)

    def __repr__(self):
        return '<Array dims=%s itemsize=%s>' % (self.dims, self.itemsize)

    def __getitem__(self, item):
        if not isinstance(item, tuple):
            item = [item]
        else:
            item = list(item)

        nitem = len(item)
        ndim = len(self.dims)
        if nitem > ndim:
            raise IndexError("%d extra indices given" % (nitem - ndim,))

        # Add empty slices for missing indices
        while len(item) < ndim:
            item.append(slice(None, None))

        dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)]
        newshape = [d.size for d in dims if not d.single]

        arr = Array(dims, self.itemsize)
        if newshape:
            return arr.reshape(*newshape)[0]
        else:
            return Element(arr.extent)

    @property
    def is_c_contig(self):
        return self.flags['C_CONTIGUOUS']

    @property
    def is_f_contig(self):
        return self.flags['F_CONTIGUOUS']

    def iter_contiguous_extent(self):
        """ Generates extents
        """
        if self.is_c_contig or self.is_f_contig:
            yield self.extent
        else:
            if self.dims[0].stride < self.dims[-1].stride:
                innerdim = self.dims[0]
                outerdims = self.dims[1:]
                outershape = self.shape[1:]
            else:
                innerdim = self.dims[-1]
                outerdims = self.dims[:-1]
                outershape = self.shape[:-1]

            if innerdim.is_contiguous(self.itemsize):
                oslen = [range(s) for s in outershape]
                for indices in itertools.product(*oslen):
                    base = compute_index(indices, outerdims)
                    yield base + innerdim.start, base + innerdim.stop
            else:
                oslen = [range(s) for s in self.shape]
                for indices in itertools.product(*oslen):
                    offset = compute_index(indices, self.dims)
                    yield offset, offset + self.itemsize

    def reshape(self, *newdims, **kws):
        oldnd = self.ndim
        newnd = len(newdims)

        if newdims == self.shape:
            return self, None

        order = kws.pop('order', 'C')
        if kws:
            raise TypeError('unknown keyword arguments %s' % kws.keys())
        if order not in 'CFA':
            raise ValueError('order not C|F|A')

        # check for exactly one instance of -1 in newdims
        # https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501
        unknownidx = -1
        knownsize = 1
        for i, dim in enumerate(newdims):
            if dim < 0:
                if unknownidx == -1:
                    unknownidx = i
                else:
                    raise ValueError("can only specify one unknown dimension")
            else:
                knownsize *= dim

        # compute the missing dimension
        if unknownidx >= 0:
            if knownsize == 0 or self.size % knownsize != 0:
                raise ValueError("cannot infer valid shape "
                                 "for unknown dimension")
            else:
                newdims = newdims[0:unknownidx] \
                    + (self.size // knownsize,) \
                    + newdims[unknownidx + 1:]

        newsize = functools.reduce(operator.mul, newdims, 1)

        if order == 'A':
            order = 'F' if self.is_f_contig else 'C'

        if newsize != self.size:
            raise ValueError("reshape changes the size of the array")

        if self.is_c_contig or self.is_f_contig:
            if order == 'C':
                newstrides = list(iter_strides_c_contig(self, newdims))
            elif order == 'F':
                newstrides = list(iter_strides_f_contig(self, newdims))
            else:
                raise AssertionError("unreachable")
        else:
            newstrides = np.empty(newnd, np.ctypeslib.c_intp)

            # need to keep these around in variables, not temporaries, so they
            # don't get GC'ed before we call into the C code
            olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp)
            oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp)
            newdims = np.array(newdims, dtype=np.ctypeslib.c_intp)

            if not attempt_nocopy_reshape(
                oldnd,
                olddims,
                oldstrides,
                newnd,
                newdims,
                newstrides,
                self.itemsize,
                order == 'F',
            ):
                raise NotImplementedError('reshape would require copy')

        ret = self.from_desc(self.extent.begin, shape=newdims,
                             strides=newstrides, itemsize=self.itemsize)

        return ret, list(self.iter_contiguous_extent())

    def squeeze(self, axis=None):
        newshape, newstrides = [], []
        if axis is None:
            for length, stride in zip(self.shape, self.strides):
                if length != 1:
                    newshape.append(length)
                    newstrides.append(stride)
        else:
            if not isinstance(axis, tuple):
                axis = (axis,)
            for ax in axis:
                if self.shape[ax] != 1:
                    raise ValueError(
                        "cannot select an axis to squeeze out which has size "
                        "not equal to one"
                    )
            for i, (length, stride) in enumerate(zip(self.shape,
                                                     self.strides)):
                if i not in axis:
                    newshape.append(length)
                    newstrides.append(stride)
        newarr = self.from_desc(
            self.extent.begin,
            shape=newshape,
            strides=newstrides,
            itemsize=self.itemsize,
        )
        return newarr, list(self.iter_contiguous_extent())

    def ravel(self, order='C'):
        if order not in 'CFA':
            raise ValueError('order not C|F|A')

        if (order in 'CA' and self.is_c_contig
                or order in 'FA' and self.is_f_contig):
            newshape = (self.size,)
            newstrides = (self.itemsize,)
            arr = self.from_desc(self.extent.begin, newshape, newstrides,
                                 self.itemsize)
            return arr, list(self.iter_contiguous_extent())

        else:
            raise NotImplementedError("ravel on non-contiguous array")


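# Illustrative sketch: Array carries only layout metadata, so contiguity and
# extents can be checked without touching real data.
#
#     arr = Array.from_desc(0, shape=(2, 3), strides=(24, 8), itemsize=8)
#     assert arr.is_c_contig and not arr.is_f_contig
#     assert arr.extent == Extent(0, 48)

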
def iter_strides_f_contig(arr, shape=None):
    """yields the f-contiguous strides
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    yield itemsize
    sum = 1
    for s in shape[:-1]:
        sum *= s
        yield sum * itemsize


def iter_strides_c_contig(arr, shape=None):
    """yields the c-contiguous strides
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize

    def gen():
        yield itemsize
        sum = 1
        for s in reversed(shape[1:]):
            sum *= s
            yield sum * itemsize

    for i in reversed(list(gen())):
        yield i


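# Illustrative sketch: for shape (2, 3, 4) with itemsize 8,
# iter_strides_c_contig yields (96, 32, 8) and iter_strides_f_contig
# yields (8, 16, 48).

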
def is_element_indexing(item, ndim):
    if isinstance(item, slice):
        return False

    elif isinstance(item, tuple):
        if len(item) == ndim:
            if not any(isinstance(it, slice) for it in item):
                return True

    else:
        return True

    return False


def _compute_size(start, stop, step):
    """Algorithm adapted from cpython rangeobject.c
    """
    if step > 0:
        lo = start
        hi = stop
    else:
        lo = stop
        hi = start
        step = -step
    if lo >= hi:
        return 0
    return (hi - lo - 1) // step + 1
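
# Illustrative sketch: _compute_size matches len(range(...)), e.g.
# _compute_size(0, 10, 3) == len(range(0, 10, 3)) == 4 and
# _compute_size(10, 0, -3) == len(range(10, 0, -3)) == 4.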
@@ -0,0 +1,607 @@
"""
Enum values for the CUDA driver. Information about the values
can be found on the official NVIDIA documentation website.
ref: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
anchor: #group__CUDA__TYPES
"""


# Error codes

CUDA_SUCCESS = 0
CUDA_ERROR_INVALID_VALUE = 1
CUDA_ERROR_OUT_OF_MEMORY = 2
CUDA_ERROR_NOT_INITIALIZED = 3
CUDA_ERROR_DEINITIALIZED = 4
CUDA_ERROR_PROFILER_DISABLED = 5
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
CUDA_ERROR_STUB_LIBRARY = 34
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
CUDA_ERROR_NO_DEVICE = 100
CUDA_ERROR_INVALID_DEVICE = 101
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
CUDA_ERROR_INVALID_IMAGE = 200
CUDA_ERROR_INVALID_CONTEXT = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
CUDA_ERROR_MAP_FAILED = 205
CUDA_ERROR_UNMAP_FAILED = 206
CUDA_ERROR_ARRAY_IS_MAPPED = 207
CUDA_ERROR_ALREADY_MAPPED = 208
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
CUDA_ERROR_ALREADY_ACQUIRED = 210
CUDA_ERROR_NOT_MAPPED = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
CUDA_ERROR_ECC_UNCORRECTABLE = 214
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
CUDA_ERROR_INVALID_PTX = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225
CUDA_ERROR_INVALID_SOURCE = 300
CUDA_ERROR_FILE_NOT_FOUND = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
CUDA_ERROR_OPERATING_SYSTEM = 304
CUDA_ERROR_INVALID_HANDLE = 400
CUDA_ERROR_ILLEGAL_STATE = 401
CUDA_ERROR_NOT_FOUND = 500
CUDA_ERROR_NOT_READY = 600
CUDA_ERROR_ILLEGAL_ADDRESS = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
CUDA_ERROR_LAUNCH_TIMEOUT = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
CUDA_ERROR_ASSERT = 710
CUDA_ERROR_TOO_MANY_PEERS = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
CUDA_ERROR_MISALIGNED_ADDRESS = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
CUDA_ERROR_INVALID_PC = 718
CUDA_ERROR_LAUNCH_FAILED = 719
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
CUDA_ERROR_NOT_PERMITTED = 800
CUDA_ERROR_NOT_SUPPORTED = 801
CUDA_ERROR_SYSTEM_NOT_READY = 802
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
CUDA_ERROR_MPS_RPC_FAILURE = 806
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
CUDA_ERROR_CDP_NOT_SUPPORTED = 811
CUDA_ERROR_CDP_VERSION_MISMATCH = 812
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
CUDA_ERROR_CAPTURED_EVENT = 907
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
CUDA_ERROR_TIMEOUT = 909
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
CUDA_ERROR_EXTERNAL_DEVICE = 911
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
CUDA_ERROR_UNKNOWN = 999


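# Illustrative sketch: the constants above form a flat namespace, so a
# code -> name table for error reporting can be derived from it:
#
#     ERROR_MAP = {v: k for k, v in globals().items()
#                  if k.startswith('CUDA_ERROR_')}
#     assert ERROR_MAP[100] == 'CUDA_ERROR_NO_DEVICE'

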
# Function cache configurations

# no preference for shared memory or L1 (default)
CU_FUNC_CACHE_PREFER_NONE = 0x00
# prefer larger shared memory and smaller L1 cache
CU_FUNC_CACHE_PREFER_SHARED = 0x01
# prefer larger L1 cache and smaller shared memory
CU_FUNC_CACHE_PREFER_L1 = 0x02
# prefer equal sized L1 cache and shared memory
CU_FUNC_CACHE_PREFER_EQUAL = 0x03


# Context creation flags

# Automatic scheduling
CU_CTX_SCHED_AUTO = 0x00
# Set spin as default scheduling
CU_CTX_SCHED_SPIN = 0x01
# Set yield as default scheduling
CU_CTX_SCHED_YIELD = 0x02
# Set blocking synchronization as default scheduling
CU_CTX_SCHED_BLOCKING_SYNC = 0x04

CU_CTX_SCHED_MASK = 0x07
# Support mapped pinned allocations
# This flag was deprecated as of CUDA 11.0 and it no longer has effect.
# All contexts as of CUDA 3.2 behave as though the flag is enabled.
CU_CTX_MAP_HOST = 0x08
# Keep local memory allocation after launch
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10
# Trigger coredumps from exceptions in this context
CU_CTX_COREDUMP_ENABLE = 0x20
# Enable user pipe to trigger coredumps in this context
CU_CTX_USER_COREDUMP_ENABLE = 0x40
# Force synchronous blocking on cudaMemcpy/cudaMemset
CU_CTX_SYNC_MEMOPS = 0x80

CU_CTX_FLAGS_MASK = 0xff


# DEFINES

# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_PORTABLE = 0x01

# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_DEVICEMAP = 0x02

# If set, host memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 streaming load instruction
# (MOVNTDQA).
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_WRITECOMBINED = 0x04


# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_PORTABLE = 0x01

# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_DEVICEMAP = 0x02

# If set, the passed memory pointer is treated as pointing to some
# memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
# On Windows the flag is a no-op. On Linux that memory is marked
# as non cache-coherent for the GPU and is expected
# to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED
# if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older
# Linux kernel versions. On all other platforms, it is not supported
# and CUDA_ERROR_NOT_SUPPORTED is returned.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_IOMEMORY = 0x04

# If set, the passed memory pointer is treated as pointing to memory
# that is considered read-only by the device. On platforms without
# CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
# this flag is required in order to register memory mapped
# to the CPU as read-only. Support for the use of this flag can be
# queried from the device attribute
# CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.
# Using this flag with a current context associated with a device
# that does not have this attribute set will cause cuMemHostRegister
# to error with CUDA_ERROR_NOT_SUPPORTED.
CU_MEMHOSTREGISTER_READ_ONLY = 0x08


# CUDA Mem Attach Flags

# If set, managed memory is accessible from all streams on all devices.
CU_MEM_ATTACH_GLOBAL = 0x01

# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory is
# only accessible on the host (unless explicitly attached to a stream
# with cudaStreamAttachMemAsync, in which case it can be used in kernels
# launched on that stream).
CU_MEM_ATTACH_HOST = 0x02

# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory accesses
# on the associated device must only be from a single stream.
CU_MEM_ATTACH_SINGLE = 0x04


# Event creation flags

# Default event flag
CU_EVENT_DEFAULT = 0x0
# Event uses blocking synchronization
CU_EVENT_BLOCKING_SYNC = 0x1
# Event will not record timing data
CU_EVENT_DISABLE_TIMING = 0x2
# Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
CU_EVENT_INTERPROCESS = 0x4


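# Illustrative sketch: flags combine bitwise; an event intended for IPC must
# also disable timing, i.e.
# CU_EVENT_DISABLE_TIMING | CU_EVENT_INTERPROCESS == 0x6.

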
# Pointer information

# The CUcontext on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_CONTEXT = 1
# The CUmemorytype describing the physical location of a pointer
CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2
# The address at which a pointer's memory may be accessed on the device
CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3
# The address at which a pointer's memory may be accessed on the host
CU_POINTER_ATTRIBUTE_HOST_POINTER = 4
# A pair of tokens for use with the nv-p2p.h Linux kernel interface
CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5
# Synchronize every synchronous memory operation initiated on this region
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6
# A process-wide unique ID for an allocated memory region
CU_POINTER_ATTRIBUTE_BUFFER_ID = 7
# Indicates if the pointer points to managed memory
CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
# A device ordinal of a device on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
# 1 if this pointer maps to an allocation
# that is suitable for cudaIpcGetMemHandle, 0 otherwise
CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10
# Starting address for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11
# Size of the address range for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12
# 1 if this pointer is in a valid address range
# that is mapped to a backing allocation, 0 otherwise
CU_POINTER_ATTRIBUTE_MAPPED = 13
# Bitmask of allowed CUmemAllocationHandleType for this allocation
CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14
# 1 if the memory this pointer is referencing
# can be used with the GPUDirect RDMA API
CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15
# Returns the access flags the device associated
# with the current context has on the corresponding
# memory referenced by the pointer given
CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16
# Returns the mempool handle for the allocation
# if it was allocated from a mempool. Otherwise returns NULL
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17
# Size of the actual underlying mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18
# The start address of the mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19
# A process-wide unique id corresponding to the
# physical allocation the pointer belongs to
CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20


# Memory types

# Host memory
CU_MEMORYTYPE_HOST = 0x01
# Device memory
CU_MEMORYTYPE_DEVICE = 0x02
# Array memory
CU_MEMORYTYPE_ARRAY = 0x03
# Unified device or host memory
CU_MEMORYTYPE_UNIFIED = 0x04


# Device code formats

# Compiled device-class-specific device code
# Applicable options: none
CU_JIT_INPUT_CUBIN = 0

# PTX source code
# Applicable options: PTX compiler options
CU_JIT_INPUT_PTX = 1

# Bundle of multiple cubins and/or PTX of some device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_FATBINARY = 2

# Host object with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_OBJECT = 3

# Archive of host objects with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_LIBRARY = 4

CU_JIT_NUM_INPUT_TYPES = 6


# Online compiler and linker options

# Max number of registers that a thread may use.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_MAX_REGISTERS = 0

# IN: Specifies minimum number of threads per block to target compilation for
# OUT: Returns the number of threads the compiler actually targeted.
# This restricts the resource utilization of the compiler (e.g. max
# registers) such that a block with the given number of threads should be
# able to launch based on register limitations. Note, this option does not
# currently take into account any other resource limitations, such as
# shared memory utilization.
# Cannot be combined with ::CU_JIT_TARGET.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_THREADS_PER_BLOCK = 1

# Overwrites the option value with the total wall clock time, in
# milliseconds, spent in the compiler and linker
# Option type: float
# Applies to: compiler and linker
CU_JIT_WALL_TIME = 2

# Pointer to a buffer in which to print any log messages
# that are informational in nature (the buffer size is specified via
# option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER = 3

# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4

# Pointer to a buffer in which to print any log messages that
# reflect errors (the buffer size is specified via option
# ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER = 5

# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6

# Level of optimizations to apply to generated code (0 - 4), with 4
# being the default and highest level of optimizations.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_OPTIMIZATION_LEVEL = 7

# No option value required. Determines the target based on the current
# attached context (default)
# Option type: No option value needed
# Applies to: compiler and linker
CU_JIT_TARGET_FROM_CUCONTEXT = 8

# Target is chosen based on supplied ::CUjit_target. Cannot be
# combined with ::CU_JIT_THREADS_PER_BLOCK.
# Option type: unsigned int for enumerated type ::CUjit_target
# Applies to: compiler and linker
CU_JIT_TARGET = 9

# Specifies choice of fallback strategy if matching cubin is not found.
# Choice is based on supplied ::CUjit_fallback.
# Option type: unsigned int for enumerated type ::CUjit_fallback
# Applies to: compiler only
CU_JIT_FALLBACK_STRATEGY = 10

# Specifies whether to create debug information in output (-g)
# (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_GENERATE_DEBUG_INFO = 11

# Generate verbose log messages (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_LOG_VERBOSE = 12

# Generate line number information (-lineinfo) (0: false, default)
# Option type: int
# Applies to: compiler only
CU_JIT_GENERATE_LINE_INFO = 13

# Specifies whether to enable caching explicitly (-dlcm)
# Choice is based on supplied ::CUjit_cacheMode_enum.
# Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum
# Applies to: compiler only
CU_JIT_CACHE_MODE = 14


# CUfunction_attribute
|
||||
|
||||
# The maximum number of threads per block, beyond which a launch of the
|
||||
# function would fail. This number depends on both the function and the
|
||||
# device on which the function is currently loaded.
|
||||
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0
|
||||
|
||||
# The size in bytes of statically-allocated shared memory required by
|
||||
# this function. This does not include dynamically-allocated shared
|
||||
# memory requested by the user at runtime.
|
||||
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1
|
||||
|
||||
# The size in bytes of user-allocated constant memory required by this
|
||||
# function.
|
||||
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2
|
||||
|
||||
# The size in bytes of local memory used by each thread of this function.
|
||||
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3
|
||||
|
||||
# The number of registers used by each thread of this function.
|
||||
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
|
||||
|
||||
# The PTX virtual architecture version for which the function was
|
||||
# compiled. This value is the major PTX version * 10 + the minor PTX
|
||||
# version, so a PTX version 1.3 function would return the value 13.
|
||||
# Note that this may return the undefined value of 0 for cubins
|
||||
# compiled prior to CUDA 3.0.
|
||||
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5
|
||||
|
||||
# The binary architecture version for which the function was compiled.
|
||||
# This value is the major binary version * 10 + the minor binary version,
|
||||
# so a binary version 1.3 function would return the value 13. Note that
|
||||
# this will return a value of 10 for legacy cubins that do not have a
|
||||
# properly-encoded binary architecture version.
|
||||
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6
|
||||
|
||||
# The attribute to indicate whether the function has been compiled
|
||||
# with user specified option "-Xptxas --dlcm=ca" set
|
||||
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7
|
||||
|
||||
# The maximum size in bytes of dynamically-allocated shared memory
|
||||
# that can be used by this function. If the user-specified
|
||||
# dynamic shared memory size is larger than this value,
|
||||
# the launch will fail. See cuFuncSetAttribute, cuKernelSetAttribute
|
||||
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8
|
||||
|
||||
# On devices where the L1 cache and shared memory use the same
|
||||
# hardware resources, this sets the shared memory carveout preference,
|
||||
# in percent of the total shared memory. Refer to
|
||||
# CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
|
||||
# This is only a hint, and the driver can choose a different ratio
|
||||
# if required to execute the function.
|
||||
# See cuFuncSetAttribute, cuKernelSetAttribute
|
||||
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9
|
||||
|
||||
# If this attribute is set, the kernel must launch with a valid cluster
|
||||
# size specified. See cuFuncSetAttribute, cuKernelSetAttribute
|
||||
CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10
|
||||
|
||||
# The required cluster width in blocks. The values must either all be 0
|
||||
# or all be positive. The validity of the cluster dimensions
|
||||
# is otherwise checked at launch time. If the value is set during
|
||||
# compile time, it cannot be set at runtime.
|
||||
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
|
||||
# See cuFuncSetAttribute, cuKernelSetAttribute
|
||||
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11
|
||||
|
||||
# The required cluster height in blocks. The values must either all be 0
|
||||
# or all be positive. The validity of the cluster dimensions
|
||||
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12

# The required cluster depth in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13

# Whether the function can be launched with non-portable cluster size.
# 1 is allowed, 0 is disallowed. A non-portable cluster size may only
# function on the specific SKUs the program is tested on.
# The launch might fail if the program is run on a different hardware platform.
# For more details refer to:
# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES
CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14

# The block scheduling policy of a function.
# The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15


# Device attributes

CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_WIDTH = 21
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_WIDTH = 22
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_HEIGHT = 23
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH = 24
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT = 25
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH = 26
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_WIDTH = 27
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_HEIGHT = 28
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_LAYERS = 29
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTI_PROCESSOR = 39
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_WIDTH = 42
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_LAYERS = 43
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_WIDTH = 45
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_HEIGHT = 46
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH_ALT = 47
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT_ALT = 48
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH_ALT = 49
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_WIDTH = 52
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_WIDTH = 53
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_LAYERS = 54
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_WIDTH = 55
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_WIDTH = 56
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_HEIGHT = 57
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_WIDTH = 58
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_HEIGHT = 59
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_DEPTH = 60
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_WIDTH = 61
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_LAYERS = 62
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_WIDTH = 63
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_HEIGHT = 64
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_LAYERS = 65
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_WIDTH = 66
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_WIDTH = 67
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_LAYERS = 68
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LINEAR_WIDTH = 69
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_WIDTH = 70
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_HEIGHT = 71
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_PITCH = 72
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_WIDTH = 73
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_HEIGHT = 74
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_MIPMAPPED_WIDTH = 77
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83
CU_DEVICE_ATTRIBUTE_IS_MULTI_GPU_BOARD = 84
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97
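
# Illustrative example (not part of the original module): these enum values
# name the attributes queried through cuDeviceGetAttribute. In Numba, Device
# objects resolve attribute names against this set, so a query might look
# like the following (assuming a CUDA-capable device and driver are present):
#
#     from numba import cuda
#     dev = cuda.get_current_device()
#     dev.MAX_THREADS_PER_BLOCK   # resolved via
#                                 # CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK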
@@ -0,0 +1,36 @@
class CudaDriverError(Exception):
    pass


class CudaRuntimeError(Exception):
    pass


class CudaSupportError(ImportError):
    pass


class NvvmError(Exception):
    def __str__(self):
        return '\n'.join(map(str, self.args))


class NvvmSupportError(ImportError):
    pass


class NvvmWarning(Warning):
    pass


class NvrtcError(Exception):
    def __str__(self):
        return '\n'.join(map(str, self.args))


class NvrtcCompilationError(NvrtcError):
    pass


class NvrtcSupportError(ImportError):
    pass
@@ -0,0 +1,176 @@
"""CUDA Toolkit libraries lookup utilities.

CUDA Toolkit libraries can be available via either:

- the `cuda-nvcc` and `cuda-nvrtc` conda packages for CUDA 12,
- the `cudatoolkit` conda package for CUDA 11,
- a user supplied location from CUDA_HOME,
- a system wide location,
- package-specific locations (e.g. the Debian NVIDIA packages),
- or can be discovered by the system loader.
"""

import os
import sys
import ctypes

from numba.misc.findlib import find_lib
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
from numba.cuda.cudadrv.error import CudaSupportError


if sys.platform == 'win32':
    _dllnamepattern = '%s.dll'
    _staticnamepattern = '%s.lib'
elif sys.platform == 'darwin':
    _dllnamepattern = 'lib%s.dylib'
    _staticnamepattern = 'lib%s.a'
else:
    _dllnamepattern = 'lib%s.so'
    _staticnamepattern = 'lib%s.a'


def get_libdevice():
    d = get_cuda_paths()
    paths = d['libdevice'].info
    return paths


def open_libdevice():
    with open(get_libdevice(), 'rb') as bcfile:
        return bcfile.read()


def get_cudalib(lib, static=False):
    """
    Find the path of a CUDA library based on a search of known locations. If
    the search fails, return a generic filename for the library (e.g.
    'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system
    loader's search mechanism.
    """
    if lib == 'nvvm':
        return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm'
    else:
        dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
        libdir = get_cuda_paths()[dir_type].info

    candidates = find_lib(lib, libdir, static=static)
    namepattern = _staticnamepattern if static else _dllnamepattern
    return max(candidates) if candidates else namepattern % lib
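
# Example of the fallback behaviour (illustrative only; actual results depend
# on the local installation): if the search locations contain the library,
# get_cudalib returns a full path, otherwise it returns the bare generic name
# so the system loader can search for it:
#
#     >>> get_cudalib('nvrtc')        # found in a known location
#     '/usr/local/cuda/lib64/libnvrtc.so'
#     >>> get_cudalib('nonexistent')  # falls back to the generic name
#     'libnonexistent.so'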


def open_cudalib(lib):
    path = get_cudalib(lib)
    return ctypes.CDLL(path)


def check_static_lib(path):
    if not os.path.isfile(path):
        raise FileNotFoundError(f'{path} not found')


def _get_source_variable(lib, static=False):
    if lib == 'nvvm':
        return get_cuda_paths()['nvvm'].by
    elif lib == 'libdevice':
        return get_cuda_paths()['libdevice'].by
    else:
        dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
        return get_cuda_paths()[dir_type].by


def test():
    """Test library lookup. Path info is printed to stdout.
    """
    failed = False

    # Check for the driver
    try:
        dlloader, candidates = locate_driver_and_loader()
        print('Finding driver from candidates:')
        for location in candidates:
            print(f'\t{location}')
        print(f'Using loader {dlloader}')
        print('\tTrying to load driver', end='...')
        dll, path = load_driver(dlloader, candidates)
        print('\tok')
        print(f'\t\tLoaded from {path}')
    except CudaSupportError as e:
        print(f'\tERROR: failed to open driver: {e}')
        failed = True

    # Find the absolute location of the driver on Linux. Various
    # driver-related issues have been reported by WSL2 users, and they are
    # almost always due to a Linux (i.e. non-WSL2) driver being installed on
    # a WSL2 system. The absolute location of the driver reveals its version
    # number in the soname (e.g. "libcuda.so.530.30.02"), which can be used
    # to check whether the driver was intended for "native" Linux.
    if sys.platform == 'linux' and not failed:
        pid = os.getpid()
        mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps')
        try:
            with open(mapsfile) as f:
                maps = f.read()
        # It's difficult to predict all that might go wrong reading the maps
        # file - in case various error conditions ensue (the file is not found,
        # not readable, etc.) we use OSError to hopefully catch any of them.
        except OSError:
            # It's helpful to report that this went wrong to the user, but we
            # don't set failed to True because this doesn't have any connection
            # to actual CUDA functionality.
            print(f'\tERROR: Could not open {mapsfile} to determine absolute '
                  'path to libcuda.so')
        else:
            # In this case we could read the maps, so we can report the
            # relevant ones to the user
            locations = set(s for s in maps.split() if 'libcuda.so' in s)
            print('\tMapped libcuda.so paths:')
            for location in locations:
                print(f'\t\t{location}')

    # Checks for dynamic libraries
    libs = 'nvvm nvrtc cudart'.split()
    for lib in libs:
        path = get_cudalib(lib)
        print('Finding {} from {}'.format(lib, _get_source_variable(lib)))
        print('\tLocated at', path)

        try:
            print('\tTrying to open library', end='...')
            open_cudalib(lib)
            print('\tok')
        except OSError as e:
            print('\tERROR: failed to open %s:\n%s' % (lib, e))
            failed = True

    # Check for cudadevrt (the only static library)
    lib = 'cudadevrt'
    path = get_cudalib(lib, static=True)
    print('Finding {} from {}'.format(lib, _get_source_variable(lib,
                                                                static=True)))
    print('\tLocated at', path)

    try:
        print('\tChecking library', end='...')
        check_static_lib(path)
        print('\tok')
    except FileNotFoundError as e:
        print('\tERROR: failed to find %s:\n%s' % (lib, e))
        failed = True

    # Check for libdevice
    where = _get_source_variable('libdevice')
    print(f'Finding libdevice from {where}')
    path = get_libdevice()
    print('\tLocated at', path)

    try:
        print('\tChecking library', end='...')
        check_static_lib(path)
        print('\tok')
    except FileNotFoundError as e:
        print('\tERROR: failed to find libdevice:\n%s' % e)
        failed = True

    return not failed
@@ -0,0 +1,20 @@
from numba.cuda.cudadrv import devices, driver
from numba.core.registry import cpu_target


def _calc_array_sizeof(ndim):
    """
    Use the ABI size in the CPU target
    """
    ctx = cpu_target.target_context
    return ctx.calc_array_sizeof(ndim)


def ndarray_device_allocate_data(ary):
    """
    Allocate gpu data buffer
    """
    datasize = driver.host_memory_size(ary)
    # allocate
    gpu_data = devices.get_context().memalloc(datasize)
    return gpu_data
@@ -0,0 +1,260 @@
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
from enum import IntEnum
from numba.core import config
from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
                                      NvrtcSupportError)

import functools
import os
import threading
import warnings

# Opaque handle for compilation unit
nvrtc_program = c_void_p

# Result code
nvrtc_result = c_int


class NvrtcResult(IntEnum):
    NVRTC_SUCCESS = 0
    NVRTC_ERROR_OUT_OF_MEMORY = 1
    NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
    NVRTC_ERROR_INVALID_INPUT = 3
    NVRTC_ERROR_INVALID_PROGRAM = 4
    NVRTC_ERROR_INVALID_OPTION = 5
    NVRTC_ERROR_COMPILATION = 6
    NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
    NVRTC_ERROR_INTERNAL_ERROR = 11


_nvrtc_lock = threading.Lock()


class NvrtcProgram:
    """
    A class for managing the lifetime of nvrtcProgram instances. Instances of
    the class own an nvrtcProgram; when an instance is deleted, the underlying
    nvrtcProgram is destroyed using the appropriate NVRTC API.
    """
    def __init__(self, nvrtc, handle):
        self._nvrtc = nvrtc
        self._handle = handle

    @property
    def handle(self):
        return self._handle

    def __del__(self):
        if self._handle:
            self._nvrtc.destroy_program(self)


class NVRTC:
    """
    Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
    calls.

    The sole instance of this class is a process-wide singleton, similar to the
    NVVM interface. Initialization is protected by a lock and uses the standard
    (for Numba) open_cudalib function to load the NVRTC library.
    """
    _PROTOTYPES = {
        # nvrtcResult nvrtcVersion(int *major, int *minor)
        'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
        # nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
        #                                const char *src,
        #                                const char *name,
        #                                int numHeaders,
        #                                const char * const *headers,
        #                                const char * const *includeNames)
        'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p,
                               c_int, POINTER(c_char_p), POINTER(c_char_p)),
        # nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
        'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)),
        # nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
        #                                 int numOptions,
        #                                 const char * const *options)
        'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int,
                                POINTER(c_char_p)),
        # nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
        'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
        # nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
        'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p),
        # nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
        #                               size_t *cubinSizeRet);
        'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
        # nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
        'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p),
        # nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
        #                                    size_t *logSizeRet);
        'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program,
                                   POINTER(c_size_t)),
        # nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
        'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p),
    }

    # Singleton reference
    __INSTANCE = None

    def __new__(cls):
        with _nvrtc_lock:
            if cls.__INSTANCE is None:
                from numba.cuda.cudadrv.libs import open_cudalib
                cls.__INSTANCE = inst = object.__new__(cls)
                try:
                    lib = open_cudalib('nvrtc')
                except OSError as e:
                    cls.__INSTANCE = None
                    raise NvrtcSupportError("NVRTC cannot be loaded") from e

                # Find & populate functions
                for name, proto in inst._PROTOTYPES.items():
                    func = getattr(lib, name)
                    func.restype = proto[0]
                    func.argtypes = proto[1:]

                    @functools.wraps(func)
                    def checked_call(*args, func=func, name=name):
                        error = func(*args)
                        if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
                            raise NvrtcCompilationError()
                        elif error != NvrtcResult.NVRTC_SUCCESS:
                            try:
                                error_name = NvrtcResult(error).name
                            except ValueError:
                                error_name = ('Unknown nvrtc_result '
                                              f'(error code: {error})')
                            msg = f'Failed to call {name}: {error_name}'
                            raise NvrtcError(msg)

                    setattr(inst, name, checked_call)

        return cls.__INSTANCE

    def get_version(self):
        """
        Get the NVRTC version as a tuple (major, minor).
        """
        major = c_int()
        minor = c_int()
        self.nvrtcVersion(byref(major), byref(minor))
        return major.value, minor.value

    def create_program(self, src, name):
        """
        Create an NVRTC program with managed lifetime.
        """
        if isinstance(src, str):
            src = src.encode()
        if isinstance(name, str):
            name = name.encode()

        handle = nvrtc_program()

        # The final three arguments are for passing the contents of headers -
        # this is not supported, so there are 0 headers and the header names
        # and contents are null.
        self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
        return NvrtcProgram(self, handle)

    def compile_program(self, program, options):
        """
        Compile an NVRTC program. Compilation may fail due to a user error in
        the source; this function returns ``True`` if there is a compilation
        error and ``False`` on success.
        """
        # We hold a list of encoded options to ensure they can't be collected
        # prior to the call to nvrtcCompileProgram
        encoded_options = [opt.encode() for opt in options]
        option_pointers = [c_char_p(opt) for opt in encoded_options]
        c_options_type = (c_char_p * len(options))
        c_options = c_options_type(*option_pointers)
        try:
            self.nvrtcCompileProgram(program.handle, len(options), c_options)
            return False
        except NvrtcCompilationError:
            return True

    def destroy_program(self, program):
        """
        Destroy an NVRTC program.
        """
        self.nvrtcDestroyProgram(byref(program.handle))

    def get_compile_log(self, program):
        """
        Get the compile log as a Python string.
        """
        log_size = c_size_t()
        self.nvrtcGetProgramLogSize(program.handle, byref(log_size))

        log = (c_char * log_size.value)()
        self.nvrtcGetProgramLog(program.handle, log)

        return log.value.decode()

    def get_ptx(self, program):
        """
        Get the compiled PTX as a Python string.
        """
        ptx_size = c_size_t()
        self.nvrtcGetPTXSize(program.handle, byref(ptx_size))

        ptx = (c_char * ptx_size.value)()
        self.nvrtcGetPTX(program.handle, ptx)

        return ptx.value.decode()


def compile(src, name, cc):
    """
    Compile a CUDA C/C++ source to PTX for a given compute capability.

    :param src: The source code to compile
    :type src: str
    :param name: The filename of the source (for information only)
    :type name: str
    :param cc: A tuple ``(major, minor)`` of the compute capability
    :type cc: tuple
    :return: The compiled PTX and compilation log
    :rtype: tuple
    """
    nvrtc = NVRTC()
    program = nvrtc.create_program(src, name)

    # Compilation options:
    # - Compile for the current device's compute capability.
    # - The CUDA include path is added.
    # - Relocatable Device Code (rdc) is needed to prevent device functions
    #   being optimized away.
    major, minor = cc
    arch = f'--gpu-architecture=compute_{major}{minor}'
    include = f'-I{config.CUDA_INCLUDE_PATH}'

    cudadrv_path = os.path.dirname(os.path.abspath(__file__))
    numba_cuda_path = os.path.dirname(cudadrv_path)
    numba_include = f'-I{numba_cuda_path}'
    options = [arch, include, numba_include, '-rdc', 'true']

    # Compile the program
    compile_error = nvrtc.compile_program(program, options)

    # Get log from compilation
    log = nvrtc.get_compile_log(program)

    # If the compile failed, provide the log in an exception
    if compile_error:
        msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}')
        raise NvrtcError(msg)

    # Otherwise, if there's any content in the log, present it as a warning
    if log:
        msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
        warnings.warn(msg)

    ptx = nvrtc.get_ptx(program)
    return ptx, log
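
# Minimal usage sketch (illustrative only; it requires a loadable NVRTC
# library but does not touch a GPU):
#
#     src = '''
#     extern "C" __device__ int add(int *out, int a, int b) {
#         *out = a + b;
#         return 0;
#     }
#     '''
#     ptx, log = compile(src, 'add.cu', cc=(7, 5))
#     # ptx holds the generated PTX text; log holds any NVRTC messages.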
@@ -0,0 +1,707 @@
"""
This is a direct translation of nvvm.h
"""
import logging
import re
import sys
import warnings
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
                    c_char)

import threading

from llvmlite import ir

from .error import NvvmError, NvvmSupportError, NvvmWarning
from .libs import get_libdevice, open_libdevice, open_cudalib
from numba.core import cgutils, config


logger = logging.getLogger(__name__)

ADDRSPACE_GENERIC = 0
ADDRSPACE_GLOBAL = 1
ADDRSPACE_SHARED = 3
ADDRSPACE_CONSTANT = 4
ADDRSPACE_LOCAL = 5

# Opaque handle for compilation unit
nvvm_program = c_void_p

# Result code
nvvm_result = c_int

RESULT_CODE_NAMES = '''
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
'''.split()

for i, k in enumerate(RESULT_CODE_NAMES):
    setattr(sys.modules[__name__], k, i)

# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.

_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
                        'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
                        'v64:64:64-v128:128:128-n16:32:64')
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
                    'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
                    'v64:64:64-v128:128:128-n16:32:64')


def is_available():
    """
    Return whether libNVVM is available.
"""
|
||||
try:
|
||||
NVVM()
|
||||
except NvvmSupportError:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
_nvvm_lock = threading.Lock()
|
||||
|
||||
|
||||
class NVVM(object):
|
||||
'''Process-wide singleton.
|
||||
'''
|
||||
_PROTOTYPES = {
|
||||
|
||||
# nvvmResult nvvmVersion(int *major, int *minor)
|
||||
'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
|
||||
|
||||
# nvvmResult nvvmCreateProgram(nvvmProgram *cu)
|
||||
'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
|
||||
|
||||
# nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
|
||||
'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
|
||||
|
||||
# nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
|
||||
# size_t size, const char *name)
|
||||
'nvvmAddModuleToProgram': (
|
||||
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
|
||||
|
||||
# nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
|
||||
# const char* buffer,
|
||||
# size_t size,
|
||||
# const char *name)
|
||||
'nvvmLazyAddModuleToProgram': (
|
||||
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
|
||||
|
||||
# nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
|
||||
# const char **options)
|
||||
'nvvmCompileProgram': (
|
||||
nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
|
||||
|
||||
# nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
|
||||
# size_t *bufferSizeRet)
|
||||
'nvvmGetCompiledResultSize': (
|
||||
nvvm_result, nvvm_program, POINTER(c_size_t)),
|
||||
|
||||
# nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
|
||||
'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
|
||||
|
||||
# nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
|
||||
# size_t *bufferSizeRet)
|
||||
'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
|
||||
|
||||
# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
|
||||
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
|
||||
|
||||
# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
|
||||
# int* minorDbg )
|
||||
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
|
||||
POINTER(c_int), POINTER(c_int)),
|
||||
# nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
|
||||
# const char** options)
|
||||
'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
|
||||
POINTER(c_char_p))
|
||||
}
|
||||
|
||||
# Singleton reference
|
||||
__INSTANCE = None
|
||||
|
||||
def __new__(cls):
|
||||
with _nvvm_lock:
|
||||
if cls.__INSTANCE is None:
|
||||
cls.__INSTANCE = inst = object.__new__(cls)
|
||||
try:
|
||||
inst.driver = open_cudalib('nvvm')
|
||||
except OSError as e:
|
||||
cls.__INSTANCE = None
|
||||
errmsg = ("libNVVM cannot be found. Do `conda install "
|
||||
"cudatoolkit`:\n%s")
|
||||
raise NvvmSupportError(errmsg % e)
|
||||
|
||||
# Find & populate functions
|
||||
for name, proto in inst._PROTOTYPES.items():
|
||||
func = getattr(inst.driver, name)
|
||||
func.restype = proto[0]
|
||||
func.argtypes = proto[1:]
|
||||
setattr(inst, name, func)
|
||||
|
||||
return cls.__INSTANCE
|
||||
|
||||
def __init__(self):
|
||||
ir_versions = self.get_ir_version()
|
||||
self._majorIR = ir_versions[0]
|
||||
self._minorIR = ir_versions[1]
|
||||
self._majorDbg = ir_versions[2]
|
||||
self._minorDbg = ir_versions[3]
|
||||
self._supported_ccs = get_supported_ccs()
|
||||
|
||||
@property
|
||||
def data_layout(self):
|
||||
if (self._majorIR, self._minorIR) < (1, 8):
|
||||
return _datalayout_original
|
||||
else:
|
||||
return _datalayout_i128
|
||||
|
||||
@property
|
||||
def supported_ccs(self):
|
||||
return self._supported_ccs
|
||||
|
||||
def get_version(self):
|
||||
major = c_int()
|
||||
minor = c_int()
|
||||
err = self.nvvmVersion(byref(major), byref(minor))
|
||||
self.check_error(err, 'Failed to get version.')
|
||||
return major.value, minor.value
|
||||
|
||||
def get_ir_version(self):
|
||||
majorIR = c_int()
|
||||
minorIR = c_int()
|
||||
majorDbg = c_int()
|
||||
minorDbg = c_int()
|
||||
err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
|
||||
byref(majorDbg), byref(minorDbg))
|
||||
self.check_error(err, 'Failed to get IR version.')
|
||||
return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value
|
||||
|
||||
def check_error(self, error, msg, exit=False):
|
||||
if error:
|
||||
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
|
||||
if exit:
|
||||
print(exc)
|
||||
sys.exit(1)
|
||||
else:
|
||||
raise exc
|
||||
|
||||
|
||||
class CompilationUnit(object):
|
||||
def __init__(self):
|
||||
self.driver = NVVM()
|
||||
self._handle = nvvm_program()
|
||||
err = self.driver.nvvmCreateProgram(byref(self._handle))
|
||||
self.driver.check_error(err, 'Failed to create CU')
|
||||
|
||||
def __del__(self):
|
||||
driver = NVVM()
|
||||
err = driver.nvvmDestroyProgram(byref(self._handle))
|
||||
driver.check_error(err, 'Failed to destroy CU', exit=True)
|
||||
|
||||
def add_module(self, buffer):
|
||||
"""
|
||||
Add a module level NVVM IR to a compilation unit.
|
||||
- The buffer should contain an NVVM module IR either in the bitcode
|
||||
representation (LLVM3.0) or in the text representation.
|
||||
"""
|
||||
err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
|
||||
len(buffer), None)
|
||||
self.driver.check_error(err, 'Failed to add module')
|
||||
|
||||
def lazy_add_module(self, buffer):
|
||||
"""
|
||||
Lazily add an NVVM IR module to a compilation unit.
|
||||
The buffer should contain NVVM module IR either in the bitcode
|
||||
representation or in the text representation.
|
||||
"""
|
||||
err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
|
||||
len(buffer), None)
|
||||
self.driver.check_error(err, 'Failed to add module')
|
||||
|
||||
def compile(self, **options):
|
||||
"""Perform Compilation.
|
||||
|
||||
Compilation options are accepted as keyword arguments, with the
|
||||
following considerations:
|
||||
|
||||
- Underscores (`_`) in option names are converted to dashes (`-`), to
|
||||
match NVVM's option name format.
|
||||
- Options that take a value will be emitted in the form
|
||||
"-<name>=<value>".
|
||||
- Booleans passed as option values will be converted to integers.
|
||||
- Options which take no value (such as `-gen-lto`) should have a value
|
||||
of `None` passed in and will be emitted in the form "-<name>".
|
||||
|
||||
For documentation on NVVM compilation options, see the CUDA Toolkit
|
||||
Documentation:
|
||||
|
||||
https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
|
||||
"""
|
||||
|
||||
def stringify_option(k, v):
|
||||
k = k.replace('_', '-')
|
||||
|
||||
if v is None:
|
||||
return f'-{k}'
|
||||
|
||||
if isinstance(v, bool):
|
||||
v = int(v)
|
||||
|
||||
return f'-{k}={v}'
|
||||
|
||||
options = [stringify_option(k, v) for k, v in options.items()]
|
||||
|
||||
c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
|
||||
for x in options])
|
||||
# verify
|
||||
err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
|
||||
self._try_error(err, 'Failed to verify\n')
|
||||
|
||||
# compile
|
||||
err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
|
||||
self._try_error(err, 'Failed to compile\n')
|
||||
|
||||
# get result
|
||||
reslen = c_size_t()
|
||||
err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
|
||||
|
||||
self._try_error(err, 'Failed to get size of compiled result.')
|
||||
|
||||
output_buffer = (c_char * reslen.value)()
|
||||
err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
|
||||
self._try_error(err, 'Failed to get compiled result.')
|
||||
|
||||
# get log
|
||||
self.log = self.get_log()
|
||||
if self.log:
|
||||
warnings.warn(self.log, category=NvvmWarning)
|
||||
|
||||
return output_buffer[:]
|
||||
|
||||
def _try_error(self, err, msg):
|
||||
self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))
|
||||
|
||||
def get_log(self):
|
||||
reslen = c_size_t()
|
||||
err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
|
||||
self.driver.check_error(err, 'Failed to get compilation log size.')
|
||||
|
||||
if reslen.value > 1:
|
||||
logbuf = (c_char * reslen.value)()
|
||||
err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
|
||||
self.driver.check_error(err, 'Failed to get compilation log.')
|
||||
|
||||
return logbuf.value.decode('utf8') # populate log attribute
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
COMPUTE_CAPABILITIES = (
|
||||
(3, 5), (3, 7),
|
||||
(5, 0), (5, 2), (5, 3),
|
||||
(6, 0), (6, 1), (6, 2),
|
||||
(7, 0), (7, 2), (7, 5),
|
||||
(8, 0), (8, 6), (8, 7), (8, 9),
|
||||
(9, 0)
|
||||
)
|
||||
|
||||
# Maps CTK version -> (min supported cc, max supported cc) inclusive
|
||||
CTK_SUPPORTED = {
|
||||
(11, 2): ((3, 5), (8, 6)),
|
||||
(11, 3): ((3, 5), (8, 6)),
|
||||
(11, 4): ((3, 5), (8, 7)),
|
||||
(11, 5): ((3, 5), (8, 7)),
|
||||
(11, 6): ((3, 5), (8, 7)),
|
||||
(11, 7): ((3, 5), (8, 7)),
|
||||
(11, 8): ((3, 5), (9, 0)),
|
||||
(12, 0): ((5, 0), (9, 0)),
|
||||
(12, 1): ((5, 0), (9, 0)),
|
||||
(12, 2): ((5, 0), (9, 0)),
|
||||
(12, 3): ((5, 0), (9, 0)),
|
||||
(12, 4): ((5, 0), (9, 0)),
|
||||
}
|
||||
|
||||
|
||||
def ccs_supported_by_ctk(ctk_version):
|
||||
try:
|
||||
# For supported versions, we look up the range of supported CCs
|
||||
min_cc, max_cc = CTK_SUPPORTED[ctk_version]
|
||||
return tuple([cc for cc in COMPUTE_CAPABILITIES
|
||||
if min_cc <= cc <= max_cc])
|
||||
except KeyError:
|
||||
# For unsupported CUDA toolkit versions, all we can do is assume all
|
||||
# non-deprecated versions we are aware of are supported.
|
||||
return tuple([cc for cc in COMPUTE_CAPABILITIES
|
||||
if cc >= config.CUDA_DEFAULT_PTX_CC])
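
# Worked example: for CTK 12.0 the supported range is ((5, 0), (9, 0)), so
# ccs_supported_by_ctk((12, 0)) filters COMPUTE_CAPABILITIES down to
# ((5, 0), (5, 2), (5, 3), (6, 0), (6, 1), (6, 2), (7, 0), (7, 2), (7, 5),
#  (8, 0), (8, 6), (8, 7), (8, 9), (9, 0)).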


def get_supported_ccs():
    try:
        from numba.cuda.cudadrv.runtime import runtime
        cudart_version = runtime.get_version()
    except:  # noqa: E722
        # We can't support anything if there's an error getting the runtime
        # version (e.g. if it's not present or there's another issue)
        _supported_cc = ()
        return _supported_cc

    # Ensure the minimum CTK version requirement is met
    min_cudart = min(CTK_SUPPORTED)
    if cudart_version < min_cudart:
        _supported_cc = ()
        ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
        unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
                           f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
                           "required version.")
        warnings.warn(unsupported_ver)
        return _supported_cc

    _supported_cc = ccs_supported_by_ctk(cudart_version)
    return _supported_cc


def find_closest_arch(mycc):
    """
    Given a compute capability, return the closest compute capability supported
    by the CUDA toolkit.

    :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
    :return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
    """
    supported_ccs = NVVM().supported_ccs

    if not supported_ccs:
        msg = "No supported GPU compute capabilities found. " \
              "Please check your cudatoolkit version matches your CUDA version."
        raise NvvmSupportError(msg)

    for i, cc in enumerate(supported_ccs):
        if cc == mycc:
            # Matches
            return cc
        elif cc > mycc:
            # Exceeded
            if i == 0:
                # CC lower than supported
msg = "GPU compute capability %d.%d is not supported" \
|
||||
"(requires >=%d.%d)" % (mycc + cc)
|
||||
                raise NvvmSupportError(msg)
            else:
                # return the previous CC
                return supported_ccs[i - 1]

    # CC higher than supported
    return supported_ccs[-1]  # Choose the highest


def get_arch_option(major, minor):
    """Matches with the closest architecture option
    """
    if config.FORCE_CUDA_CC:
        arch = config.FORCE_CUDA_CC
    else:
        arch = find_closest_arch((major, minor))
    return 'compute_%d%d' % arch
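
# Worked example (assuming a toolkit whose supported CCs include (8, 7) and
# (8, 9) but not (8, 8)): find_closest_arch((8, 8)) steps past (8, 7), stops
# at (8, 9), and returns the previous entry (8, 7), so get_arch_option(8, 8)
# yields 'compute_87'. If NUMBA_FORCE_CUDA_CC is set, that value takes
# precedence over the lookup.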


MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
Please ensure you have a CUDA Toolkit 11.2 or higher.
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:

    $ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"

For CUDA 11, ``cudatoolkit`` is required:

    $ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
'''


class LibDevice(object):
    _cache_ = None

    def __init__(self):
        if self._cache_ is None:
            if get_libdevice() is None:
                raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
            # Cache on the class so the bitcode is only read once per process
            type(self)._cache_ = open_libdevice()

        self.bc = self._cache_

    def get(self):
        return self.bc


cas_nvvm = """
    %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
    %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
"""  # noqa: E501


# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
    %iptr = bitcast {T}* %ptr to {Ti}*
    %old2 = load volatile {Ti}, {Ti}* %iptr
    br label %attempt

attempt:
    %old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
    %dold = bitcast {Ti} %old to {T}
    %dnew = {OP} {T} %dold, %val
    %new = bitcast {T} %dnew to {Ti}
    {CAS}
    %repeat = icmp ne {Ti} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    %result = bitcast {Ti} %old to {T}
    ret {T} %result
}}
"""  # noqa: E501

ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
    %old2 = load volatile {T}, {T}* %iptr
    br label %attempt

attempt:
    %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
    %bndchk = icmp ult {T} %old, %val
    %inc = add {T} %old, 1
    %new = select i1 %bndchk, {T} %inc, {T} 0
    {CAS}
    %repeat = icmp ne {T} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    ret {T} %old
}}
"""  # noqa: E501

ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
    %old2 = load volatile {T}, {T}* %iptr
    br label %attempt

attempt:
    %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
    %dec = add {T} %old, -1
    %bndchk = icmp ult {T} %dec, %val
    %new = select i1 %bndchk, {T} %dec, {T} %val
    {CAS}
    %repeat = icmp ne {T} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    ret {T} %old
}}
"""  # noqa: E501

ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
    %ptrval = load volatile {T}, {T}* %ptr
    ; Return early when:
    ;  - For nanmin / nanmax when val is a NaN
    ;  - For min / max when val or ptr is a NaN
    %early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
    br i1 %early_return, label %done, label %lt_check

lt_check:
    %dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
    ; Continue attempts if dold less or greater than val (depending on whether min or max)
    ; or if dold is NaN (for nanmin / nanmax)
    %cmp = fcmp {OP} {T} %dold, %val
    br i1 %cmp, label %attempt, label %done

attempt:
    ; Attempt to swap in the value
    %old = bitcast {T} %dold to {Ti}
    %iptr = bitcast {T}* %ptr to {Ti}*
    %new = bitcast {T} %val to {Ti}
    {CAS}
    %dcas = bitcast {Ti} %cas to {T}
    br label %lt_check

done:
    ret {T} %ptrval
}}
"""  # noqa: E501


def ir_cas(Ti):
    return cas_nvvm.format(Ti=Ti)


def ir_numba_atomic_binary(T, Ti, OP, FUNC):
    params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
    return ir_numba_atomic_binary_template.format(**params)


def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
    params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL,
                  FUNC=FUNC, CAS=ir_cas(Ti))

    return ir_numba_atomic_minmax_template.format(**params)


def ir_numba_atomic_inc(T, Tu):
    return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T))


def ir_numba_atomic_dec(T, Tu):
    return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T))


def llvm_replace(llvmir):
    replacements = [
        ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
        ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
        ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
        ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
         ir_numba_atomic_inc(T='i64', Tu='u64')),
        ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
         ir_numba_atomic_dec(T='i64', Tu='u64')),
        ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        ('immarg', '')
    ]

    for decl, fn in replacements:
        llvmir = llvmir.replace(decl, fn)

    llvmir = llvm140_to_70_ir(llvmir)

    return llvmir


def compile_ir(llvmir, **opts):
    if isinstance(llvmir, str):
        llvmir = [llvmir]

    if opts.pop('fastmath', False):
        opts.update({
            'ftz': True,
            'fma': True,
            'prec_div': False,
            'prec_sqrt': False,
        })

    cu = CompilationUnit()
    libdevice = LibDevice()

    for mod in llvmir:
        mod = llvm_replace(mod)
        cu.add_module(mod.encode('utf8'))
    cu.lazy_add_module(libdevice.get())

    return cu.compile(**opts)
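
# Usage sketch (illustrative): compiling a single NVVM IR module with
# fastmath expands into the individual NVVM flags before being stringified
# by CompilationUnit.compile, i.e.
#
#     compile_ir(llvm_module_str, opt=3, fastmath=True)
#
# results in the option strings '-opt=3', '-ftz=1', '-fma=1', '-prec-div=0'
# and '-prec-sqrt=0' being passed to nvvmCompileProgram.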


re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")


def llvm140_to_70_ir(ir):
    """
    Convert LLVM 14.0 IR for LLVM 7.0.
    """
    buf = []
    for line in ir.splitlines():
        if line.startswith('attributes #'):
            # Remove function attributes unsupported by LLVM 7.0
            m = re_attributes_def.match(line)
            attrs = m.group(1).split()
            attrs = ' '.join(a for a in attrs if a != 'willreturn')
            line = line.replace(m.group(1), attrs)

        buf.append(line)

    return '\n'.join(buf)


def set_cuda_kernel(function):
    """
    Mark a function as a CUDA kernel. Kernels have the following requirements:

    - Metadata that marks them as a kernel.
    - Addition to the @llvm.used list, so that they will not be discarded.
    - The noinline attribute is not permitted, because this causes NVVM to emit
      a warning, which counts as failing IR verification.

    Presently it is assumed that there is one kernel per module, which holds
    for Numba-jitted functions. If this changes in future or this function is
    to be used externally, this function may need modification to add to the
    @llvm.used list rather than creating it.
    """
    module = function.module

    # Add kernel metadata
    mdstr = ir.MetaDataString(module, "kernel")
    mdvalue = ir.Constant(ir.IntType(32), 1)
    md = module.add_metadata((function, mdstr, mdvalue))

    nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
    nmd.add(md)

    # Create the used list
    ptrty = ir.IntType(8).as_pointer()
    usedty = ir.ArrayType(ptrty, 1)

    fnptr = function.bitcast(ptrty)

    llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
    llvm_used.linkage = 'appending'
    llvm_used.section = 'llvm.metadata'
    llvm_used.initializer = ir.Constant(usedty, [fnptr])

    # Remove 'noinline' if it is present.
    function.attributes.discard('noinline')
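
# For reference (illustrative; the exact printing depends on the llvmlite
# version): after set_cuda_kernel runs, the module is expected to carry
# metadata of roughly this shape, which is what NVVM uses to recognise the
# kernel entry point:
#
#     @llvm.used = appending global [1 x i8*] [...], section "llvm.metadata"
#     !nvvm.annotations = !{!0}
#     !0 = !{void (...)* @kernel_name, !"kernel", i32 1}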


def add_ir_version(mod):
    """Add NVVM IR version to module"""
    # We specify the IR version to match the current NVVM's IR version
    i32 = ir.IntType(32)
    ir_versions = [i32(v) for v in NVVM().get_ir_version()]
    md_ver = mod.add_metadata(ir_versions)
    mod.add_named_metadata('nvvmir.version', md_ver)
@@ -0,0 +1,10 @@
"""
Declarations of the Runtime API functions.
"""

from ctypes import c_int, POINTER

API_PROTOTYPES = {
    # cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion )
    'cudaRuntimeGetVersion': (c_int, POINTER(c_int)),
}
@@ -0,0 +1,142 @@
"""
CUDA Runtime wrapper.

This provides a very minimal set of bindings, since the Runtime API is not
really used in Numba except for querying the Runtime version.
"""

import ctypes
import functools
import sys

from numba.core import config
from numba.cuda.cudadrv.driver import ERROR_MAP, make_logger
from numba.cuda.cudadrv.error import CudaSupportError, CudaRuntimeError
from numba.cuda.cudadrv.libs import open_cudalib
from numba.cuda.cudadrv.rtapi import API_PROTOTYPES
from numba.cuda.cudadrv import enums


class CudaRuntimeAPIError(CudaRuntimeError):
    """
    Raised when there is an error accessing a C API from the CUDA Runtime.
    """
    def __init__(self, code, msg):
        self.code = code
        self.msg = msg
        super().__init__(code, msg)

    def __str__(self):
        return "[%s] %s" % (self.code, self.msg)


class Runtime:
    """
    Runtime object that lazily binds runtime API functions.
    """

    def __init__(self):
        self.is_initialized = False

    def _initialize(self):
        # lazily initialize logger
        global _logger
        _logger = make_logger()

        if config.DISABLE_CUDA:
            msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
                   "in the environment, or because CUDA is unsupported on "
                   "32-bit systems.")
            raise CudaSupportError(msg)
        self.lib = open_cudalib('cudart')

        self.is_initialized = True

    def __getattr__(self, fname):
        # First request of a runtime API function
        try:
            proto = API_PROTOTYPES[fname]
        except KeyError:
            raise AttributeError(fname)
        restype = proto[0]
        argtypes = proto[1:]

        if not self.is_initialized:
            self._initialize()

        # Find function in runtime library
        libfn = self._find_api(fname)
        libfn.restype = restype
        libfn.argtypes = argtypes

        safe_call = self._wrap_api_call(fname, libfn)
        setattr(self, fname, safe_call)
        return safe_call

    def _wrap_api_call(self, fname, libfn):
        @functools.wraps(libfn)
        def safe_cuda_api_call(*args):
            _logger.debug('call runtime api: %s', libfn.__name__)
            retcode = libfn(*args)
            self._check_error(fname, retcode)
        return safe_cuda_api_call

    def _check_error(self, fname, retcode):
        if retcode != enums.CUDA_SUCCESS:
            errname = ERROR_MAP.get(retcode, "cudaErrorUnknown")
            msg = "Call to %s results in %s" % (fname, errname)
            _logger.error(msg)
            raise CudaRuntimeAPIError(retcode, msg)

    def _find_api(self, fname):
        try:
            return getattr(self.lib, fname)
        except AttributeError:
            pass

# Not found.
|
||||
# Delay missing function error to use
|
||||
def absent_function(*args, **kws):
|
||||
msg = "runtime missing function: %s."
|
||||
raise CudaRuntimeError(msg % fname)
|
||||
|
||||
setattr(self, fname, absent_function)
|
||||
return absent_function
|
||||
|
||||
def get_version(self):
|
||||
"""
|
||||
Returns the CUDA Runtime version as a tuple (major, minor).
|
||||
"""
|
||||
rtver = ctypes.c_int()
|
||||
self.cudaRuntimeGetVersion(ctypes.byref(rtver))
|
||||
# The version is encoded as (1000 * major) + (10 * minor)
|
||||
major = rtver.value // 1000
|
||||
minor = (rtver.value - (major * 1000)) // 10
|
||||
return (major, minor)
|
||||
|
||||
def is_supported_version(self):
|
||||
"""
|
||||
Returns True if the CUDA Runtime is a supported version.
|
||||
"""
|
||||
|
||||
return self.get_version() in self.supported_versions
|
||||
|
||||
@property
|
||||
def supported_versions(self):
|
||||
"""A tuple of all supported CUDA toolkit versions. Versions are given in
|
||||
the form ``(major_version, minor_version)``."""
|
||||
if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64:
|
||||
# Only 64-bit Linux and Windows are supported
|
||||
return ()
|
||||
return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
|
||||
(11, 7))
|
||||
|
||||
|
||||
runtime = Runtime()
|
||||
|
||||
|
||||
def get_version():
|
||||
"""
|
||||
Return the runtime version as a tuple of (major, minor)
|
||||
"""
|
||||
return runtime.get_version()
|
||||
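The encoding noted above can be checked by hand; for example, a raw value of 11020 from cudaRuntimeGetVersion decodes to CUDA 11.2 (a sketch of the arithmetic used by get_version):

    rtver = 11020                         # (1000 * major) + (10 * minor)
    major = rtver // 1000                 # 11
    minor = (rtver - major * 1000) // 10  # 2
    assert (major, minor) == (11, 2)
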
File diff suppressed because it is too large
@@ -0,0 +1,140 @@
import math
from numba.core import types
from numba.core.typing.templates import ConcreteTemplate, signature, Registry


registry = Registry()
infer_global = registry.register_global


@infer_global(math.acos)
@infer_global(math.acosh)
@infer_global(math.asin)
@infer_global(math.asinh)
@infer_global(math.atan)
@infer_global(math.atanh)
@infer_global(math.cosh)
@infer_global(math.degrees)
@infer_global(math.erf)
@infer_global(math.erfc)
@infer_global(math.expm1)
@infer_global(math.gamma)
@infer_global(math.lgamma)
@infer_global(math.log1p)
@infer_global(math.radians)
@infer_global(math.sinh)
@infer_global(math.tanh)
@infer_global(math.tan)
class Math_unary(ConcreteTemplate):
    cases = [
        signature(types.float64, types.int64),
        signature(types.float64, types.uint64),
        signature(types.float32, types.float32),
        signature(types.float64, types.float64),
    ]


@infer_global(math.sin)
@infer_global(math.cos)
@infer_global(math.ceil)
@infer_global(math.floor)
@infer_global(math.sqrt)
@infer_global(math.log)
@infer_global(math.log2)
@infer_global(math.log10)
@infer_global(math.exp)
@infer_global(math.fabs)
@infer_global(math.trunc)
class Math_unary_with_fp16(ConcreteTemplate):
    cases = [
        signature(types.float64, types.int64),
        signature(types.float64, types.uint64),
        signature(types.float32, types.float32),
        signature(types.float64, types.float64),
        signature(types.float16, types.float16),
    ]


@infer_global(math.atan2)
class Math_atan2(ConcreteTemplate):
    key = math.atan2
    cases = [
        signature(types.float64, types.int64, types.int64),
        signature(types.float64, types.uint64, types.uint64),
        signature(types.float32, types.float32, types.float32),
        signature(types.float64, types.float64, types.float64),
    ]


@infer_global(math.hypot)
class Math_hypot(ConcreteTemplate):
    key = math.hypot
    cases = [
        signature(types.float64, types.int64, types.int64),
        signature(types.float64, types.uint64, types.uint64),
        signature(types.float32, types.float32, types.float32),
        signature(types.float64, types.float64, types.float64),
    ]


@infer_global(math.copysign)
@infer_global(math.fmod)
class Math_binary(ConcreteTemplate):
    cases = [
        signature(types.float32, types.float32, types.float32),
        signature(types.float64, types.float64, types.float64),
    ]


@infer_global(math.remainder)
class Math_remainder(ConcreteTemplate):
    cases = [
        signature(types.float32, types.float32, types.float32),
        signature(types.float64, types.float64, types.float64),
    ]


@infer_global(math.pow)
class Math_pow(ConcreteTemplate):
    cases = [
        signature(types.float32, types.float32, types.float32),
        signature(types.float64, types.float64, types.float64),
        signature(types.float32, types.float32, types.int32),
        signature(types.float64, types.float64, types.int32),
    ]


@infer_global(math.frexp)
class Math_frexp(ConcreteTemplate):
    cases = [
        signature(types.Tuple([types.float32, types.int32]), types.float32),
        signature(types.Tuple([types.float64, types.int32]), types.float64),
    ]


@infer_global(math.ldexp)
class Math_ldexp(ConcreteTemplate):
    cases = [
        signature(types.float32, types.float32, types.int32),
        signature(types.float64, types.float64, types.int32),
    ]


@infer_global(math.isinf)
@infer_global(math.isnan)
@infer_global(math.isfinite)
class Math_isnan(ConcreteTemplate):
    cases = [
        signature(types.boolean, types.int64),
        signature(types.boolean, types.uint64),
        signature(types.boolean, types.float32),
        signature(types.boolean, types.float64),
    ]


@infer_global(math.modf)
class Math_modf(ConcreteTemplate):
    cases = [
        signature(types.UniTuple(types.float64, 2), types.float64),
        signature(types.UniTuple(types.float32, 2), types.float32)
    ]
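In practice these templates mean that a call such as math.sin(x) inside a kernel keeps float32 in single precision while promoting integer arguments to float64, per the cases above. A minimal sketch:

    import math

    from numba import cuda

    @cuda.jit
    def apply_sin(x, out):
        # float32 input gives a float32 result; int64 input would give float64
        i = cuda.grid(1)
        if i < x.size:
            out[i] = math.sin(x[i])
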
@@ -0,0 +1,191 @@
from warnings import warn
from numba.core import types, config, sigutils
from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning
from numba.cuda.compiler import declare_device_function
from numba.cuda.dispatcher import CUDADispatcher
from numba.cuda.simulator.kernel import FakeCUDAKernel


_msg_deprecated_signature_arg = ("Deprecated keyword argument `{0}`. "
                                 "Signatures should be passed as the first "
                                 "positional argument.")


def jit(func_or_sig=None, device=False, inline=False, link=None, debug=None,
        opt=True, lineinfo=False, cache=False, **kws):
    """
    JIT compile a Python function for CUDA GPUs.

    :param func_or_sig: A function to JIT compile, or *signatures* of a
       function to compile. If a function is supplied, then a
       :class:`Dispatcher <numba.cuda.dispatcher.CUDADispatcher>` is returned.
       Otherwise, ``func_or_sig`` may be a signature or a list of signatures,
       and a function is returned. The returned function accepts another
       function, which it will compile and then return a :class:`Dispatcher
       <numba.cuda.dispatcher.CUDADispatcher>`. See :ref:`jit-decorator` for
       more information about passing signatures.

       .. note:: A kernel cannot have any return value.
    :param device: Indicates whether this is a device function.
    :type device: bool
    :param link: A list of files containing PTX or CUDA C/C++ source to link
       with the function
    :type link: list
    :param debug: If True, check for exceptions thrown when executing the
       kernel. Since this degrades performance, this should only be used for
       debugging purposes. If set to True, then ``opt`` should be set to False.
       Defaults to False. (The default value can be overridden by setting
       environment variable ``NUMBA_CUDA_DEBUGINFO=1``.)
    :param fastmath: When True, enables fastmath optimizations as outlined in
       the :ref:`CUDA Fast Math documentation <cuda-fast-math>`.
    :param max_registers: Request that the kernel is limited to using at most
       this number of registers per thread. The limit may not be respected if
       the ABI requires a greater number of registers than that requested.
       Useful for increasing occupancy.
    :param opt: Whether to compile from LLVM IR to PTX with optimization
       enabled. When ``True``, ``-opt=3`` is passed to NVVM. When
       ``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``.
    :type opt: bool
    :param lineinfo: If True, generate a line mapping between source code and
       assembly code. This enables inspection of the source code in NVIDIA
       profiling tools and correlation with program counter sampling.
    :type lineinfo: bool
    :param cache: If True, enables the file-based cache for this function.
    :type cache: bool
    """

    if link is None:
        link = []
    if link and config.ENABLE_CUDASIM:
        raise NotImplementedError('Cannot link PTX in the simulator')

    if kws.get('boundscheck'):
        raise NotImplementedError("bounds checking is not supported for CUDA")

    if kws.get('argtypes') is not None:
        msg = _msg_deprecated_signature_arg.format('argtypes')
        raise DeprecationError(msg)
    if kws.get('restype') is not None:
        msg = _msg_deprecated_signature_arg.format('restype')
        raise DeprecationError(msg)
    if kws.get('bind') is not None:
        msg = _msg_deprecated_signature_arg.format('bind')
        raise DeprecationError(msg)

    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
    fastmath = kws.get('fastmath', False)
    extensions = kws.get('extensions', [])

    if debug and opt:
        msg = ("debug=True with opt=True (the default) "
               "is not supported by CUDA. This may result in a crash"
               " - set debug=False or opt=False.")
        warn(NumbaInvalidConfigWarning(msg))

    if debug and lineinfo:
        msg = ("debug and lineinfo are mutually exclusive. Use debug to get "
               "full debug info (this disables some optimizations), or "
               "lineinfo for line info only with code generation unaffected.")
        warn(NumbaInvalidConfigWarning(msg))

    # `link` is an explicit parameter, so it can never appear in `kws`; test
    # the parameter itself.
    if device and link:
        raise ValueError("link keyword invalid for device function")

    if sigutils.is_signature(func_or_sig):
        signatures = [func_or_sig]
        specialized = True
    elif isinstance(func_or_sig, list):
        signatures = func_or_sig
        specialized = False
    else:
        signatures = None

    if signatures is not None:
        if config.ENABLE_CUDASIM:
            def jitwrapper(func):
                return FakeCUDAKernel(func, device=device, fastmath=fastmath)
            return jitwrapper

        def _jit(func):
            targetoptions = kws.copy()
            targetoptions['debug'] = debug
            targetoptions['lineinfo'] = lineinfo
            targetoptions['link'] = link
            targetoptions['opt'] = opt
            targetoptions['fastmath'] = fastmath
            targetoptions['device'] = device
            targetoptions['extensions'] = extensions

            disp = CUDADispatcher(func, targetoptions=targetoptions)

            if cache:
                disp.enable_caching()

            for sig in signatures:
                argtypes, restype = sigutils.normalize_signature(sig)

                if restype and not device and restype != types.void:
                    raise TypeError("CUDA kernel must have void return type.")

                if device:
                    from numba.core import typeinfer
                    with typeinfer.register_dispatcher(disp):
                        disp.compile_device(argtypes, restype)
                else:
                    disp.compile(argtypes)

            disp._specialized = specialized
            disp.disable_compile()

            return disp

        return _jit
    else:
        if func_or_sig is None:
            if config.ENABLE_CUDASIM:
                def autojitwrapper(func):
                    return FakeCUDAKernel(func, device=device,
                                          fastmath=fastmath)
            else:
                def autojitwrapper(func):
                    return jit(func, device=device, debug=debug, opt=opt,
                               lineinfo=lineinfo, link=link, cache=cache,
                               **kws)

            return autojitwrapper
        # func_or_sig is a function
        else:
            if config.ENABLE_CUDASIM:
                return FakeCUDAKernel(func_or_sig, device=device,
                                      fastmath=fastmath)
            else:
                targetoptions = kws.copy()
                targetoptions['debug'] = debug
                targetoptions['lineinfo'] = lineinfo
                targetoptions['opt'] = opt
                targetoptions['link'] = link
                targetoptions['fastmath'] = fastmath
                targetoptions['device'] = device
                targetoptions['extensions'] = extensions
                disp = CUDADispatcher(func_or_sig,
                                      targetoptions=targetoptions)

                if cache:
                    disp.enable_caching()

                return disp


def declare_device(name, sig):
    """
    Declare the signature of a foreign function. Returns a descriptor that can
    be used to call the function from a Python kernel.

    :param name: The name of the foreign function.
    :type name: str
    :param sig: The Numba signature of the function.
    """
    argtypes, restype = sigutils.normalize_signature(sig)
    if restype is None:
        msg = 'Return type must be provided for device declarations'
        raise TypeError(msg)

    return declare_device_function(name, restype, argtypes)
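For reference, the decorator supports both eager (signature-first) and lazy compilation, and declare_device pairs with the ``link`` kwarg for foreign functions. A minimal sketch (the name ``c_mul`` and the PTX filename are hypothetical):

    from numba import cuda

    @cuda.jit('void(float32[:], float32)')  # eager: compiled for this signature
    def scale(x, a):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= a

    # A foreign device function declared by signature and linked from PTX:
    c_mul = cuda.declare_device('c_mul', 'float32(float32, float32)')

    @cuda.jit(link=['mul.ptx'])
    def use_c_mul(x, a):
        i = cuda.grid(1)
        if i < x.size:
            x[i] = c_mul(x[i], a)
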
@@ -0,0 +1,33 @@
from numba.core.descriptors import TargetDescriptor
from numba.core.options import TargetOptions
from .target import CUDATargetContext, CUDATypingContext


class CUDATargetOptions(TargetOptions):
    pass


class CUDATarget(TargetDescriptor):
    def __init__(self, name):
        self.options = CUDATargetOptions
        # The typing and target contexts are initialized only when needed -
        # this prevents an attempt to load CUDA libraries at import time on
        # systems that might not have them present.
        self._typingctx = None
        self._targetctx = None
        super().__init__(name)

    @property
    def typing_context(self):
        if self._typingctx is None:
            self._typingctx = CUDATypingContext()
        return self._typingctx

    @property
    def target_context(self):
        if self._targetctx is None:
            # Go through the property so the typing context is guaranteed to
            # be initialized first.
            self._targetctx = CUDATargetContext(self.typing_context)
        return self._targetctx


cuda_target = CUDATarget('cuda')
@@ -0,0 +1,89 @@
# Re-export
import sys
from numba.cuda import cg
from .stubs import (threadIdx, blockIdx, blockDim, gridDim, laneid, warpsize,
                    syncwarp, shared, local, const, atomic,
                    shfl_sync_intrinsic, vote_sync_intrinsic, match_any_sync,
                    match_all_sync, threadfence_block, threadfence_system,
                    threadfence, selp, popc, brev, clz, ffs, fma, cbrt,
                    activemask, lanemask_lt, nanosleep, fp16,
                    _vector_type_stubs)
from .intrinsics import (grid, gridsize, syncthreads, syncthreads_and,
                         syncthreads_count, syncthreads_or)
from .cudadrv.error import CudaSupportError
from numba.cuda.cudadrv.driver import (BaseCUDAMemoryManager,
                                       HostOnlyCUDAMemoryManager,
                                       GetIpcHandleMixin, MemoryPointer,
                                       MappedMemory, PinnedMemory, MemoryInfo,
                                       IpcHandle, set_memory_manager)
from numba.cuda.cudadrv.runtime import runtime
from .cudadrv import nvvm
from numba.cuda import initialize
from .errors import KernelRuntimeError

from .decorators import jit, declare_device
from .api import *
from .api import _auto_device
from .args import In, Out, InOut

from .intrinsic_wrapper import (all_sync, any_sync, eq_sync, ballot_sync,
                                shfl_sync, shfl_up_sync, shfl_down_sync,
                                shfl_xor_sync)

from .kernels import reduction

reduce = Reduce = reduction.Reduce

# Expose vector type constructors and aliases as module-level attributes.
for vector_type_stub in _vector_type_stubs:
    setattr(sys.modules[__name__], vector_type_stub.__name__, vector_type_stub)
    for alias in vector_type_stub.aliases:
        setattr(sys.modules[__name__], alias, vector_type_stub)
del vector_type_stub, _vector_type_stubs


def is_available():
    """Returns a boolean to indicate the availability of a CUDA GPU.

    This will initialize the driver if it hasn't been initialized.
    """
    # Whilst `driver.is_available` will initialize the driver itself, driver
    # initialization may raise, and as a result break test discovery and
    # orchestration, because `cuda.is_available` is often used as a guard for
    # whether to run a CUDA test. The try/except below handles this case.
    driver_is_available = False
    try:
        driver_is_available = driver.driver.is_available
    except CudaSupportError:
        pass

    return driver_is_available and nvvm.is_available()


def is_supported_version():
    """Returns True if the CUDA Runtime is a supported version.

    Unsupported versions (e.g. newer versions than those known to Numba)
    may still work; this function provides a facility to check whether the
    current Numba version is tested and known to work with the current
    runtime version. If the current version is unsupported, the caller can
    decide how to act. Options include:

    - Continuing silently,
    - Emitting a warning,
    - Generating an error or otherwise preventing the use of CUDA.
    """
    return runtime.is_supported_version()


def cuda_error():
    """Returns None if there was no error initializing the CUDA driver.
    If there was an error initializing the driver, a string describing the
    error is returned.
    """
    return driver.driver.initialization_error


initialize.initialize_all()
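Taken together, these functions support a guard pattern like the following; a minimal sketch:

    from numba import cuda

    if not cuda.is_available():
        print('CUDA unavailable:', cuda.cuda_error())
    elif not cuda.is_supported_version():
        print('Untested CUDA runtime version:', cuda.runtime.get_version())
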
@@ -0,0 +1,919 @@
"""
Implements custom ufunc dispatch mechanism for non-CPU devices.
"""

from abc import ABCMeta, abstractmethod
from collections import OrderedDict
import operator
import warnings
from functools import reduce

import numpy as np

from numba.np.ufunc.ufuncbuilder import _BaseUFuncBuilder, parse_identity
from numba.core import types, sigutils
from numba.core.typing import signature
from numba.np.ufunc.sigparse import parse_signature


def _broadcast_axis(a, b):
    """
    Raises
    ------
    ValueError if broadcast fails
    """
    if a == b:
        return a
    elif a == 1:
        return b
    elif b == 1:
        return a
    else:
        raise ValueError("failed to broadcast {0} and {1}".format(a, b))


def _pairwise_broadcast(shape1, shape2):
    """
    Raises
    ------
    ValueError if broadcast fails
    """
    shape1, shape2 = map(tuple, [shape1, shape2])

    while len(shape1) < len(shape2):
        shape1 = (1,) + shape1

    while len(shape1) > len(shape2):
        shape2 = (1,) + shape2

    return tuple(_broadcast_axis(a, b) for a, b in zip(shape1, shape2))


def _multi_broadcast(*shapelist):
    """
    Raises
    ------
    ValueError if broadcast fails
    """
    assert shapelist

    result = shapelist[0]
    others = shapelist[1:]
    try:
        for i, each in enumerate(others, start=1):
            result = _pairwise_broadcast(result, each)
    except ValueError:
        raise ValueError("failed to broadcast argument #{0}".format(i))
    else:
        return result
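# A quick illustration of the broadcasting rules implemented above
# (numpy-style, with 1-sized and missing axes stretched):
#
#   _multi_broadcast((4, 1), (3,), (4, 3))   # -> (4, 3)
#   _multi_broadcast((2,), (3,))             # ValueError: failed to
#                                            # broadcast argument #1
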
class UFuncMechanism(object):
    """
    Prepare ufunc arguments for vectorize.
    """
    DEFAULT_STREAM = None
    SUPPORT_DEVICE_SLICING = False

    def __init__(self, typemap, args):
        """Never used directly by the user. Invoked by UFuncMechanism.call().
        """
        self.typemap = typemap
        self.args = args
        nargs = len(self.args)
        self.argtypes = [None] * nargs
        self.scalarpos = []
        self.signature = None
        self.arrays = [None] * nargs

    def _fill_arrays(self):
        """
        Get all arguments in array form
        """
        for i, arg in enumerate(self.args):
            if self.is_device_array(arg):
                self.arrays[i] = self.as_device_array(arg)
            elif isinstance(arg, (int, float, complex, np.number)):
                # Is a scalar
                self.scalarpos.append(i)
            else:
                self.arrays[i] = np.asarray(arg)

    def _fill_argtypes(self):
        """
        Get dtypes
        """
        for i, ary in enumerate(self.arrays):
            if ary is not None:
                # Use a default so a missing attribute does not raise
                dtype = getattr(ary, 'dtype', None)
                if dtype is None:
                    dtype = np.asarray(ary).dtype
                self.argtypes[i] = dtype

    def _resolve_signature(self):
        """Resolve signature.
        May have ambiguous cases.
        """
        matches = []
        # Resolve scalar args by exact match first
        if self.scalarpos:
            # Try to resolve scalar arguments
            for formaltys in self.typemap:
                match_map = []
                for i, (formal, actual) in enumerate(zip(formaltys,
                                                         self.argtypes)):
                    if actual is None:
                        actual = np.asarray(self.args[i]).dtype

                    match_map.append(actual == formal)

                if all(match_map):
                    matches.append(formaltys)

        # No exact match; try coercing the scalar arguments
        if not matches:
            matches = []
            for formaltys in self.typemap:
                all_matches = all(actual is None or formal == actual
                                  for formal, actual in
                                  zip(formaltys, self.argtypes))
                if all_matches:
                    matches.append(formaltys)

        if not matches:
            raise TypeError("No matching version. GPU ufunc requires array "
                            "arguments to have the exact types. This behaves "
                            "like regular ufunc with casting='no'.")

        if len(matches) > 1:
            raise TypeError("Failed to resolve ufunc due to ambiguous "
                            "signature. Too many untyped scalars. "
                            "Use numpy dtype object to type tag.")

        # Try scalar arguments
        self.argtypes = matches[0]

    def _get_actual_args(self):
        """Return the actual arguments
        Casts scalar arguments to np.array.
        """
        for i in self.scalarpos:
            self.arrays[i] = np.array([self.args[i]], dtype=self.argtypes[i])

        return self.arrays

    def _broadcast(self, arys):
        """Perform numpy ufunc broadcasting
        """
        shapelist = [a.shape for a in arys]
        shape = _multi_broadcast(*shapelist)

        for i, ary in enumerate(arys):
            if ary.shape == shape:
                pass
            else:
                if self.is_device_array(ary):
                    arys[i] = self.broadcast_device(ary, shape)
                else:
                    ax_differs = [ax for ax in range(len(shape))
                                  if ax >= ary.ndim
                                  or ary.shape[ax] != shape[ax]]

                    missingdim = len(shape) - len(ary.shape)
                    strides = [0] * missingdim + list(ary.strides)

                    for ax in ax_differs:
                        strides[ax] = 0

                    strided = np.lib.stride_tricks.as_strided(ary,
                                                              shape=shape,
                                                              strides=strides)

                    arys[i] = self.force_array_layout(strided)

        return arys

    def get_arguments(self):
        """Prepare and return the arguments for the ufunc.
        Does not call to_device().
        """
        self._fill_arrays()
        self._fill_argtypes()
        self._resolve_signature()
        arys = self._get_actual_args()
        return self._broadcast(arys)

    def get_function(self):
        """Returns (result_dtype, function)
        """
        return self.typemap[self.argtypes]

    def is_device_array(self, obj):
        """Is `obj` a device array?
        Override in subclass
        """
        return False

    def as_device_array(self, obj):
        """Convert `obj` to a device array
        Override in subclass

        Default implementation is an identity function
        """
        return obj

    def broadcast_device(self, ary, shape):
        """Handles on-device broadcasting

        Override in subclass to add support.
        """
        raise NotImplementedError("broadcasting on device is not supported")

    def force_array_layout(self, ary):
        """Ensures the array layout meets the device requirement.

        Override in subclass
        """
        return ary

    @classmethod
    def call(cls, typemap, args, kws):
        """Perform the entire ufunc call mechanism.
        """
        # Handle keywords
        stream = kws.pop('stream', cls.DEFAULT_STREAM)
        out = kws.pop('out', None)

        if kws:
            warnings.warn("unrecognized keywords: %s" % ', '.join(kws))

        # Begin call resolution
        cr = cls(typemap, args)
        args = cr.get_arguments()
        resty, func = cr.get_function()

        outshape = args[0].shape

        # Adjust output value
        if out is not None and cr.is_device_array(out):
            out = cr.as_device_array(out)

        def attempt_ravel(a):
            if cr.SUPPORT_DEVICE_SLICING:
                raise NotImplementedError

            try:
                # Call the `.ravel()` method
                return a.ravel()
            except NotImplementedError:
                # If it is not a device array
                if not cr.is_device_array(a):
                    raise
                # For a device array, retry ravel on the host by first
                # copying it back.
                else:
                    hostary = cr.to_host(a, stream).ravel()
                    return cr.to_device(hostary, stream)

        if args[0].ndim > 1:
            args = [attempt_ravel(a) for a in args]

        # Prepare arguments on the device
        devarys = []
        any_device = False
        for a in args:
            if cr.is_device_array(a):
                devarys.append(a)
                any_device = True
            else:
                dev_a = cr.to_device(a, stream=stream)
                devarys.append(dev_a)

        # Launch
        shape = args[0].shape
        if out is None:
            # No output is provided
            devout = cr.allocate_device_array(shape, resty, stream=stream)

            devarys.extend([devout])
            cr.launch(func, shape[0], stream, devarys)

            if any_device:
                # If any of the arguments are on device,
                # keep the output on the device
                return devout.reshape(outshape)
            else:
                # Otherwise, transfer the output back to host
                return devout.copy_to_host().reshape(outshape)

        elif cr.is_device_array(out):
            # If output is provided and it is a device array,
            # return a device array
            if out.ndim > 1:
                out = attempt_ravel(out)
            devout = out
            devarys.extend([devout])
            cr.launch(func, shape[0], stream, devarys)
            return devout.reshape(outshape)

        else:
            # If output is provided and it is a host array,
            # return a host array
            assert out.shape == shape
            assert out.dtype == resty
            devout = cr.allocate_device_array(shape, resty, stream=stream)
            devarys.extend([devout])
            cr.launch(func, shape[0], stream, devarys)
            return devout.copy_to_host(out, stream=stream).reshape(outshape)

    def to_device(self, hostary, stream):
        """Implement to-device transfer
        Override in subclass
        """
        raise NotImplementedError

    def to_host(self, devary, stream):
        """Implement to-host transfer
        Override in subclass
        """
        raise NotImplementedError

    def allocate_device_array(self, shape, dtype, stream):
        """Implements device allocation
        Override in subclass
        """
        raise NotImplementedError

    def launch(self, func, count, stream, args):
        """Implements device function invocation
        Override in subclass
        """
        raise NotImplementedError


def to_dtype(ty):
    if isinstance(ty, types.EnumMember):
        ty = ty.dtype
    return np.dtype(str(ty))


class DeviceVectorize(_BaseUFuncBuilder):
    def __init__(self, func, identity=None, cache=False, targetoptions=None):
        if targetoptions is None:
            targetoptions = {}
        if cache:
            raise TypeError("caching is not supported")
        for opt in targetoptions:
            if opt == 'nopython':
                warnings.warn("nopython kwarg for cuda target is redundant",
                              RuntimeWarning)
            else:
                fmt = "Unrecognized options. "
                fmt += "cuda vectorize target does not support option: '%s'"
                raise KeyError(fmt % opt)
        self.py_func = func
        self.identity = parse_identity(identity)
        # { arg_dtype: (return_dtype, cudakernel) }
        self.kernelmap = OrderedDict()

    @property
    def pyfunc(self):
        return self.py_func

    def add(self, sig=None):
        # Compile the core as a device function
        args, return_type = sigutils.normalize_signature(sig)
        devfnsig = signature(return_type, *args)

        funcname = self.pyfunc.__name__
        kernelsource = self._get_kernel_source(self._kernel_template,
                                               devfnsig, funcname)
        corefn, return_type = self._compile_core(devfnsig)
        glbl = self._get_globals(corefn)
        sig = signature(types.void, *([a[:] for a in args] + [return_type[:]]))
        exec(kernelsource, glbl)

        stager = glbl['__vectorized_%s' % funcname]
        kernel = self._compile_kernel(stager, sig)

        argdtypes = tuple(to_dtype(t) for t in devfnsig.args)
        resdtype = to_dtype(return_type)
        self.kernelmap[tuple(argdtypes)] = resdtype, kernel

    def build_ufunc(self):
        raise NotImplementedError

    def _get_kernel_source(self, template, sig, funcname):
        args = ['a%d' % i for i in range(len(sig.args))]
        fmts = dict(name=funcname,
                    args=', '.join(args),
                    argitems=', '.join('%s[__tid__]' % i for i in args))
        return template.format(**fmts)

    def _compile_core(self, sig):
        raise NotImplementedError

    def _get_globals(self, corefn):
        raise NotImplementedError

    def _compile_kernel(self, fnobj, sig):
        raise NotImplementedError


class DeviceGUFuncVectorize(_BaseUFuncBuilder):
    def __init__(
        self,
        func,
        sig,
        identity=None,
        cache=False,
        targetoptions=None,
        writable_args=(),
    ):
        if targetoptions is None:
            targetoptions = {}
        if cache:
            raise TypeError("caching is not supported")
        if writable_args:
            raise TypeError("writable_args are not supported")

        # Allow the nopython flag to be set.
        if not targetoptions.pop('nopython', True):
            raise TypeError("nopython flag must be True")
        # Are there any more target options?
        if targetoptions:
            opts = ', '.join([repr(k) for k in targetoptions.keys()])
            fmt = "The following target options are not supported: {0}"
            raise TypeError(fmt.format(opts))

        self.py_func = func
        self.identity = parse_identity(identity)
        self.signature = sig
        self.inputsig, self.outputsig = parse_signature(self.signature)

        # Maps from a tuple of input_dtypes to (output_dtypes, kernel)
        self.kernelmap = OrderedDict()

    @property
    def pyfunc(self):
        return self.py_func

    def add(self, sig=None):
        indims = [len(x) for x in self.inputsig]
        outdims = [len(x) for x in self.outputsig]
        args, return_type = sigutils.normalize_signature(sig)

        # It is only valid to specify types.none as a return type, or to not
        # specify the return type (where the "Python None" is the return type)
        valid_return_type = return_type in (types.none, None)
        if not valid_return_type:
            raise TypeError('guvectorized functions cannot return values: '
                            f'signature {sig} specifies {return_type} return '
                            'type')

        funcname = self.py_func.__name__
        src = expand_gufunc_template(self._kernel_template, indims,
                                     outdims, funcname, args)

        glbls = self._get_globals(sig)

        exec(src, glbls)
        fnobj = glbls['__gufunc_{name}'.format(name=funcname)]

        outertys = list(_determine_gufunc_outer_types(args, indims + outdims))
        kernel = self._compile_kernel(fnobj, sig=tuple(outertys))

        nout = len(outdims)
        dtypes = [np.dtype(str(t.dtype)) for t in outertys]
        indtypes = tuple(dtypes[:-nout])
        outdtypes = tuple(dtypes[-nout:])

        self.kernelmap[indtypes] = outdtypes, kernel

    def _compile_kernel(self, fnobj, sig):
        raise NotImplementedError

    def _get_globals(self, sig):
        raise NotImplementedError


def _determine_gufunc_outer_types(argtys, dims):
    for at, nd in zip(argtys, dims):
        if isinstance(at, types.Array):
            yield at.copy(ndim=nd + 1)
        else:
            if nd > 0:
                raise ValueError("gufunc signature mismatch: ndim>0 for scalar")
            yield types.Array(dtype=at, ndim=1, layout='A')


def expand_gufunc_template(template, indims, outdims, funcname, argtypes):
    """Expand gufunc source template
    """
    argdims = indims + outdims
    argnames = ["arg{0}".format(i) for i in range(len(argdims))]
    checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a)
                                              for a in argnames]))
    inputs = [_gen_src_for_indexing(aref, adims, atype)
              for aref, adims, atype in zip(argnames, indims, argtypes)]
    outputs = [_gen_src_for_indexing(aref, adims, atype)
               for aref, adims, atype in zip(argnames[len(indims):], outdims,
                                             argtypes[len(indims):])]
    argitems = inputs + outputs
    src = template.format(name=funcname, args=', '.join(argnames),
                          checkedarg=checkedarg,
                          argitems=', '.join(argitems))
    return src


def _gen_src_for_indexing(aref, adims, atype):
    return "{aref}[{sliced}]".format(aref=aref,
                                     sliced=_gen_src_index(adims, atype))


def _gen_src_index(adims, atype):
    if adims > 0:
        return ','.join(['__tid__'] + [':'] * adims)
    elif isinstance(atype, types.Array) and atype.ndim - 1 == adims:
        # Special case for 0-d in the shape-signature but a 1d array in the
        # type signature. Slice it so that the result has the same dimension.
        return '__tid__:(__tid__ + 1)'
    else:
        return '__tid__'
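# For example, with the gufunc signature '(m,n),(n)->(m)' the index
# expressions generated by _gen_src_for_indexing for the kernel template are:
#
#   arg0[__tid__,:,:]    # input with core dims (m, n)
#   arg1[__tid__,:]      # input with core dims (n,)
#   arg2[__tid__,:]      # output with core dims (m,)
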
class GUFuncEngine(object):
    '''Determine how to broadcast and execute a gufunc
    based on the input shapes and the signature
    '''

    @classmethod
    def from_signature(cls, signature):
        return cls(*parse_signature(signature))

    def __init__(self, inputsig, outputsig):
        # signatures
        self.sin = inputsig
        self.sout = outputsig
        # argument count
        self.nin = len(self.sin)
        self.nout = len(self.sout)

    def schedule(self, ishapes):
        if len(ishapes) != self.nin:
            raise TypeError('invalid number of input arguments')

        # Associate symbol values for the input signature
        symbolmap = {}
        outer_shapes = []
        inner_shapes = []

        for argn, (shape, symbols) in enumerate(zip(ishapes, self.sin)):
            argn += 1  # start from 1 for human readability
            inner_ndim = len(symbols)
            if len(shape) < inner_ndim:
                fmt = "arg #%d: insufficient inner dimension"
                raise ValueError(fmt % (argn,))
            if inner_ndim:
                inner_shape = shape[-inner_ndim:]
                outer_shape = shape[:-inner_ndim]
            else:
                inner_shape = ()
                outer_shape = shape

            for axis, (dim, sym) in enumerate(zip(inner_shape, symbols)):
                axis += len(outer_shape)
                if sym in symbolmap:
                    if symbolmap[sym] != dim:
                        fmt = "arg #%d: shape[%d] mismatches argument"
                        raise ValueError(fmt % (argn, axis))
                symbolmap[sym] = dim

            outer_shapes.append(outer_shape)
            inner_shapes.append(inner_shape)

        # Solve the output shapes
        oshapes = []
        for outsig in self.sout:
            oshape = []
            for sym in outsig:
                oshape.append(symbolmap[sym])
            oshapes.append(tuple(oshape))

        # Find the biggest outer shape as the looping dimension
        sizes = [reduce(operator.mul, s, 1) for s in outer_shapes]
        largest_i = np.argmax(sizes)
        loopdims = outer_shapes[largest_i]

        pinned = [False] * self.nin  # same argument for each iteration
        for i, d in enumerate(outer_shapes):
            if d != loopdims:
                if d == (1,) or d == ():
                    pinned[i] = True
                else:
                    fmt = "arg #%d: outer dimension mismatch"
                    raise ValueError(fmt % (i + 1,))

        return GUFuncSchedule(self, inner_shapes, oshapes, loopdims, pinned)


class GUFuncSchedule(object):
    def __init__(self, parent, ishapes, oshapes, loopdims, pinned):
        self.parent = parent
        # core shapes
        self.ishapes = ishapes
        self.oshapes = oshapes
        # looping dimension
        self.loopdims = loopdims
        self.loopn = reduce(operator.mul, loopdims, 1)
        # flags
        self.pinned = pinned

        self.output_shapes = [loopdims + s for s in oshapes]

    def __str__(self):
        import pprint

        attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned'
        values = [(k, getattr(self, k)) for k in attrs]
        return pprint.pformat(dict(values))
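# Sketch of a schedule for the signature '(m,n),(n)->(m)' with inputs of
# shape (10, 3, 4) and (4,): the second argument has no outer dims, so it is
# pinned and reused on every iteration.
#
#   engine = GUFuncEngine.from_signature('(m,n),(n)->(m)')
#   sched = engine.schedule([(10, 3, 4), (4,)])
#   sched.loopdims       # (10,)
#   sched.loopn          # 10
#   sched.pinned         # [False, True]
#   sched.output_shapes  # [(10, 3)]
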
class GeneralizedUFunc(object):
    def __init__(self, kernelmap, engine):
        self.kernelmap = kernelmap
        self.engine = engine
        self.max_blocksize = 2 ** 30

    def __call__(self, *args, **kws):
        callsteps = self._call_steps(self.engine.nin, self.engine.nout,
                                     args, kws)
        indtypes, schedule, outdtypes, kernel = self._schedule(
            callsteps.inputs, callsteps.outputs)
        callsteps.adjust_input_types(indtypes)

        outputs = callsteps.prepare_outputs(schedule, outdtypes)
        inputs = callsteps.prepare_inputs()
        parameters = self._broadcast(schedule, inputs, outputs)

        callsteps.launch_kernel(kernel, schedule.loopn, parameters)

        return callsteps.post_process_outputs(outputs)

    def _schedule(self, inputs, outs):
        input_shapes = [a.shape for a in inputs]
        schedule = self.engine.schedule(input_shapes)

        # Find the kernel
        indtypes = tuple(i.dtype for i in inputs)
        try:
            outdtypes, kernel = self.kernelmap[indtypes]
        except KeyError:
            # No exact match; use the first compatible one.
            # This does not match the numpy dispatching exactly.
            # Later, we may just jit a new version for the missing signature.
            indtypes = self._search_matching_signature(indtypes)
            # Select the kernel
            outdtypes, kernel = self.kernelmap[indtypes]

        # Check the outputs
        for sched_shape, out in zip(schedule.output_shapes, outs):
            if out is not None and sched_shape != out.shape:
                raise ValueError('output shape mismatch')

        return indtypes, schedule, outdtypes, kernel

    def _search_matching_signature(self, idtypes):
        """
        Given the input types in `idtypes`, return a compatible sequence of
        types that is defined in `kernelmap`.

        Note: Ordering is guaranteed by `kernelmap` being an OrderedDict
        """
        for sig in self.kernelmap.keys():
            if all(np.can_cast(actual, desired)
                   for actual, desired in zip(sig, idtypes)):
                return sig
        else:
            raise TypeError("no matching signature")

    def _broadcast(self, schedule, params, retvals):
        assert schedule.loopn > 0, "zero looping dimension"

        odim = 1 if not schedule.loopdims else schedule.loopn
        newparams = []
        for p, cs in zip(params, schedule.ishapes):
            if not cs and p.size == 1:
                # Broadcast scalar input
                devary = self._broadcast_scalar_input(p, odim)
                newparams.append(devary)
            else:
                # Broadcast vector input
                newparams.append(self._broadcast_array(p, odim, cs))

        newretvals = []
        for retval, oshape in zip(retvals, schedule.oshapes):
            newretvals.append(retval.reshape(odim, *oshape))
        return tuple(newparams) + tuple(newretvals)

    def _broadcast_array(self, ary, newdim, innerdim):
        newshape = (newdim,) + innerdim
        # No change in shape
        if ary.shape == newshape:
            return ary

        # Creating a new dimension
        elif len(ary.shape) < len(newshape):
            assert newshape[-len(ary.shape):] == ary.shape, \
                "cannot add dim and reshape at the same time"
            return self._broadcast_add_axis(ary, newshape)

        # Collapsing dimensions
        else:
            return ary.reshape(*newshape)

    def _broadcast_add_axis(self, ary, newshape):
        raise NotImplementedError("cannot add new axis")

    def _broadcast_scalar_input(self, ary, shape):
        raise NotImplementedError


class GUFuncCallSteps(metaclass=ABCMeta):
    """
    Implements memory management and kernel launch operations for GUFunc calls.

    One instance of this class is instantiated for each call, and the instance
    is specific to the arguments given to the GUFunc call.

    The base class implements the overall logic; subclasses provide
    target-specific implementations of individual functions.
    """

    # The base class uses these slots; subclasses may provide additional slots.
    __slots__ = [
        'outputs',
        'inputs',
        '_copy_result_to_host',
    ]

    @abstractmethod
    def launch_kernel(self, kernel, nelem, args):
        """Implement the kernel launch"""

    @abstractmethod
    def is_device_array(self, obj):
        """
        Return True if `obj` is a device array for this target, False
        otherwise.
        """

    @abstractmethod
    def as_device_array(self, obj):
        """
        Return `obj` as a device array on this target.

        May return `obj` directly if it is already on the target.
        """

    @abstractmethod
    def to_device(self, hostary):
        """
        Copy `hostary` to the device and return the device array.
        """

    @abstractmethod
    def allocate_device_array(self, shape, dtype):
        """
        Allocate a new uninitialized device array with the given shape and
        dtype.
        """

    def __init__(self, nin, nout, args, kwargs):
        outputs = kwargs.get('out')

        # Ensure the user has passed a correct number of arguments
        if outputs is None and len(args) not in (nin, (nin + nout)):
            def pos_argn(n):
                return f'{n} positional argument{"s" * (n != 1)}'

            msg = (f'This gufunc accepts {pos_argn(nin)} (when providing '
                   f'input only) or {pos_argn(nin + nout)} (when providing '
                   f'input and output). Got {pos_argn(len(args))}.')
            raise TypeError(msg)

        if outputs is not None and len(args) > nin:
            raise ValueError("cannot specify argument 'out' as both positional "
                             "and keyword")
        else:
            # If the user did not pass outputs either in the out kwarg or as
            # positional arguments, then we need to generate an initial list of
            # "placeholder" outputs using None as a sentinel value
            outputs = [outputs] * nout

        # Ensure all output device arrays are Numba device arrays - for
        # example, any output passed in that supports the CUDA Array Interface
        # is converted to a Numba CUDA device array; others are left untouched.
        all_user_outputs_are_host = True
        self.outputs = []
        for output in outputs:
            if self.is_device_array(output):
                self.outputs.append(self.as_device_array(output))
                all_user_outputs_are_host = False
            else:
                self.outputs.append(output)

        all_host_arrays = not any([self.is_device_array(a) for a in args])

        # If any of the arguments are device arrays, we leave the output on
        # the device.
        self._copy_result_to_host = (all_host_arrays and
                                     all_user_outputs_are_host)

        # Normalize arguments - ensure they are either device- or host-side
        # arrays (as opposed to lists, tuples, etc).
        def normalize_arg(a):
            if self.is_device_array(a):
                convert = self.as_device_array
            else:
                convert = np.asarray

            return convert(a)

        normalized_args = [normalize_arg(a) for a in args]
        self.inputs = normalized_args[:nin]

        # Check if there are extra arguments for outputs.
        unused_inputs = normalized_args[nin:]
        if unused_inputs:
            self.outputs = unused_inputs

    def adjust_input_types(self, indtypes):
        """
        Attempt to cast the inputs to the required types if necessary
        and if they are not device arrays.

        Side effect: Only affects the elements of `inputs` that require
        a type cast.
        """
        for i, (ity, val) in enumerate(zip(indtypes, self.inputs)):
            if ity != val.dtype:
                if not hasattr(val, 'astype'):
                    msg = ("compatible signature is possible by casting but "
                           "{0} does not support .astype()").format(type(val))
                    raise TypeError(msg)
                # Cast types
                self.inputs[i] = val.astype(ity)

    def prepare_outputs(self, schedule, outdtypes):
        """
        Returns a list of output parameters that all reside on the target
        device.

        Outputs that were passed in to the GUFunc are used if they reside on
        the device; other outputs are allocated as necessary.
        """
        outputs = []
        for shape, dtype, output in zip(schedule.output_shapes, outdtypes,
                                        self.outputs):
            if output is None or self._copy_result_to_host:
                output = self.allocate_device_array(shape, dtype)
            outputs.append(output)

        return outputs

    def prepare_inputs(self):
        """
        Returns a list of input parameters that all reside on the target
        device.
        """
        def ensure_device(parameter):
            if self.is_device_array(parameter):
                convert = self.as_device_array
            else:
                convert = self.to_device

            return convert(parameter)

        return [ensure_device(p) for p in self.inputs]

    def post_process_outputs(self, outputs):
        """
        Moves the given output(s) to the host if necessary.

        Returns a single value (e.g. an array) if there was one output, or a
        tuple of arrays if there were multiple. Although this feels a little
        jarring, it is consistent with the behavior of GUFuncs in general.
        """
        if self._copy_result_to_host:
            outputs = [self.to_host(output, self_output)
                       for output, self_output in zip(outputs, self.outputs)]
        elif self.outputs[0] is not None:
            outputs = self.outputs

        if len(outputs) == 1:
            return outputs[0]
        else:
            return tuple(outputs)
File diff suppressed because it is too large
@@ -0,0 +1,59 @@
import numbers
from numba.core.errors import LoweringError


class KernelRuntimeError(RuntimeError):
    def __init__(self, msg, tid=None, ctaid=None):
        self.tid = tid
        self.ctaid = ctaid
        self.msg = msg
        t = ("An exception was raised in thread=%s block=%s\n"
             "\t%s")
        msg = t % (self.tid, self.ctaid, self.msg)
        super(KernelRuntimeError, self).__init__(msg)


class CudaLoweringError(LoweringError):
    pass


_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/"
                    "kernels.html#kernel-invocation")
missing_launch_config_msg = """
Kernel launch configuration was not specified. Use the syntax:

kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn)

See {} for help.

""".format(_launch_help_url)


def normalize_kernel_dimensions(griddim, blockdim):
    """
    Normalize and validate the user-supplied kernel dimensions.
    """

    def check_dim(dim, name):
        if not isinstance(dim, (tuple, list)):
            dim = [dim]
        else:
            dim = list(dim)
        if len(dim) > 3:
            raise ValueError('%s must be a sequence of 1, 2 or 3 integers, '
                             'got %r' % (name, dim))
        for v in dim:
            if not isinstance(v, numbers.Integral):
                raise TypeError('%s must be a sequence of integers, got %r'
                                % (name, dim))
        while len(dim) < 3:
            dim.append(1)
        return tuple(dim)

    if None in (griddim, blockdim):
        raise ValueError(missing_launch_config_msg)

    griddim = check_dim(griddim, 'griddim')
    blockdim = check_dim(blockdim, 'blockdim')

    return griddim, blockdim
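As a concrete illustration of the normalization above, scalars and short sequences are padded with 1s up to three dimensions:

    normalize_kernel_dimensions(4, 64)        # -> ((4, 1, 1), (64, 1, 1))
    normalize_kernel_dimensions((2, 3), 128)  # -> ((2, 3, 1), (128, 1, 1))
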
@@ -0,0 +1,7 @@
"""
Added for symmetry with the core API
"""

from numba.core.extending import intrinsic as _intrinsic

intrinsic = _intrinsic(target='cuda')
@@ -0,0 +1,13 @@
def initialize_all():
    # Import models to register them with the data model manager
    import numba.cuda.models  # noqa: F401

    from numba.cuda.decorators import jit
    from numba.cuda.dispatcher import CUDADispatcher
    from numba.core.target_extension import (target_registry,
                                             dispatcher_registry,
                                             jit_registry)

    cuda_target = target_registry["cuda"]
    jit_registry[cuda_target] = jit
    dispatcher_registry[cuda_target] = CUDADispatcher
@@ -0,0 +1,77 @@
from .decorators import jit
import numba


@jit(device=True)
def all_sync(mask, predicate):
    """
    If for all threads in the masked warp the predicate is true, then
    a non-zero value is returned, otherwise 0 is returned.
    """
    return numba.cuda.vote_sync_intrinsic(mask, 0, predicate)[1]


@jit(device=True)
def any_sync(mask, predicate):
    """
    If for any thread in the masked warp the predicate is true, then
    a non-zero value is returned, otherwise 0 is returned.
    """
    return numba.cuda.vote_sync_intrinsic(mask, 1, predicate)[1]


@jit(device=True)
def eq_sync(mask, predicate):
    """
    If for all threads in the masked warp the boolean predicate is the same,
    then a non-zero value is returned, otherwise 0 is returned.
    """
    return numba.cuda.vote_sync_intrinsic(mask, 2, predicate)[1]


@jit(device=True)
def ballot_sync(mask, predicate):
    """
    Returns a mask of all threads in the warp whose predicate is true,
    and are within the given mask.
    """
    return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]


@jit(device=True)
def shfl_sync(mask, value, src_lane):
    """
    Shuffles value across the masked warp and returns the value
    from src_lane. If this is outside the warp, then the
    given value is returned.
    """
    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]


@jit(device=True)
def shfl_up_sync(mask, value, delta):
    """
    Shuffles value across the masked warp and returns the value
    from (laneid - delta). If this is outside the warp, then the
    given value is returned.
    """
    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]


@jit(device=True)
def shfl_down_sync(mask, value, delta):
    """
    Shuffles value across the masked warp and returns the value
    from (laneid + delta). If this is outside the warp, then the
    given value is returned.
    """
    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]


@jit(device=True)
def shfl_xor_sync(mask, value, lane_mask):
    """
    Shuffles value across the masked warp and returns the value
    from (laneid ^ lane_mask).
    """
    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
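A typical use of the shuffle wrappers above is a warp-level reduction; a minimal sketch (assuming a full warp of 32 active threads and a warp size of 32):

    from numba import cuda

    @cuda.jit(device=True)
    def warp_sum(value):
        # Butterfly (XOR) reduction: after the five steps below, every lane
        # in the warp holds the total.
        offset = 16
        while offset > 0:
            value += cuda.shfl_xor_sync(0xffffffff, value, offset)
            offset //= 2
        return value
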
@@ -0,0 +1,198 @@
from llvmlite import ir

from numba import cuda, types
from numba.core import cgutils
from numba.core.errors import RequireLiteralValue, NumbaValueError
from numba.core.typing import signature
from numba.core.extending import overload_attribute
from numba.cuda import nvvmutils
from numba.cuda.extending import intrinsic


#-------------------------------------------------------------------------------
# Grid functions

def _type_grid_function(ndim):
    val = ndim.literal_value
    if val == 1:
        restype = types.int64
    elif val in (2, 3):
        restype = types.UniTuple(types.int64, val)
    else:
        raise NumbaValueError('argument can only be 1, 2 or 3')

    return signature(restype, types.int32)


@intrinsic
def grid(typingctx, ndim):
    '''grid(ndim)

    Return the absolute position of the current thread in the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    '''

    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    sig = _type_grid_function(ndim)

    def codegen(context, builder, sig, args):
        restype = sig.return_type
        if restype == types.int64:
            return nvvmutils.get_global_id(builder, dim=1)
        elif isinstance(restype, types.UniTuple):
            ids = nvvmutils.get_global_id(builder, dim=restype.count)
            return cgutils.pack_array(builder, ids)

    return sig, codegen

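As a usage sketch (illustrative, not part of the diff): ``grid(1)`` gives each thread its flat global index, which is normally bounds-checked against the array size because the grid may be larger than the data.

from numba import cuda
import numpy as np


@cuda.jit
def increment(x):
    i = cuda.grid(1)           # threadIdx.x + blockIdx.x * blockDim.x
    if i < x.size:             # guard: 1024 threads cover 1000 elements
        x[i] += 1


x = np.zeros(1000, dtype=np.float64)
increment[4, 256](x)
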
@intrinsic
def gridsize(typingctx, ndim):
    '''gridsize(ndim)

    Return the absolute size (or shape) in threads of the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.blockDim.x * cuda.gridDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    '''

    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    sig = _type_grid_function(ndim)

    def _nthreads_for_dim(builder, dim):
        i64 = ir.IntType(64)
        ntid = nvvmutils.call_sreg(builder, f"ntid.{dim}")
        nctaid = nvvmutils.call_sreg(builder, f"nctaid.{dim}")
        return builder.mul(builder.sext(ntid, i64), builder.sext(nctaid, i64))

    def codegen(context, builder, sig, args):
        restype = sig.return_type
        nx = _nthreads_for_dim(builder, 'x')

        if restype == types.int64:
            return nx
        elif isinstance(restype, types.UniTuple):
            ny = _nthreads_for_dim(builder, 'y')

            if restype.count == 2:
                return cgutils.pack_array(builder, (nx, ny))
            elif restype.count == 3:
                nz = _nthreads_for_dim(builder, 'z')
                return cgutils.pack_array(builder, (nx, ny, nz))

    return sig, codegen

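``gridsize`` is most often paired with ``grid`` in a grid-stride loop, letting a fixed-size grid process arrays of any length; a sketch with illustrative names:

from numba import cuda
import numpy as np


@cuda.jit
def scale(x, factor):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)      # blockDim.x * gridDim.x
    for i in range(start, x.size, stride):
        x[i] *= factor


x = np.ones(100000)
scale[8, 128](x, 2.0)              # 1024 threads stride over 100000 elements
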
@intrinsic
def _warpsize(typingctx):
    sig = signature(types.int32)

    def codegen(context, builder, sig, args):
        return nvvmutils.call_sreg(builder, 'warpsize')

    return sig, codegen


@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
def cuda_warpsize(mod):
    '''
    The size of a warp. All architectures implemented to date have a warp size
    of 32.
    '''
    def get(mod):
        return _warpsize()
    return get

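The attribute reads as ``cuda.warpsize`` inside kernels; a brief sketch (hypothetical kernel name):

from numba import cuda


@cuda.jit
def lane_ids(out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = i % cuda.warpsize   # lane index of each thread in its warp
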
#-------------------------------------------------------------------------------
# syncthreads

@intrinsic
def syncthreads(typingctx):
    '''
    Synchronize all threads in the same thread block. This function implements
    the same pattern as barriers in traditional multi-threaded programming: it
    waits until all threads in the block call it, at which point it returns
    control to all its callers.
    '''
    sig = signature(types.none)

    def codegen(context, builder, sig, args):
        fname = 'llvm.nvvm.barrier0'
        lmod = builder.module
        fnty = ir.FunctionType(ir.VoidType(), ())
        sync = cgutils.get_or_insert_function(lmod, fnty, fname)
        builder.call(sync, ())
        return context.get_dummy_value()

    return sig, codegen

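A barrier sketch (illustrative): reversing a block-sized array through shared memory, where the ``syncthreads`` call separates the write phase from the read phase:

from numba import cuda, float64
import numpy as np


@cuda.jit
def block_reverse(x):
    sm = cuda.shared.array(256, dtype=float64)
    i = cuda.threadIdx.x
    sm[i] = x[i]
    cuda.syncthreads()         # all writes to sm land before any thread reads
    x[i] = sm[255 - i]


x = np.arange(256, dtype=np.float64)
block_reverse[1, 256](x)       # exactly one 256-thread block
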
def _syncthreads_predicate(typingctx, predicate, fname):
    if not isinstance(predicate, types.Integer):
        return None

    sig = signature(types.i4, types.i4)

    def codegen(context, builder, sig, args):
        fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32),))
        sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
        return builder.call(sync, args)

    return sig, codegen


@intrinsic
def syncthreads_count(typingctx, predicate):
    '''
    syncthreads_count(predicate)

    An extension to numba.cuda.syncthreads where the return value is a count
    of the threads where predicate is true.
    '''
    fname = 'llvm.nvvm.barrier0.popc'
    return _syncthreads_predicate(typingctx, predicate, fname)


@intrinsic
def syncthreads_and(typingctx, predicate):
    '''
    syncthreads_and(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for all threads or 0 otherwise.
    '''
    fname = 'llvm.nvvm.barrier0.and'
    return _syncthreads_predicate(typingctx, predicate, fname)


@intrinsic
def syncthreads_or(typingctx, predicate):
    '''
    syncthreads_or(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for any thread or 0 otherwise.
    '''
    fname = 'llvm.nvvm.barrier0.or'
    return _syncthreads_predicate(typingctx, predicate, fname)
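A sketch of the predicated barriers (illustrative): counting, per block, how many elements are positive. Every thread receives the same count from ``syncthreads_count``, so any single thread may store it.

from numba import cuda
import numpy as np


@cuda.jit
def count_positive(x, out):
    i = cuda.grid(1)
    pred = 1 if i < x.size and x[i] > 0 else 0
    n = cuda.syncthreads_count(pred)   # barrier + popcount over the block
    if cuda.threadIdx.x == 0:
        out[cuda.blockIdx.x] = n
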
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,262 @@
"""
A library written in CUDA Python for generating reduction kernels
"""

from numba.np.numpy_support import from_dtype


_WARPSIZE = 32
_NUMWARPS = 4


def _gpu_reduce_factory(fn, nbtype):
    from numba import cuda

    reduce_op = cuda.jit(device=True)(fn)
    inner_sm_size = _WARPSIZE + 1   # plus one column to avoid shared memory
                                    # bank conflicts
    max_blocksize = _NUMWARPS * _WARPSIZE

    @cuda.jit(device=True)
    def inner_warp_reduction(sm_partials, init):
        """
        Compute a reduction within a single warp.
        """
        tid = cuda.threadIdx.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        sm_this = sm_partials[warpid, :]
        sm_this[laneid] = init
        cuda.syncwarp()

        width = _WARPSIZE // 2
        while width:
            if laneid < width:
                old = sm_this[laneid]
                sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
            cuda.syncwarp()
            width //= 2

    @cuda.jit(device=True)
    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space. The algorithm goes like:

            array chunks of 128:  | 0 | 128 | 256 | 384 | 512 |
                         block-0: | x |     |     |  x  |     |
                         block-1: |   |  x  |     |     |  x  |
                         block-2: |   |     |  x  |     |     |

        The array is divided into chunks of 128 (the size of a threadblock).
        The threadblocks consume the chunks in round-robin scheduling.
        First, a threadblock loads a chunk into temp memory, then all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed, an inner-block reduction is performed
        on the temp memory, so that there is just one scalar result per
        block. The result from each block is stored to `partials` at the
        dedicated slot.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block-strided loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load the first value
        tmp = arr[start]
        # loop over all values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # At this point, only the first slot for each warp in sm_partials
        # is valid.

        # finish up the block reduction
        # warning: this assumes 4 warps
        # assert numwarps == 4
        if tid < 2:
            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                            sm_partials[tid + 2, 0])
        cuda.syncwarp()
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])

    @cuda.jit(device=True)
    def device_reduce_partial_block(arr, partials, sm_partials):
        """
        This computes a reduction on `arr`.
        This device function must be used by one threadblock only.
        The blocksize must match `arr.size` and must not be greater than 128.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        size = arr.size
        # load the first value
        value = arr[tid]
        sm_partials[warpid, laneid] = value

        cuda.syncthreads()

        if (warpid + 1) * _WARPSIZE < size:
            # fully populated warps
            inner_warp_reduction(sm_partials, value)
        else:
            # partially populated warps
            # NOTE: this uses a very inefficient sequential algorithm
            if laneid == 0:
                sm_this = sm_partials[warpid, :]
                base = warpid * _WARPSIZE
                for i in range(1, size - base):
                    sm_this[0] = reduce_op(sm_this[0], sm_this[i])

        cuda.syncthreads()
        # finish up
        if tid == 0:
            num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE

            result = sm_partials[0, 0]
            for i in range(1, num_active_warps):
                result = reduce_op(result, sm_partials[i, 0])

            partials[blkid] = result

    def gpu_reduce_block_strided(arr, partials, init, use_init):
        """
        Perform reductions on *arr*, writing out a partial reduction result
        into *partials*. The length of *partials* is determined by the number
        of threadblocks. The initial value is set with *init*.

        Launch config:

        The block size must be a multiple of the warp size, and is limited
        to 4 warps.
        """
        tid = cuda.threadIdx.x

        sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
                                        dtype=nbtype)
        if cuda.blockDim.x == max_blocksize:
            device_reduce_full_block(arr, partials, sm_partials)
        else:
            device_reduce_partial_block(arr, partials, sm_partials)
        # deal with the initializer
        if use_init and tid == 0 and cuda.blockIdx.x == 0:
            partials[0] = reduce_op(partials[0], init)

    return cuda.jit(gpu_reduce_block_strided)


class Reduce(object):
    """Create a reduction object that reduces values using a given binary
    function. The binary function is compiled once and cached inside this
    object. Keeping this object alive will prevent re-compilation.
    """

    _cache = {}

    def __init__(self, functor):
        """
        :param functor: A function implementing a binary operation for
                        reduction. It will be compiled as a CUDA device
                        function using ``cuda.jit(device=True)``.
        """
        self._functor = functor

    def _compile(self, dtype):
        key = self._functor, dtype
        if key in self._cache:
            kernel = self._cache[key]
        else:
            kernel = _gpu_reduce_factory(self._functor, from_dtype(dtype))
            self._cache[key] = kernel
        return kernel

    def __call__(self, arr, size=None, res=None, init=0, stream=0):
        """Performs a full reduction.

        :param arr: A host or device array.
        :param size: Optional integer specifying the number of elements in
                     ``arr`` to reduce. If this parameter is not specified,
                     the entire array is reduced.
        :param res: Optional device array into which to write the reduction
                    result. The result is written into the first element of
                    this array. If this parameter is specified, then no
                    communication of the reduction output takes place from
                    the device to the host.
        :param init: Optional initial value for the reduction, the type of
                     which must match ``arr.dtype``.
        :param stream: Optional CUDA stream in which to perform the
                       reduction. If no stream is specified, the default
                       stream of 0 is used.
        :return: If ``res`` is specified, ``None`` is returned. Otherwise,
                 the result of the reduction is returned.
        """
        from numba import cuda

        # ensure a 1D array
        if arr.ndim != 1:
            raise TypeError("only 1D arrays are supported")

        # adjust the array size
        if size is not None:
            arr = arr[:size]

        init = arr.dtype.type(init)  # ensure the right type

        # return `init` if `arr` is empty
        if arr.size < 1:
            return init

        kernel = self._compile(arr.dtype)

        # perform the reduction on the GPU
        blocksize = _NUMWARPS * _WARPSIZE
        size_full = (arr.size // blocksize) * blocksize
        size_partial = arr.size - size_full
        full_blockct = min(size_full // blocksize, _WARPSIZE * 2)

        # allocate the partials array
        partials_size = full_blockct
        if size_partial:
            partials_size += 1
        partials = cuda.device_array(shape=partials_size, dtype=arr.dtype)

        if size_full:
            # kernel for the fully populated threadblocks
            kernel[full_blockct, blocksize, stream](arr[:size_full],
                                                    partials[:full_blockct],
                                                    init,
                                                    True)

        if size_partial:
            # kernel for the partially populated threadblocks
            kernel[1, size_partial, stream](arr[size_full:],
                                            partials[full_blockct:],
                                            init,
                                            not full_blockct)

        if partials.size > 1:
            # finish up
            kernel[1, partials_size, stream](partials, partials, init, False)

        # handle the return value
        if res is not None:
            res[:1].copy_to_device(partials[:1], stream=stream)
            return
        else:
            return partials[0]
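A usage sketch for ``Reduce`` (illustrative values; the class is exposed as ``cuda.Reduce``):

import numpy as np
from numba import cuda

sum_reduce = cuda.Reduce(lambda a, b: a + b)
A = np.arange(1, 1235, dtype=np.float64)
got = sum_reduce(A)                # compiles once for float64, then reduces
assert np.isclose(got, A.sum())
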
@@ -0,0 +1,65 @@
from numba import cuda
from numba.cuda.cudadrv.driver import driver
import math
from numba.np import numpy_support as nps


def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    """

    # prefer `a`'s stream if it has one
    stream = getattr(a, 'stream', 0)

    if b is None:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)

    dt = nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor the available threads into x and y axes
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        x = by + tx
        y = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads, stream](a, b)

    return b
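A usage sketch (illustrative; assumes the helper is importable as in upstream Numba, i.e. from ``numba.cuda.kernels.transpose``):

import numpy as np
from numba import cuda
from numba.cuda.kernels.transpose import transpose

a = cuda.to_device(np.arange(12, dtype=np.float64).reshape(3, 4))
b = transpose(a)                   # allocates a (4, 3) result on the device
assert (b.copy_to_host() == a.copy_to_host().T).all()
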
File diff suppressed because it is too large
@@ -0,0 +1,17 @@
from numba.cuda import libdevice, libdevicefuncs
from numba.core.typing.templates import ConcreteTemplate, Registry

registry = Registry()
register_global = registry.register_global


def libdevice_declare(func, retty, args):
    class Libdevice_function(ConcreteTemplate):
        cases = [libdevicefuncs.create_signature(retty, args)]

    # Strip the '__nv_' prefix to find the Python wrapper for the function
    pyfunc = getattr(libdevice, func[5:])
    register_global(pyfunc)(Libdevice_function)


for func, (retty, args) in libdevicefuncs.functions.items():
    libdevice_declare(func, retty, args)
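These declarations make the libdevice wrappers callable from kernels; a sketch (illustrative kernel, relying on ``__nv_fast_sinf`` being registered as ``libdevice.fast_sinf``):

from numba import cuda
from numba.cuda import libdevice
import numpy as np


@cuda.jit
def apply_fast_sin(x, out):
    i = cuda.grid(1)
    if i < x.size:
        out[i] = libdevice.fast_sinf(x[i])   # maps to __nv_fast_sinf


x = np.linspace(0, 1, 64, dtype=np.float32)
out = np.empty_like(x)
apply_fast_sin[1, 64](x, out)
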
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff