2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,22 @@
from numba import runtests
from numba.core import config
if config.ENABLE_CUDASIM:
from .simulator_init import *
else:
from .device_init import *
from .device_init import _auto_device
from numba.cuda.compiler import (compile, compile_for_current_device,
compile_ptx, compile_ptx_for_current_device)
# Are we the numba.cuda built in to upstream Numba, or the out-of-tree
# NVIDIA-maintained target?
implementation = "Built-in"
def test(*args, **kwargs):
if not is_available():
raise cuda_error()
return runtests.main("numba.cuda.tests", *args, **kwargs)
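# Illustrative usage (not part of this module; requires a CUDA-capable
# machine):
#
# >>> from numba import cuda
# >>> cuda.test()    # extra args/kwargs are forwarded to runtests.main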

View File

@@ -0,0 +1,525 @@
"""
APIs that are exported to numba.cuda
"""
import contextlib
import os
import numpy as np
from .cudadrv import devicearray, devices, driver
from numba.core import config
from numba.cuda.api_util import prepare_shape_strides_dtype
# NDarray device helper
require_context = devices.require_context
current_context = devices.get_context
gpus = devices.gpus
@require_context
def from_cuda_array_interface(desc, owner=None, sync=True):
"""Create a DeviceNDArray from a cuda-array-interface description.
The ``owner`` is the owner of the underlying memory.
The resulting DeviceNDArray will acquire a reference from it.
If ``sync`` is ``True``, then the imported stream (if present) will be
synchronized.
"""
version = desc.get('version')
# Mask introduced in version 1
if 1 <= version:
mask = desc.get('mask')
# Would ideally be better to detect if the mask is all valid
if mask is not None:
raise NotImplementedError('Masked arrays are not supported')
shape = desc['shape']
strides = desc.get('strides')
dtype = np.dtype(desc['typestr'])
shape, strides, dtype = prepare_shape_strides_dtype(
shape, strides, dtype, order='C')
size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
data = driver.MemoryPointer(
current_context(), devptr, size=size, owner=owner)
stream_ptr = desc.get('stream', None)
if stream_ptr is not None:
stream = external_stream(stream_ptr)
if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
stream.synchronize()
else:
        stream = 0  # Use Numba's default stream (not the CUDA default stream)
da = devicearray.DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=data,
stream=stream)
return da
def as_cuda_array(obj, sync=True):
"""Create a DeviceNDArray from any object that implements
the :ref:`cuda array interface <cuda-array-interface>`.
A view of the underlying GPU buffer is created. No copying of the data
is done. The resulting DeviceNDArray will acquire a reference from `obj`.
If ``sync`` is ``True``, then the imported stream (if present) will be
synchronized.
"""
if not is_cuda_array(obj):
raise TypeError("*obj* doesn't implement the cuda array interface.")
else:
return from_cuda_array_interface(obj.__cuda_array_interface__,
owner=obj, sync=sync)
def is_cuda_array(obj):
"""Test if the object has defined the `__cuda_array_interface__` attribute.
Does not verify the validity of the interface.
"""
return hasattr(obj, '__cuda_array_interface__')
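# Illustrative sketch (not part of this module): any object exposing
# ``__cuda_array_interface__`` can be viewed without a copy. A DeviceNDArray
# itself exposes the interface, so it can serve as the exporting object here.
#
# >>> import numpy as np
# >>> from numba import cuda
# >>> d_ary = cuda.to_device(np.arange(4))
# >>> cuda.is_cuda_array(d_ary)
# True
# >>> view = cuda.as_cuda_array(d_ary)  # shares d_ary's buffer, no copy
# >>> view.copy_to_host()
# array([0, 1, 2, 3])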
def is_float16_supported():
"""Whether 16-bit floats are supported.
float16 is always supported in current versions of Numba - returns True.
"""
return True
@require_context
def to_device(obj, stream=0, copy=True, to=None):
"""to_device(obj, stream=0, copy=True, to=None)
    Allocate and transfer a numpy ndarray or structured scalar to the device.
    To copy a numpy array host->device::
ary = np.arange(10)
d_ary = cuda.to_device(ary)
To enqueue the transfer to a stream::
stream = cuda.stream()
d_ary = cuda.to_device(ary, stream=stream)
The resulting ``d_ary`` is a ``DeviceNDArray``.
To copy device->host::
hary = d_ary.copy_to_host()
To copy device->host to an existing array::
ary = np.empty(shape=d_ary.shape, dtype=d_ary.dtype)
d_ary.copy_to_host(ary)
To enqueue the transfer to a stream::
hary = d_ary.copy_to_host(stream=stream)
"""
if to is None:
to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
user_explicit=True)
return to
if copy:
to.copy_to_device(obj, stream=stream)
return to
@require_context
def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
"""device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)
Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
stream=stream)
@require_context
def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
attach_global=True):
"""managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
attach_global=True)
Allocate a np.ndarray with a buffer that is managed.
Similar to np.empty().
Managed memory is supported on Linux / x86 and PowerPC, and is considered
experimental on Windows and Linux / AArch64.
:param attach_global: A flag indicating whether to attach globally. Global
attachment implies that the memory is accessible from
any stream on any device. If ``False``, attachment is
*host*, and memory is only accessible by devices
with Compute Capability 6.0 and later.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
buffer = current_context().memallocmanaged(bytesize,
attach_global=attach_global)
npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buffer)
managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
managedview.device_setup(buffer, stream=stream)
return managedview
@require_context
def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
"""pinned_array(shape, dtype=np.float64, strides=None, order='C')
Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
(pagelocked). Similar to :func:`np.empty() <numpy.empty>`.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = driver.memory_size_from_info(shape, strides,
dtype.itemsize)
buffer = current_context().memhostalloc(bytesize)
return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buffer)
@require_context
def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
portable=False, wc=False):
"""mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
portable=False, wc=False)
Allocate a mapped ndarray with a buffer that is pinned and mapped on
to the device. Similar to np.empty()
    :param portable: a boolean flag to allow the allocated device memory to be
                     usable from multiple devices.
    :param wc: a boolean flag to enable write-combined allocation, which is
               faster for the host to write and the device to read, but slower
               for the host to read and the device to write.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
buffer = current_context().memhostalloc(bytesize, mapped=True)
npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buffer)
mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
mappedview.device_setup(buffer, stream=stream)
return mappedview
@contextlib.contextmanager
@require_context
def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
"""
    A context manager that opens an IPC *handle* (*CUipcMemHandle*) that is
    represented as a sequence of bytes (e.g. *bytes*, tuple of int)
    and represents it as an array of the given *shape*, *strides* and *dtype*.
The *strides* can be omitted. In that case, it is assumed to be a 1D
C contiguous array.
Yields a device array.
The IPC handle is closed automatically when context manager exits.
"""
dtype = np.dtype(dtype)
# compute size
size = np.prod(shape) * dtype.itemsize
# manually recreate the IPC mem handle
if driver.USE_NV_BINDING:
driver_handle = driver.binding.CUipcMemHandle()
driver_handle.reserved = handle
else:
driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle)
# use *IpcHandle* to open the IPC memory
ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
    try:
        yield ipchandle.open_array(current_context(), shape=shape,
                                   strides=strides, dtype=dtype)
    finally:
        # Ensure the handle is closed even if the body raises an exception.
        ipchandle.close()
def synchronize():
"Synchronize the current context."
return current_context().synchronize()
def _contiguous_strides_like_array(ary):
"""
Given an array, compute strides for a new contiguous array of the same
shape.
"""
# Don't recompute strides if the default strides will be sufficient to
# create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
return None
# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.
# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = [ x for x in enumerate(ary.strides) ]
strideperm.sort(key=lambda x: x[1])
# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
return tuple(strides)
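# Worked example (illustrative, not part of this module): a sliced and
# transposed array is neither C nor F contiguous, so packed strides are
# computed that preserve its axis ordering in memory.
#
# >>> import numpy as np
# >>> a = np.zeros((4, 6))[::2, ::2].T  # shape (3, 2), strides (16, 96)
# >>> _contiguous_strides_like_array(a)
# (8, 24)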
def _order_like_array(ary):
if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
return 'F'
else:
return 'C'
def device_array_like(ary, stream=0):
"""
Call :func:`device_array() <numba.cuda.device_array>` with information from
the array.
"""
strides = _contiguous_strides_like_array(ary)
order = _order_like_array(ary)
return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
order=order, stream=stream)
def mapped_array_like(ary, stream=0, portable=False, wc=False):
"""
Call :func:`mapped_array() <numba.cuda.mapped_array>` with the information
from the array.
"""
strides = _contiguous_strides_like_array(ary)
order = _order_like_array(ary)
return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
order=order, stream=stream, portable=portable, wc=wc)
def pinned_array_like(ary):
"""
Call :func:`pinned_array() <numba.cuda.pinned_array>` with the information
from the array.
"""
strides = _contiguous_strides_like_array(ary)
order = _order_like_array(ary)
return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
order=order)
# Stream helper
@require_context
def stream():
"""
Create a CUDA stream that represents a command queue for the device.
"""
return current_context().create_stream()
@require_context
def default_stream():
"""
Get the default CUDA stream. CUDA semantics in general are that the default
stream is either the legacy default stream or the per-thread default stream
depending on which CUDA APIs are in use. In Numba, the APIs for the legacy
default stream are always the ones in use, but an option to use APIs for
the per-thread default stream may be provided in future.
"""
return current_context().get_default_stream()
@require_context
def legacy_default_stream():
"""
Get the legacy default CUDA stream.
"""
return current_context().get_legacy_default_stream()
@require_context
def per_thread_default_stream():
"""
Get the per-thread default CUDA stream.
"""
return current_context().get_per_thread_default_stream()
@require_context
def external_stream(ptr):
"""Create a Numba stream object for a stream allocated outside Numba.
:param ptr: Pointer to the external stream to wrap in a Numba Stream
:type ptr: int
"""
return current_context().create_external_stream(ptr)
# Page lock
@require_context
@contextlib.contextmanager
def pinned(*arylist):
"""A context manager for temporary pinning a sequence of host ndarrays.
"""
pmlist = []
for ary in arylist:
pm = current_context().mempin(ary, driver.host_pointer(ary),
driver.host_memory_size(ary),
mapped=False)
pmlist.append(pm)
yield
@require_context
@contextlib.contextmanager
def mapped(*arylist, **kws):
"""A context manager for temporarily mapping a sequence of host ndarrays.
"""
assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
stream = kws.get('stream', 0)
pmlist = []
devarylist = []
for ary in arylist:
pm = current_context().mempin(ary, driver.host_pointer(ary),
driver.host_memory_size(ary),
mapped=True)
pmlist.append(pm)
devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
devarylist.append(devary)
try:
if len(devarylist) == 1:
yield devarylist[0]
else:
yield devarylist
finally:
# When exiting from `with cuda.mapped(*arrs) as mapped_arrs:`, the name
# `mapped_arrs` stays in scope, blocking automatic unmapping based on
# reference count. We therefore invoke the finalizer manually.
for pm in pmlist:
pm.free()
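# Illustrative sketch (hypothetical kernel ``increment``): map a host array
# into the device address space for the duration of the block.
#
# >>> import numpy as np
# >>> ary = np.zeros(16)
# >>> with mapped(ary) as d_ary:
# ...     increment[1, 16](d_ary)  # kernel defined elsewhere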
def event(timing=True):
"""
Create a CUDA event. Timing data is only recorded by the event if it is
created with ``timing=True``.
"""
evt = current_context().create_event(timing=timing)
return evt
event_elapsed_time = driver.event_elapsed_time
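# Illustrative timing sketch (hypothetical kernel ``fn`` and launch
# configuration): record events around a launch and measure the elapsed time
# in milliseconds.
#
# >>> start, end = event(), event()
# >>> start.record()
# >>> fn[blocks, threads](args)
# >>> end.record()
# >>> end.synchronize()
# >>> event_elapsed_time(start, end)  # float, in milliseconds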
# Device selection
def select_device(device_id):
"""
Make the context associated with device *device_id* the current context.
Returns a Device instance.
Raises exception on error.
"""
context = devices.get_context(device_id)
return context.device
def get_current_device():
"Get current device associated with the current thread"
return current_context().device
def list_devices():
"Return a list of all detected devices"
return devices.gpus
def close():
"""
Explicitly clears all contexts in the current thread, and destroys all
contexts if the current thread is the main thread.
"""
devices.reset()
def _auto_device(ary, stream=0, copy=True):
return devicearray.auto_device(ary, stream=stream, copy=copy)
def detect():
"""
Detect supported CUDA hardware and print a summary of the detected hardware.
Returns a boolean indicating whether any supported devices were detected.
"""
devlist = list_devices()
print('Found %d CUDA devices' % len(devlist))
supported_count = 0
for dev in devlist:
attrs = []
cc = dev.compute_capability
kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
tcc = dev.TCC_DRIVER
fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
attrs += [('Compute Capability', '%d.%d' % cc)]
attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
attrs += [('UUID', dev.uuid)]
attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
if os.name == "nt":
attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
if cc < (3, 5):
support = '[NOT SUPPORTED: CC < 3.5]'
elif cc < (5, 0):
support = '[SUPPORTED (DEPRECATED)]'
supported_count += 1
else:
support = '[SUPPORTED]'
supported_count += 1
print('id %d %20s %40s' % (dev.id, dev.name, support))
for key, val in attrs:
print('%40s: %s' % (key, val))
print('Summary:')
print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
return supported_count > 0
@contextlib.contextmanager
def defer_cleanup():
"""
Temporarily disable memory deallocation.
Use this to prevent resource deallocation breaking asynchronous execution.
For example::
with defer_cleanup():
# all cleanup is deferred in here
do_speed_critical_code()
# cleanup can occur here
Note: this context manager can be nested.
"""
with current_context().defer_cleanup():
yield
profiling = require_context(driver.profiling)
profile_start = require_context(driver.profile_start)
profile_stop = require_context(driver.profile_stop)

View File

@@ -0,0 +1,30 @@
import numpy as np
def prepare_shape_strides_dtype(shape, strides, dtype, order):
dtype = np.dtype(dtype)
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
else:
strides = strides or _fill_stride_by_order(shape, dtype, order)
return shape, strides, dtype
def _fill_stride_by_order(shape, dtype, order):
nd = len(shape)
if nd == 0:
return ()
strides = [0] * nd
if order == 'C':
strides[-1] = dtype.itemsize
for d in reversed(range(nd - 1)):
strides[d] = strides[d + 1] * shape[d + 1]
elif order == 'F':
strides[0] = dtype.itemsize
for d in range(1, nd):
strides[d] = strides[d - 1] * shape[d - 1]
else:
        raise ValueError("order must be one of 'C' or 'F'")
return tuple(strides)
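# Illustrative examples (not part of this module): strides for a 2x3 float64
# array.
#
# >>> _fill_stride_by_order((2, 3), np.dtype(np.float64), 'C')
# (24, 8)
# >>> _fill_stride_by_order((2, 3), np.dtype(np.float64), 'F')
# (8, 16)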

View File

@@ -0,0 +1,77 @@
"""
Hints to wrap Kernel arguments to indicate how to manage host-device
memory transfers before & after the kernel call.
"""
import abc
from numba.core.typing.typeof import typeof, Purpose
class ArgHint(metaclass=abc.ABCMeta):
def __init__(self, value):
self.value = value
@abc.abstractmethod
def to_device(self, retr, stream=0):
"""
:param stream: a stream to use when copying data
:param retr:
a list of clean-up work to do after the kernel's been run.
Append 0-arg lambdas to it!
        :return: a value (usually a `DeviceNDArray`) to be passed to
            the kernel
"""
pass
@property
def _numba_type_(self):
return typeof(self.value, Purpose.argument)
class In(ArgHint):
def to_device(self, retr, stream=0):
from .cudadrv.devicearray import auto_device
devary, _ = auto_device(
self.value,
stream=stream)
# A dummy writeback functor to keep devary alive until the kernel
# is called.
retr.append(lambda: devary)
return devary
class Out(ArgHint):
def to_device(self, retr, stream=0):
from .cudadrv.devicearray import auto_device
devary, conv = auto_device(
self.value,
copy=False,
stream=stream)
if conv:
retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
return devary
class InOut(ArgHint):
def to_device(self, retr, stream=0):
from .cudadrv.devicearray import auto_device
devary, conv = auto_device(
self.value,
stream=stream)
if conv:
retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
return devary
def wrap_arg(value, default=InOut):
return value if isinstance(value, ArgHint) else default(value)
__all__ = [
'In',
'Out',
'InOut',
'ArgHint',
'wrap_arg',
]
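# Illustrative sketch (hypothetical kernel ``axpy`` and arrays ``x``, ``y``,
# ``result``): hints control which transfers happen around a launch. ``In``
# copies host->device only; ``Out`` allocates on the device and copies back
# to the host after the kernel completes.
#
# >>> from numba import cuda
# >>> axpy[blocks, threads](cuda.In(x), cuda.In(y), cuda.Out(result))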

View File

@@ -0,0 +1,62 @@
from numba.core import types
from numba.core.extending import overload, overload_method
from numba.core.typing import signature
from numba.cuda import nvvmutils
from numba.cuda.extending import intrinsic
from numba.cuda.types import grid_group, GridGroup as GridGroupClass
class GridGroup:
"""A cooperative group representing the entire grid"""
    def sync(self) -> None:
        """Synchronize this grid group"""
def this_grid() -> GridGroup:
"""Get the current grid group."""
return GridGroup()
@intrinsic
def _this_grid(typingctx):
sig = signature(grid_group)
def codegen(context, builder, sig, args):
one = context.get_constant(types.int32, 1)
mod = builder.module
return builder.call(
nvvmutils.declare_cudaCGGetIntrinsicHandle(mod),
(one,))
return sig, codegen
@overload(this_grid, target='cuda')
def _ol_this_grid():
def impl():
return _this_grid()
return impl
@intrinsic
def _grid_group_sync(typingctx, group):
sig = signature(types.int32, group)
def codegen(context, builder, sig, args):
flags = context.get_constant(types.int32, 0)
mod = builder.module
return builder.call(
nvvmutils.declare_cudaCGSynchronize(mod),
(*args, flags))
return sig, codegen
@overload_method(GridGroupClass, 'sync', target='cuda')
def _ol_grid_group_sync(group):
def impl(group):
return _grid_group_sync(group)
return impl
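# Illustrative sketch (not part of this module): grid-wide synchronization
# from within a kernel. Note that ``grid.sync()`` requires a cooperative
# launch.
#
# >>> from numba import cuda
# >>> @cuda.jit
# ... def kernel(x):
# ...     grid = cuda.cg.this_grid()
# ...     x[cuda.grid(1)] += 1
# ...     grid.sync()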

View File

@@ -0,0 +1,378 @@
from llvmlite import ir
from numba.core import config, serialize
from numba.core.codegen import Codegen, CodeLibrary
from .cudadrv import devices, driver, nvvm, runtime
from numba.cuda.cudadrv.libs import get_cudalib
import os
import subprocess
import tempfile
CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
def run_nvdisasm(cubin, flags):
# nvdisasm only accepts input from a file, so we need to write out to a
# temp file and clean up afterwards.
fd = None
fname = None
try:
fd, fname = tempfile.mkstemp()
with open(fname, 'wb') as f:
f.write(cubin)
try:
cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
except FileNotFoundError as e:
msg = ("nvdisasm has not been found. You may need "
"to install the CUDA toolkit and ensure that "
"it is available on your PATH.\n")
raise RuntimeError(msg) from e
return cp.stdout.decode('utf-8')
finally:
if fd is not None:
os.close(fd)
if fname is not None:
os.unlink(fname)
def disassemble_cubin(cubin):
# Request lineinfo in disassembly
flags = ['-gi']
return run_nvdisasm(cubin, flags)
def disassemble_cubin_for_cfg(cubin):
# Request control flow graph in disassembly
flags = ['-cfg']
return run_nvdisasm(cubin, flags)
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
"""
    The CUDACodeLibrary generates PTX, SASS, and cubins for multiple different
    compute capabilities. It also loads cubins onto multiple devices (via
    get_cufunc), which may be of different compute capabilities.
"""
def __init__(self, codegen, name, entry_name=None, max_registers=None,
nvvm_options=None):
"""
codegen:
Codegen object.
name:
Name of the function in the source.
entry_name:
Name of the kernel function in the binary, if this is a global
kernel and not a device function.
max_registers:
The maximum register usage to aim for when linking.
nvvm_options:
Dict of options to pass to NVVM.
"""
super().__init__(codegen, name)
# The llvmlite module for this library.
self._module = None
# CodeLibrary objects that will be "linked" into this library. The
# modules within them are compiled from NVVM IR to PTX along with the
# IR from this module - in that sense they are "linked" by NVVM at PTX
# generation time, rather than at link time.
self._linking_libraries = set()
# Files to link with the generated PTX. These are linked using the
# Driver API at link time.
self._linking_files = set()
# Should we link libcudadevrt?
self.needs_cudadevrt = False
# Cache the LLVM IR string
self._llvm_strs = None
# Maps CC -> PTX string
self._ptx_cache = {}
# Maps CC -> LTO-IR
self._ltoir_cache = {}
# Maps CC -> cubin
self._cubin_cache = {}
# Maps CC -> linker info output for cubin
self._linkerinfo_cache = {}
# Maps Device numeric ID -> cufunc
self._cufunc_cache = {}
self._max_registers = max_registers
if nvvm_options is None:
nvvm_options = {}
self._nvvm_options = nvvm_options
self._entry_name = entry_name
@property
def llvm_strs(self):
if self._llvm_strs is None:
self._llvm_strs = [str(mod) for mod in self.modules]
return self._llvm_strs
def get_llvm_str(self):
return "\n\n".join(self.llvm_strs)
def _ensure_cc(self, cc):
if cc is not None:
return cc
device = devices.get_context().device
return device.compute_capability
def get_asm_str(self, cc=None):
cc = self._ensure_cc(cc)
ptxes = self._ptx_cache.get(cc, None)
if ptxes:
return ptxes
arch = nvvm.get_arch_option(*cc)
options = self._nvvm_options.copy()
options['arch'] = arch
irs = self.llvm_strs
ptx = nvvm.compile_ir(irs, **options)
# Sometimes the result from NVVM contains trailing whitespace and
# nulls, which we strip so that the assembly dump looks a little
# tidier.
ptx = ptx.decode().strip('\x00').strip()
if config.DUMP_ASSEMBLY:
print(("ASSEMBLY %s" % self._name).center(80, '-'))
print(ptx)
print('=' * 80)
self._ptx_cache[cc] = ptx
return ptx
def get_ltoir(self, cc=None):
cc = self._ensure_cc(cc)
ltoir = self._ltoir_cache.get(cc, None)
if ltoir is not None:
return ltoir
arch = nvvm.get_arch_option(*cc)
options = self._nvvm_options.copy()
options['arch'] = arch
options['gen-lto'] = None
irs = self.llvm_strs
ltoir = nvvm.compile_ir(irs, **options)
self._ltoir_cache[cc] = ltoir
return ltoir
def get_cubin(self, cc=None):
cc = self._ensure_cc(cc)
cubin = self._cubin_cache.get(cc, None)
if cubin:
return cubin
linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)
if linker.lto:
ltoir = self.get_ltoir(cc=cc)
linker.add_ltoir(ltoir)
else:
ptx = self.get_asm_str(cc=cc)
linker.add_ptx(ptx.encode())
for path in self._linking_files:
linker.add_file_guess_ext(path)
if self.needs_cudadevrt:
linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
cubin = linker.complete()
self._cubin_cache[cc] = cubin
self._linkerinfo_cache[cc] = linker.info_log
return cubin
def get_cufunc(self):
if self._entry_name is None:
msg = "Missing entry_name - are you trying to get the cufunc " \
"for a device function?"
raise RuntimeError(msg)
ctx = devices.get_context()
device = ctx.device
cufunc = self._cufunc_cache.get(device.id, None)
if cufunc:
return cufunc
cubin = self.get_cubin(cc=device.compute_capability)
module = ctx.create_module_image(cubin)
# Load
cufunc = module.get_function(self._entry_name)
# Populate caches
self._cufunc_cache[device.id] = cufunc
return cufunc
def get_linkerinfo(self, cc):
try:
return self._linkerinfo_cache[cc]
except KeyError:
raise KeyError(f'No linkerinfo for CC {cc}')
def get_sass(self, cc=None):
return disassemble_cubin(self.get_cubin(cc=cc))
def get_sass_cfg(self, cc=None):
return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))
def add_ir_module(self, mod):
self._raise_if_finalized()
if self._module is not None:
raise RuntimeError('CUDACodeLibrary only supports one module')
self._module = mod
def add_linking_library(self, library):
library._ensure_finalized()
# We don't want to allow linking more libraries in after finalization
# because our linked libraries are modified by the finalization, and we
# won't be able to finalize again after adding new ones
self._raise_if_finalized()
self._linking_libraries.add(library)
def add_linking_file(self, filepath):
self._linking_files.add(filepath)
def get_function(self, name):
for fn in self._module.functions:
if fn.name == name:
return fn
raise KeyError(f'Function {name} not found')
@property
def modules(self):
return [self._module] + [mod for lib in self._linking_libraries
for mod in lib.modules]
@property
def linking_libraries(self):
# Libraries we link to may link to other libraries, so we recursively
# traverse the linking libraries property to build up a list of all
# linked libraries.
libs = []
for lib in self._linking_libraries:
libs.extend(lib.linking_libraries)
libs.append(lib)
return libs
def finalize(self):
# Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
# we only adjust the linkage of functions. Global kernels (with
# external linkage) have their linkage untouched. Device functions are
# set linkonce_odr to prevent them appearing in the PTX.
self._raise_if_finalized()
# Note in-place modification of the linkage of functions in linked
# libraries. This presently causes no issues as only device functions
# are shared across code libraries, so they would always need their
# linkage set to linkonce_odr. If in a future scenario some code
# libraries require linkonce_odr linkage of functions in linked
# modules, and another code library requires another linkage, each code
# library will need to take its own private copy of its linked modules.
#
# See also discussion on PR #890:
# https://github.com/numba/numba/pull/890
for library in self._linking_libraries:
for mod in library.modules:
for fn in mod.functions:
if not fn.is_declaration:
fn.linkage = 'linkonce_odr'
self._finalized = True
def _reduce_states(self):
"""
Reduce the instance for serialization. We retain the PTX and cubins,
but loaded functions are discarded. They are recreated when needed
after deserialization.
"""
if self._linking_files:
msg = 'Cannot pickle CUDACodeLibrary with linking files'
raise RuntimeError(msg)
if not self._finalized:
raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
return dict(
codegen=None,
name=self.name,
entry_name=self._entry_name,
llvm_strs=self.llvm_strs,
ptx_cache=self._ptx_cache,
cubin_cache=self._cubin_cache,
linkerinfo_cache=self._linkerinfo_cache,
max_registers=self._max_registers,
nvvm_options=self._nvvm_options,
needs_cudadevrt=self.needs_cudadevrt
)
@classmethod
def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
needs_cudadevrt):
"""
Rebuild an instance.
"""
instance = cls(codegen, name, entry_name=entry_name)
instance._llvm_strs = llvm_strs
instance._ptx_cache = ptx_cache
instance._cubin_cache = cubin_cache
instance._linkerinfo_cache = linkerinfo_cache
instance._max_registers = max_registers
instance._nvvm_options = nvvm_options
instance.needs_cudadevrt = needs_cudadevrt
instance._finalized = True
return instance
class JITCUDACodegen(Codegen):
"""
This codegen implementation for CUDA only generates optimized LLVM IR.
Generation of PTX code is done separately (see numba.cuda.compiler).
"""
_library_class = CUDACodeLibrary
def __init__(self, module_name):
pass
def _create_empty_module(self, name):
ir_module = ir.Module(name)
ir_module.triple = CUDA_TRIPLE
ir_module.data_layout = nvvm.NVVM().data_layout
nvvm.add_ir_version(ir_module)
return ir_module
def _add_module(self, module):
pass
def magic_tuple(self):
"""
Return a tuple unambiguously describing the codegen behaviour.
"""
ctx = devices.get_context()
cc = ctx.device.compute_capability
return (runtime.runtime.get_version(), cc)

View File

@@ -0,0 +1,422 @@
from llvmlite import ir
from numba.core.typing.templates import ConcreteTemplate
from numba.core import types, typing, funcdesc, config, compiler, sigutils
from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
DefaultPassBuilder, Flags, Option,
CompileResult)
from numba.core.compiler_lock import global_compiler_lock
from numba.core.compiler_machinery import (LoweringPass,
PassManager, register_pass)
from numba.core.errors import NumbaInvalidConfigWarning
from numba.core.typed_passes import (IRLegalization, NativeLowering,
AnnotateTypes)
from warnings import warn
from numba.cuda.api import get_current_device
from numba.cuda.target import CUDACABICallConv
def _nvvm_options_type(x):
if x is None:
return None
else:
assert isinstance(x, dict)
return x
class CUDAFlags(Flags):
nvvm_options = Option(
type=_nvvm_options_type,
default=None,
doc="NVVM options",
)
compute_capability = Option(
type=tuple,
default=None,
doc="Compute Capability",
)
# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
# id. This is because the entry point is used as a key into a dict of
# overloads by the base dispatcher. The id of the CCR is the only small and
# unique property of a CompileResult in the CUDA target (cf. the CPU target,
# which uses its entry_point, which is a pointer value).
#
# This does feel a little hackish, and there are two ways in which this could
# be improved:
#
# 1. We could change the core of Numba so that each CompileResult has its own
# unique ID that can be used as a key - e.g. a count, similar to the way in
# which types have unique counts.
# 2. At some future time when kernel launch uses a compiled function, the entry
# point will no longer need to be a synthetic value, but will instead be a
# pointer to the compiled function as in the CPU target.
class CUDACompileResult(CompileResult):
@property
def entry_point(self):
return id(self)
def cuda_compile_result(**entries):
entries = sanitize_compile_result_entries(entries)
return CUDACompileResult(**entries)
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDABackend(LoweringPass):
_name = "cuda_backend"
def __init__(self):
LoweringPass.__init__(self)
def run_pass(self, state):
"""
Back-end: Packages lowering output in a compile result
"""
lowered = state['cr']
signature = typing.signature(state.return_type, *state.args)
state.cr = cuda_compile_result(
typing_context=state.typingctx,
target_context=state.targetctx,
typing_error=state.status.fail_reason,
type_annotation=state.type_annotation,
library=state.library,
call_helper=lowered.call_helper,
signature=signature,
fndesc=lowered.fndesc,
)
return True
@register_pass(mutates_CFG=False, analysis_only=False)
class CreateLibrary(LoweringPass):
"""
Create a CUDACodeLibrary for the NativeLowering pass to populate. The
NativeLowering pass will create a code library if none exists, but we need
to set it up with nvvm_options from the flags if they are present.
"""
_name = "create_library"
def __init__(self):
LoweringPass.__init__(self)
def run_pass(self, state):
codegen = state.targetctx.codegen()
name = state.func_id.func_qualname
nvvm_options = state.flags.nvvm_options
state.library = codegen.create_library(name, nvvm_options=nvvm_options)
# Enable object caching upfront so that the library can be serialized.
state.library.enable_object_caching()
return True
class CUDACompiler(CompilerBase):
def define_pipelines(self):
dpb = DefaultPassBuilder
pm = PassManager('cuda')
untyped_passes = dpb.define_untyped_pipeline(self.state)
pm.passes.extend(untyped_passes.passes)
typed_passes = dpb.define_typed_pipeline(self.state)
pm.passes.extend(typed_passes.passes)
lowering_passes = self.define_cuda_lowering_pipeline(self.state)
pm.passes.extend(lowering_passes.passes)
pm.finalize()
return [pm]
def define_cuda_lowering_pipeline(self, state):
pm = PassManager('cuda_lowering')
# legalise
pm.add_pass(IRLegalization,
"ensure IR is legal prior to lowering")
pm.add_pass(AnnotateTypes, "annotate types")
# lower
pm.add_pass(CreateLibrary, "create library")
pm.add_pass(NativeLowering, "native lowering")
pm.add_pass(CUDABackend, "cuda backend")
pm.finalize()
return pm
@global_compiler_lock
def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
inline=False, fastmath=False, nvvm_options=None,
cc=None):
if cc is None:
raise ValueError('Compute Capability must be supplied')
from .descriptor import cuda_target
typingctx = cuda_target.typing_context
targetctx = cuda_target.target_context
flags = CUDAFlags()
# Do not compile (generate native code), just lower (to LLVM)
flags.no_compile = True
flags.no_cpython_wrapper = True
flags.no_cfunc_wrapper = True
# Both debug and lineinfo turn on debug information in the compiled code,
# but we keep them separate arguments in case we later want to overload
# some other behavior on the debug flag. In particular, -opt=3 is not
# supported with debug enabled, and enabling only lineinfo should not
# affect the error model.
if debug or lineinfo:
flags.debuginfo = True
if lineinfo:
flags.dbg_directives_only = True
if debug:
flags.error_model = 'python'
else:
flags.error_model = 'numpy'
if inline:
flags.forceinline = True
if fastmath:
flags.fastmath = True
if nvvm_options:
flags.nvvm_options = nvvm_options
flags.compute_capability = cc
# Run compilation pipeline
from numba.core.target_extension import target_override
with target_override('cuda'):
cres = compiler.compile_extra(typingctx=typingctx,
targetctx=targetctx,
func=pyfunc,
args=args,
return_type=return_type,
flags=flags,
locals={},
pipeline_class=CUDACompiler)
library = cres.library
library.finalize()
return cres
def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
nvvm_options):
"""
Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.
The C ABI wrapper will have the same name as the source Python function.
"""
# The wrapper will be contained in a new library that links to the wrapped
# function's library
library = lib.codegen.create_library(f'{lib.name}_function_',
entry_name=wrapper_function_name,
nvvm_options=nvvm_options)
library.add_linking_library(lib)
# Determine the caller (C ABI) and wrapper (Numba ABI) function types
argtypes = fndesc.argtypes
restype = fndesc.restype
c_call_conv = CUDACABICallConv(context)
wrapfnty = c_call_conv.get_function_type(restype, argtypes)
fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)
# Create a new module and declare the callee
wrapper_module = context.create_module("cuda.cabi.wrapper")
func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)
# Define the caller - populate it with a call to the callee and return
# its return value
wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
builder = ir.IRBuilder(wrapfn.append_basic_block(''))
arginfo = context.get_arg_packer(argtypes)
callargs = arginfo.from_arguments(builder, wrapfn.args)
# We get (status, return_value), but we ignore the status since we
# can't propagate it through the C ABI anyway
_, return_value = context.call_conv.call_function(
builder, func, restype, argtypes, callargs)
builder.ret(return_value)
library.add_ir_module(wrapper_module)
library.finalize()
return library
@global_compiler_lock
def compile(pyfunc, sig, debug=False, lineinfo=False, device=True,
fastmath=False, cc=None, opt=True, abi="c", abi_info=None,
output='ptx'):
"""Compile a Python function to PTX or LTO-IR for a given set of argument
types.
:param pyfunc: The Python function to compile.
:param sig: The signature representing the function's input and output
types. If this is a tuple of argument types without a return
type, the inferred return type is returned by this function. If
a signature including a return type is passed, the compiled code
will include a cast from the inferred return type to the
specified return type, and this function will return the
specified return type.
:param debug: Whether to include debug info in the compiled code.
:type debug: bool
:param lineinfo: Whether to include a line mapping from the compiled code
to the source code. Usually this is used with optimized
code (since debug mode would automatically include this),
so we want debug info in the LLVM IR but only the line
mapping in the final output.
:type lineinfo: bool
:param device: Whether to compile a device function.
:type device: bool
    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
                     prec_div=0, and fma=1)
:type fastmath: bool
:param cc: Compute capability to compile for, as a tuple
``(MAJOR, MINOR)``. Defaults to ``(5, 0)``.
:type cc: tuple
:param opt: Enable optimizations. Defaults to ``True``.
:type opt: bool
:param abi: The ABI for a compiled function - either ``"numba"`` or
``"c"``. Note that the Numba ABI is not considered stable.
The C ABI is only supported for device functions at present.
:type abi: str
:param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
one option, ``"abi_name"``, for providing the wrapper
function's name. The ``"numba"`` ABI has no options.
:type abi_info: dict
:param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
:type output: str
:return: (code, resty): The compiled code and inferred return type
:rtype: tuple
"""
if abi not in ("numba", "c"):
raise NotImplementedError(f'Unsupported ABI: {abi}')
if abi == 'c' and not device:
raise NotImplementedError('The C ABI is not supported for kernels')
if output not in ("ptx", "ltoir"):
raise NotImplementedError(f'Unsupported output type: {output}')
if debug and opt:
msg = ("debug=True with opt=True (the default) "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False.")
warn(NumbaInvalidConfigWarning(msg))
lto = (output == 'ltoir')
abi_info = abi_info or dict()
nvvm_options = {
'fastmath': fastmath,
'opt': 3 if opt else 0
}
if lto:
nvvm_options['gen-lto'] = None
args, return_type = sigutils.normalize_signature(sig)
cc = cc or config.CUDA_DEFAULT_PTX_CC
cres = compile_cuda(pyfunc, return_type, args, debug=debug,
lineinfo=lineinfo, fastmath=fastmath,
nvvm_options=nvvm_options, cc=cc)
resty = cres.signature.return_type
if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")
tgt = cres.target_context
if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get('abi_name', pyfunc.__name__)
lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
nvvm_options)
else:
code = pyfunc.__code__
filename = code.co_filename
linenum = code.co_firstlineno
lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
lineinfo, nvvm_options, filename,
linenum)
if lto:
code = lib.get_ltoir(cc=cc)
else:
code = lib.get_asm_str(cc=cc)
return code, resty
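# Illustrative sketch (not part of this module): compile a device function to
# PTX for an arbitrarily-chosen compute capability.
#
# >>> from numba import float32
# >>> def add(x, y):
# ...     return x + y
# >>> ptx, resty = compile(add, (float32, float32), cc=(7, 5))
# >>> resty
# float32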
def compile_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
device=True, fastmath=False, opt=True,
abi="c", abi_info=None, output='ptx'):
"""Compile a Python function to PTX or LTO-IR for a given signature for the
    current device's compute capability. This calls :func:`compile` with an
appropriate ``cc`` value for the current device."""
cc = get_current_device().compute_capability
return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
fastmath=fastmath, cc=cc, opt=opt, abi=abi,
abi_info=abi_info, output=output)
def compile_ptx(pyfunc, sig, debug=False, lineinfo=False, device=False,
fastmath=False, cc=None, opt=True, abi="numba", abi_info=None):
"""Compile a Python function to PTX for a given signature. See
:func:`compile`. The defaults for this function are to compile a kernel
with the Numba ABI, rather than :func:`compile`'s default of compiling a
device function with the C ABI."""
return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
fastmath=fastmath, cc=cc, opt=opt, abi=abi,
abi_info=abi_info, output='ptx')
def compile_ptx_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
device=False, fastmath=False, opt=True,
abi="numba", abi_info=None):
"""Compile a Python function to PTX for a given signature for the current
    device's compute capability. See :func:`compile_ptx`."""
cc = get_current_device().compute_capability
return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo,
device=device, fastmath=fastmath, cc=cc, opt=opt,
abi=abi, abi_info=abi_info)
def declare_device_function(name, restype, argtypes):
return declare_device_function_template(name, restype, argtypes).key
def declare_device_function_template(name, restype, argtypes):
from .descriptor import cuda_target
typingctx = cuda_target.typing_context
targetctx = cuda_target.target_context
sig = typing.signature(restype, *argtypes)
extfn = ExternFunction(name, sig)
class device_function_template(ConcreteTemplate):
key = extfn
cases = [sig]
fndesc = funcdesc.ExternalFunctionDescriptor(
name=name, restype=restype, argtypes=argtypes)
typingctx.insert_user_function(extfn, device_function_template)
targetctx.insert_user_function(extfn, fndesc)
return device_function_template
class ExternFunction(object):
def __init__(self, name, sig):
self.name = name
self.sig = sig
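# Illustrative sketch (not part of this module): the public
# ``cuda.declare_device`` builds on ``declare_device_function`` so that an
# externally-defined device function can be called from kernels; the matching
# definition must be linked in separately (e.g. a hypothetical "mul.cu"
# passed via ``@cuda.jit(link=...)``).
#
# >>> from numba import cuda, float32
# >>> mul = cuda.declare_device('mul', float32(float32, float32))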

View File

@@ -0,0 +1,47 @@
#include "cuda_fp16.h"
#define FNDEF(fname) __numba_wrapper_ ## fname
#define UNARY_FUNCTION(fname) extern "C" __device__ int\
FNDEF(fname)( \
short* return_value,\
short x\
)\
{\
__half retval = fname(__short_as_half (x));\
\
*return_value = __half_as_short (retval);\
/* Signal that no Python exception occurred */ \
return 0;\
}\
extern "C" __device__ int
FNDEF(hdiv)(
short* return_value,
short x,
short y
)
{
__half retval = __hdiv(__short_as_half (x), __short_as_half (y));
*return_value = __half_as_short (retval);
// Signal that no Python exception occurred
return 0;
}
UNARY_FUNCTION(hsin)
UNARY_FUNCTION(hcos)
UNARY_FUNCTION(hlog)
UNARY_FUNCTION(hlog10)
UNARY_FUNCTION(hlog2)
UNARY_FUNCTION(hexp)
UNARY_FUNCTION(hexp10)
UNARY_FUNCTION(hexp2)
UNARY_FUNCTION(hsqrt)
UNARY_FUNCTION(hrsqrt)
UNARY_FUNCTION(hfloor)
UNARY_FUNCTION(hceil)
UNARY_FUNCTION(hrcp)
UNARY_FUNCTION(hrint)
UNARY_FUNCTION(htrunc)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,258 @@
import sys
import re
import os
from collections import namedtuple
from numba.core.config import IS_WIN32
from numba.misc.findlib import find_lib, find_file
_env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
def _find_valid_path(options):
"""Find valid path from *options*, which is a list of 2-tuple of
(name, path). Return first pair where *path* is not None.
If no valid path is found, return ('<unknown>', None)
"""
for by, data in options:
if data is not None:
return by, data
else:
return '<unknown>', None
def _get_libdevice_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_libdevice_ctk()),
('CUDA_HOME', get_cuda_home('nvvm', 'libdevice')),
('System', get_system_ctk('nvvm', 'libdevice')),
('Debian package', get_debian_pkg_libdevice()),
]
by, libdir = _find_valid_path(options)
return by, libdir
def _nvvm_lib_dir():
if IS_WIN32:
return 'nvvm', 'bin'
else:
return 'nvvm', 'lib64'
def _get_nvvm_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_nvvm_ctk()),
('CUDA_HOME', get_cuda_home(*_nvvm_lib_dir())),
('System', get_system_ctk(*_nvvm_lib_dir())),
]
by, path = _find_valid_path(options)
return by, path
def _get_libdevice_paths():
by, libdir = _get_libdevice_path_decision()
# Search for pattern
pat = r'libdevice(\.\d+)*\.bc$'
candidates = find_file(re.compile(pat), libdir)
# Keep only the max (most recent version) of the bitcode files.
out = max(candidates, default=None)
return _env_path_tuple(by, out)
def _cudalib_path():
if IS_WIN32:
return 'bin'
else:
return 'lib64'
def _cuda_home_static_cudalib_path():
if IS_WIN32:
return ('lib', 'x64')
else:
return ('lib64',)
def _get_cudalib_dir_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_cudalib_ctk()),
('CUDA_HOME', get_cuda_home(_cudalib_path())),
('System', get_system_ctk(_cudalib_path())),
]
by, libdir = _find_valid_path(options)
return by, libdir
def _get_static_cudalib_dir_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_static_cudalib_ctk()),
('CUDA_HOME', get_cuda_home(*_cuda_home_static_cudalib_path())),
('System', get_system_ctk(_cudalib_path())),
]
by, libdir = _find_valid_path(options)
return by, libdir
def _get_cudalib_dir():
by, libdir = _get_cudalib_dir_path_decision()
return _env_path_tuple(by, libdir)
def _get_static_cudalib_dir():
by, libdir = _get_static_cudalib_dir_path_decision()
return _env_path_tuple(by, libdir)
def get_system_ctk(*subdirs):
"""Return path to system-wide cudatoolkit; or, None if it doesn't exist.
"""
# Linux?
if sys.platform.startswith('linux'):
# Is cuda alias to /usr/local/cuda?
# We are intentionally not getting versioned cuda installation.
base = '/usr/local/cuda'
if os.path.exists(base):
return os.path.join(base, *subdirs)
def get_conda_ctk():
"""Return path to directory containing the shared libraries of cudatoolkit.
"""
is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
if not is_conda_env:
return
# Assume the existence of NVVM to imply cudatoolkit installed
paths = find_lib('nvvm')
if not paths:
return
# Use the directory name of the max path
return os.path.dirname(max(paths))
def get_nvidia_nvvm_ctk():
"""Return path to directory containing the NVVM shared library.
"""
is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
if not is_conda_env:
return
# Assume the existence of NVVM in the conda env implies that a CUDA toolkit
# conda package is installed.
# First, try the location used on Linux and the Windows 11.x packages
libdir = os.path.join(sys.prefix, 'nvvm', _cudalib_path())
if not os.path.exists(libdir) or not os.path.isdir(libdir):
# If that fails, try the location used for Windows 12.x packages
libdir = os.path.join(sys.prefix, 'Library', 'nvvm', _cudalib_path())
if not os.path.exists(libdir) or not os.path.isdir(libdir):
# If that doesn't exist either, assume we don't have the NVIDIA
# conda package
return
paths = find_lib('nvvm', libdir=libdir)
if not paths:
return
# Use the directory name of the max path
return os.path.dirname(max(paths))
def get_nvidia_libdevice_ctk():
"""Return path to directory containing the libdevice library.
"""
nvvm_ctk = get_nvidia_nvvm_ctk()
if not nvvm_ctk:
return
nvvm_dir = os.path.dirname(nvvm_ctk)
return os.path.join(nvvm_dir, 'libdevice')
def get_nvidia_cudalib_ctk():
"""Return path to directory containing the shared libraries of cudatoolkit.
"""
nvvm_ctk = get_nvidia_nvvm_ctk()
if not nvvm_ctk:
return
env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
subdir = 'bin' if IS_WIN32 else 'lib'
return os.path.join(env_dir, subdir)
def get_nvidia_static_cudalib_ctk():
"""Return path to directory containing the static libraries of cudatoolkit.
"""
nvvm_ctk = get_nvidia_nvvm_ctk()
if not nvvm_ctk:
return
if IS_WIN32 and ("Library" not in nvvm_ctk):
# Location specific to CUDA 11.x packages on Windows
dirs = ('Lib', 'x64')
else:
# Linux, or Windows with CUDA 12.x packages
dirs = ('lib',)
env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
return os.path.join(env_dir, *dirs)
def get_cuda_home(*subdirs):
"""Get paths of CUDA_HOME.
If *subdirs* are the subdirectory name to be appended in the resulting
path.
"""
cuda_home = os.environ.get('CUDA_HOME')
if cuda_home is None:
# Try Windows CUDA installation without Anaconda
cuda_home = os.environ.get('CUDA_PATH')
if cuda_home is not None:
return os.path.join(cuda_home, *subdirs)
def _get_nvvm_path():
by, path = _get_nvvm_path_decision()
candidates = find_lib('nvvm', path)
path = max(candidates) if candidates else None
return _env_path_tuple(by, path)
def get_cuda_paths():
"""Returns a dictionary mapping component names to a 2-tuple
of (source_variable, info).
The returned dictionary will have the following keys and infos:
- "nvvm": file_path
- "libdevice": List[Tuple[arch, file_path]]
- "cudalib_dir": directory_path
Note: The result of the function is cached.
"""
# Check cache
if hasattr(get_cuda_paths, '_cached_result'):
return get_cuda_paths._cached_result
else:
# Not in cache
d = {
'nvvm': _get_nvvm_path(),
'libdevice': _get_libdevice_paths(),
'cudalib_dir': _get_cudalib_dir(),
'static_cudalib_dir': _get_static_cudalib_dir(),
}
# Cache result
get_cuda_paths._cached_result = d
return d
def get_debian_pkg_libdevice():
"""
Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
exists.
"""
pkg_libdevice_location = '/usr/lib/nvidia-cuda-toolkit/libdevice'
if not os.path.exists(pkg_libdevice_location):
return None
return pkg_libdevice_location
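# Illustrative sketch (not part of this module): inspect which source each
# component was resolved from. The concrete paths vary by system.
#
# >>> paths = get_cuda_paths()
# >>> paths['nvvm'].by
# 'Conda environment'
# >>> paths['nvvm'].info
# '/.../lib/libnvvm.so'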

View File

@@ -0,0 +1,806 @@
import operator
from numba.core import types
from numba.core.typing.npydecl import (parse_dtype, parse_shape,
register_number_classes,
register_numpy_ufunc,
trigonometric_functions,
comparison_functions,
math_operations,
bit_twiddling_functions)
from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
AbstractTemplate, CallableTemplate,
signature, Registry)
from numba.cuda.types import dim3
from numba.core.typeconv import Conversion
from numba import cuda
from numba.cuda.compiler import declare_device_function_template
registry = Registry()
register = registry.register
register_attr = registry.register_attr
register_global = registry.register_global
register_number_classes(register_global)
class Cuda_array_decl(CallableTemplate):
def generic(self):
def typer(shape, dtype):
# Only integer literals and tuples of integer literals are valid
# shapes
if isinstance(shape, types.Integer):
if not isinstance(shape, types.IntegerLiteral):
return None
elif isinstance(shape, (types.Tuple, types.UniTuple)):
if any([not isinstance(s, types.IntegerLiteral)
for s in shape]):
return None
else:
return None
ndim = parse_shape(shape)
nb_dtype = parse_dtype(dtype)
if nb_dtype is not None and ndim is not None:
return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
return typer
@register
class Cuda_shared_array(Cuda_array_decl):
key = cuda.shared.array
@register
class Cuda_local_array(Cuda_array_decl):
key = cuda.local.array
@register
class Cuda_const_array_like(CallableTemplate):
key = cuda.const.array_like
def generic(self):
def typer(ndarray):
return ndarray
return typer
@register
class Cuda_threadfence_device(ConcreteTemplate):
key = cuda.threadfence
cases = [signature(types.none)]
@register
class Cuda_threadfence_block(ConcreteTemplate):
key = cuda.threadfence_block
cases = [signature(types.none)]
@register
class Cuda_threadfence_system(ConcreteTemplate):
key = cuda.threadfence_system
cases = [signature(types.none)]
@register
class Cuda_syncwarp(ConcreteTemplate):
key = cuda.syncwarp
cases = [signature(types.none), signature(types.none, types.i4)]
@register
class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
key = cuda.shfl_sync_intrinsic
cases = [
signature(types.Tuple((types.i4, types.b1)),
types.i4, types.i4, types.i4, types.i4, types.i4),
signature(types.Tuple((types.i8, types.b1)),
types.i4, types.i4, types.i8, types.i4, types.i4),
signature(types.Tuple((types.f4, types.b1)),
types.i4, types.i4, types.f4, types.i4, types.i4),
signature(types.Tuple((types.f8, types.b1)),
types.i4, types.i4, types.f8, types.i4, types.i4),
]
@register
class Cuda_vote_sync_intrinsic(ConcreteTemplate):
key = cuda.vote_sync_intrinsic
cases = [signature(types.Tuple((types.i4, types.b1)),
types.i4, types.i4, types.b1)]
@register
class Cuda_match_any_sync(ConcreteTemplate):
key = cuda.match_any_sync
cases = [
signature(types.i4, types.i4, types.i4),
signature(types.i4, types.i4, types.i8),
signature(types.i4, types.i4, types.f4),
signature(types.i4, types.i4, types.f8),
]
@register
class Cuda_match_all_sync(ConcreteTemplate):
key = cuda.match_all_sync
cases = [
signature(types.Tuple((types.i4, types.b1)), types.i4, types.i4),
signature(types.Tuple((types.i4, types.b1)), types.i4, types.i8),
signature(types.Tuple((types.i4, types.b1)), types.i4, types.f4),
signature(types.Tuple((types.i4, types.b1)), types.i4, types.f8),
]
@register
class Cuda_activemask(ConcreteTemplate):
key = cuda.activemask
cases = [signature(types.uint32)]
@register
class Cuda_lanemask_lt(ConcreteTemplate):
key = cuda.lanemask_lt
cases = [signature(types.uint32)]
@register
class Cuda_popc(ConcreteTemplate):
"""
Supported types from `llvm.popc`
[here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
"""
key = cuda.popc
cases = [
signature(types.int8, types.int8),
signature(types.int16, types.int16),
signature(types.int32, types.int32),
signature(types.int64, types.int64),
signature(types.uint8, types.uint8),
signature(types.uint16, types.uint16),
signature(types.uint32, types.uint32),
signature(types.uint64, types.uint64),
]
@register
class Cuda_fma(ConcreteTemplate):
"""
Supported types from `llvm.fma`
[here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics)
"""
key = cuda.fma
cases = [
signature(types.float32, types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64, types.float64),
]
@register
class Cuda_hfma(ConcreteTemplate):
key = cuda.fp16.hfma
cases = [
signature(types.float16, types.float16, types.float16, types.float16)
]
@register
class Cuda_cbrt(ConcreteTemplate):
key = cuda.cbrt
cases = [
signature(types.float32, types.float32),
signature(types.float64, types.float64),
]
@register
class Cuda_brev(ConcreteTemplate):
key = cuda.brev
cases = [
signature(types.uint32, types.uint32),
signature(types.uint64, types.uint64),
]
@register
class Cuda_clz(ConcreteTemplate):
"""
Supported types from `llvm.ctlz`
[here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
"""
key = cuda.clz
cases = [
signature(types.int8, types.int8),
signature(types.int16, types.int16),
signature(types.int32, types.int32),
signature(types.int64, types.int64),
signature(types.uint8, types.uint8),
signature(types.uint16, types.uint16),
signature(types.uint32, types.uint32),
signature(types.uint64, types.uint64),
]
@register
class Cuda_ffs(ConcreteTemplate):
"""
Supported types from `llvm.cttz`
[here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
"""
key = cuda.ffs
cases = [
signature(types.uint32, types.int8),
signature(types.uint32, types.int16),
signature(types.uint32, types.int32),
signature(types.uint32, types.int64),
signature(types.uint32, types.uint8),
signature(types.uint32, types.uint16),
signature(types.uint32, types.uint32),
signature(types.uint32, types.uint64),
]
@register
class Cuda_selp(AbstractTemplate):
key = cuda.selp
def generic(self, args, kws):
assert not kws
test, a, b = args
# per docs
# http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
supported_types = (types.float64, types.float32,
types.int16, types.uint16,
types.int32, types.uint32,
types.int64, types.uint64)
if a != b or a not in supported_types:
return
return signature(a, test, a, a)
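# Illustrative sketch (not part of this module): ``selp`` selects between two
# values of the same supported type based on a predicate, without branching.
#
# >>> @cuda.jit
# ... def clamp_negative(x):
# ...     i = cuda.grid(1)
# ...     if i < x.size:
# ...         x[i] = cuda.selp(x[i] > 0.0, x[i], 0.0)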
def _genfp16_unary(l_key):
@register
class Cuda_fp16_unary(ConcreteTemplate):
key = l_key
cases = [signature(types.float16, types.float16)]
return Cuda_fp16_unary
def _genfp16_unary_operator(l_key):
@register_global(l_key)
class Cuda_fp16_unary(AbstractTemplate):
key = l_key
def generic(self, args, kws):
assert not kws
if len(args) == 1 and args[0] == types.float16:
return signature(types.float16, types.float16)
return Cuda_fp16_unary
def _genfp16_binary(l_key):
@register
class Cuda_fp16_binary(ConcreteTemplate):
key = l_key
cases = [signature(types.float16, types.float16, types.float16)]
return Cuda_fp16_binary
@register_global(float)
class Float(AbstractTemplate):
def generic(self, args, kws):
assert not kws
[arg] = args
if arg == types.float16:
return signature(arg, arg)
def _genfp16_binary_comparison(l_key):
@register
class Cuda_fp16_cmp(ConcreteTemplate):
key = l_key
cases = [
signature(types.b1, types.float16, types.float16)
]
return Cuda_fp16_cmp
# If multiple ConcreteTemplates provide typing for a single function, then
# function resolution will pick the first compatible typing it finds even if it
# involves inserting a cast that would be considered undesirable (in this
# specific case, float16s could be cast to float32s for comparisons).
#
# To work around this, we instead use an AbstractTemplate that implements
# exactly the casting logic that we desire. The AbstractTemplate gets
# considered in preference to ConcreteTemplates during typing.
#
# This is tracked as Issue #7863 (https://github.com/numba/numba/issues/7863) -
# once this is resolved it should be possible to replace this AbstractTemplate
# with a ConcreteTemplate to simplify the logic.
def _fp16_binary_operator(l_key, retty):
@register_global(l_key)
class Cuda_fp16_operator(AbstractTemplate):
key = l_key
def generic(self, args, kws):
assert not kws
if len(args) == 2 and \
(args[0] == types.float16 or args[1] == types.float16):
if (args[0] == types.float16):
convertible = self.context.can_convert(args[1], args[0])
else:
convertible = self.context.can_convert(args[0], args[1])
# We allow three cases here (conversion is from the other operand type
# to fp16):
#
# 1. fp16 to fp16 - Conversion.exact
# 2. fp16 to other types fp16 can be promoted to - Conversion.promote
# 3. int8 to fp16 (safe conversion) - Conversion.safe
if (convertible == Conversion.exact) or \
(convertible == Conversion.promote) or \
(convertible == Conversion.safe):
return signature(retty, types.float16, types.float16)
return Cuda_fp16_operator
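# Illustrative sketch (not part of the original source): with the template
# above, an expression such as `h + numpy.int8(1)` where `h` is a float16
# types as float16 + float16 -> float16, because int8 converts to float16
# with one of the allowed Conversion kinds; an operand that would need an
# unsafe cast yields no signature here and falls through to other templates.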
def _genfp16_comparison_operator(op):
return _fp16_binary_operator(op, types.b1)
def _genfp16_binary_operator(op):
return _fp16_binary_operator(op, types.float16)
Cuda_hadd = _genfp16_binary(cuda.fp16.hadd)
Cuda_add = _genfp16_binary_operator(operator.add)
Cuda_iadd = _genfp16_binary_operator(operator.iadd)
Cuda_hsub = _genfp16_binary(cuda.fp16.hsub)
Cuda_sub = _genfp16_binary_operator(operator.sub)
Cuda_isub = _genfp16_binary_operator(operator.isub)
Cuda_hmul = _genfp16_binary(cuda.fp16.hmul)
Cuda_mul = _genfp16_binary_operator(operator.mul)
Cuda_imul = _genfp16_binary_operator(operator.imul)
Cuda_hmax = _genfp16_binary(cuda.fp16.hmax)
Cuda_hmin = _genfp16_binary(cuda.fp16.hmin)
Cuda_hneg = _genfp16_unary(cuda.fp16.hneg)
Cuda_neg = _genfp16_unary_operator(operator.neg)
Cuda_habs = _genfp16_unary(cuda.fp16.habs)
Cuda_abs = _genfp16_unary_operator(abs)
Cuda_heq = _genfp16_binary_comparison(cuda.fp16.heq)
_genfp16_comparison_operator(operator.eq)
Cuda_hne = _genfp16_binary_comparison(cuda.fp16.hne)
_genfp16_comparison_operator(operator.ne)
Cuda_hge = _genfp16_binary_comparison(cuda.fp16.hge)
_genfp16_comparison_operator(operator.ge)
Cuda_hgt = _genfp16_binary_comparison(cuda.fp16.hgt)
_genfp16_comparison_operator(operator.gt)
Cuda_hle = _genfp16_binary_comparison(cuda.fp16.hle)
_genfp16_comparison_operator(operator.le)
Cuda_hlt = _genfp16_binary_comparison(cuda.fp16.hlt)
_genfp16_comparison_operator(operator.lt)
_genfp16_binary_operator(operator.truediv)
_genfp16_binary_operator(operator.itruediv)
def _resolve_wrapped_unary(fname):
decl = declare_device_function_template(f'__numba_wrapper_{fname}',
types.float16,
(types.float16,))
return types.Function(decl)
def _resolve_wrapped_binary(fname):
decl = declare_device_function_template(f'__numba_wrapper_{fname}',
types.float16,
(types.float16, types.float16,))
return types.Function(decl)
hsin_device = _resolve_wrapped_unary('hsin')
hcos_device = _resolve_wrapped_unary('hcos')
hlog_device = _resolve_wrapped_unary('hlog')
hlog10_device = _resolve_wrapped_unary('hlog10')
hlog2_device = _resolve_wrapped_unary('hlog2')
hexp_device = _resolve_wrapped_unary('hexp')
hexp10_device = _resolve_wrapped_unary('hexp10')
hexp2_device = _resolve_wrapped_unary('hexp2')
hsqrt_device = _resolve_wrapped_unary('hsqrt')
hrsqrt_device = _resolve_wrapped_unary('hrsqrt')
hfloor_device = _resolve_wrapped_unary('hfloor')
hceil_device = _resolve_wrapped_unary('hceil')
hrcp_device = _resolve_wrapped_unary('hrcp')
hrint_device = _resolve_wrapped_unary('hrint')
htrunc_device = _resolve_wrapped_unary('htrunc')
hdiv_device = _resolve_wrapped_binary('hdiv')
# generate atomic operations
def _gen(l_key, supported_types):
@register
class Cuda_atomic(AbstractTemplate):
key = l_key
def generic(self, args, kws):
assert not kws
ary, idx, val = args
if ary.dtype not in supported_types:
return
if ary.ndim == 1:
return signature(ary.dtype, ary, types.intp, ary.dtype)
elif ary.ndim > 1:
return signature(ary.dtype, ary, idx, ary.dtype)
return Cuda_atomic
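# Illustrative sketch (not part of the original source): the template above
# types calls such as cuda.atomic.add(ary, idx, val), indexing a 1-D array
# with an intp and an n-D array with a matching index tuple, e.g.
#
#     @cuda.jit
#     def histogram(bins, data):
#         i = cuda.grid(1)
#         if i < data.size:
#             cuda.atomic.add(bins, data[i], 1)   # data holds integer bins
#
# The return value (discarded here) is typed as the array dtype, i.e. the
# value held before the atomic update.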
all_numba_types = (types.float64, types.float32,
types.int32, types.uint32,
types.int64, types.uint64)
integer_numba_types = (types.int32, types.uint32,
types.int64, types.uint64)
unsigned_int_numba_types = (types.uint32, types.uint64)
Cuda_atomic_add = _gen(cuda.atomic.add, all_numba_types)
Cuda_atomic_sub = _gen(cuda.atomic.sub, all_numba_types)
Cuda_atomic_max = _gen(cuda.atomic.max, all_numba_types)
Cuda_atomic_min = _gen(cuda.atomic.min, all_numba_types)
Cuda_atomic_nanmax = _gen(cuda.atomic.nanmax, all_numba_types)
Cuda_atomic_nanmin = _gen(cuda.atomic.nanmin, all_numba_types)
Cuda_atomic_and = _gen(cuda.atomic.and_, integer_numba_types)
Cuda_atomic_or = _gen(cuda.atomic.or_, integer_numba_types)
Cuda_atomic_xor = _gen(cuda.atomic.xor, integer_numba_types)
Cuda_atomic_inc = _gen(cuda.atomic.inc, unsigned_int_numba_types)
Cuda_atomic_dec = _gen(cuda.atomic.dec, unsigned_int_numba_types)
Cuda_atomic_exch = _gen(cuda.atomic.exch, integer_numba_types)
@register
class Cuda_atomic_compare_and_swap(AbstractTemplate):
key = cuda.atomic.compare_and_swap
def generic(self, args, kws):
assert not kws
ary, old, val = args
dty = ary.dtype
if dty in integer_numba_types and ary.ndim == 1:
return signature(dty, ary, dty, dty)
@register
class Cuda_atomic_cas(AbstractTemplate):
key = cuda.atomic.cas
def generic(self, args, kws):
assert not kws
ary, idx, old, val = args
dty = ary.dtype
if dty not in integer_numba_types:
return
if ary.ndim == 1:
return signature(dty, ary, types.intp, dty, dty)
elif ary.ndim > 1:
return signature(dty, ary, idx, dty, dty)
@register
class Cuda_nanosleep(ConcreteTemplate):
key = cuda.nanosleep
cases = [signature(types.void, types.uint32)]
@register_attr
class Dim3_attrs(AttributeTemplate):
key = dim3
def resolve_x(self, mod):
return types.int32
def resolve_y(self, mod):
return types.int32
def resolve_z(self, mod):
return types.int32
@register_attr
class CudaSharedModuleTemplate(AttributeTemplate):
key = types.Module(cuda.shared)
def resolve_array(self, mod):
return types.Function(Cuda_shared_array)
@register_attr
class CudaConstModuleTemplate(AttributeTemplate):
key = types.Module(cuda.const)
def resolve_array_like(self, mod):
return types.Function(Cuda_const_array_like)
@register_attr
class CudaLocalModuleTemplate(AttributeTemplate):
key = types.Module(cuda.local)
def resolve_array(self, mod):
return types.Function(Cuda_local_array)
@register_attr
class CudaAtomicTemplate(AttributeTemplate):
key = types.Module(cuda.atomic)
def resolve_add(self, mod):
return types.Function(Cuda_atomic_add)
def resolve_sub(self, mod):
return types.Function(Cuda_atomic_sub)
def resolve_and_(self, mod):
return types.Function(Cuda_atomic_and)
def resolve_or_(self, mod):
return types.Function(Cuda_atomic_or)
def resolve_xor(self, mod):
return types.Function(Cuda_atomic_xor)
def resolve_inc(self, mod):
return types.Function(Cuda_atomic_inc)
def resolve_dec(self, mod):
return types.Function(Cuda_atomic_dec)
def resolve_exch(self, mod):
return types.Function(Cuda_atomic_exch)
def resolve_max(self, mod):
return types.Function(Cuda_atomic_max)
def resolve_min(self, mod):
return types.Function(Cuda_atomic_min)
def resolve_nanmin(self, mod):
return types.Function(Cuda_atomic_nanmin)
def resolve_nanmax(self, mod):
return types.Function(Cuda_atomic_nanmax)
def resolve_compare_and_swap(self, mod):
return types.Function(Cuda_atomic_compare_and_swap)
def resolve_cas(self, mod):
return types.Function(Cuda_atomic_cas)
@register_attr
class CudaFp16Template(AttributeTemplate):
key = types.Module(cuda.fp16)
def resolve_hadd(self, mod):
return types.Function(Cuda_hadd)
def resolve_hsub(self, mod):
return types.Function(Cuda_hsub)
def resolve_hmul(self, mod):
return types.Function(Cuda_hmul)
def resolve_hdiv(self, mod):
return hdiv_device
def resolve_hneg(self, mod):
return types.Function(Cuda_hneg)
def resolve_habs(self, mod):
return types.Function(Cuda_habs)
def resolve_hfma(self, mod):
return types.Function(Cuda_hfma)
def resolve_hsin(self, mod):
return hsin_device
def resolve_hcos(self, mod):
return hcos_device
def resolve_hlog(self, mod):
return hlog_device
def resolve_hlog10(self, mod):
return hlog10_device
def resolve_hlog2(self, mod):
return hlog2_device
def resolve_hexp(self, mod):
return hexp_device
def resolve_hexp10(self, mod):
return hexp10_device
def resolve_hexp2(self, mod):
return hexp2_device
def resolve_hfloor(self, mod):
return hfloor_device
def resolve_hceil(self, mod):
return hceil_device
def resolve_hsqrt(self, mod):
return hsqrt_device
def resolve_hrsqrt(self, mod):
return hrsqrt_device
def resolve_hrcp(self, mod):
return hrcp_device
def resolve_hrint(self, mod):
return hrint_device
def resolve_htrunc(self, mod):
return htrunc_device
def resolve_heq(self, mod):
return types.Function(Cuda_heq)
def resolve_hne(self, mod):
return types.Function(Cuda_hne)
def resolve_hge(self, mod):
return types.Function(Cuda_hge)
def resolve_hgt(self, mod):
return types.Function(Cuda_hgt)
def resolve_hle(self, mod):
return types.Function(Cuda_hle)
def resolve_hlt(self, mod):
return types.Function(Cuda_hlt)
def resolve_hmax(self, mod):
return types.Function(Cuda_hmax)
def resolve_hmin(self, mod):
return types.Function(Cuda_hmin)
@register_attr
class CudaModuleTemplate(AttributeTemplate):
key = types.Module(cuda)
def resolve_cg(self, mod):
return types.Module(cuda.cg)
def resolve_threadIdx(self, mod):
return dim3
def resolve_blockIdx(self, mod):
return dim3
def resolve_blockDim(self, mod):
return dim3
def resolve_gridDim(self, mod):
return dim3
def resolve_laneid(self, mod):
return types.int32
def resolve_shared(self, mod):
return types.Module(cuda.shared)
def resolve_popc(self, mod):
return types.Function(Cuda_popc)
def resolve_brev(self, mod):
return types.Function(Cuda_brev)
def resolve_clz(self, mod):
return types.Function(Cuda_clz)
def resolve_ffs(self, mod):
return types.Function(Cuda_ffs)
def resolve_fma(self, mod):
return types.Function(Cuda_fma)
def resolve_cbrt(self, mod):
return types.Function(Cuda_cbrt)
def resolve_threadfence(self, mod):
return types.Function(Cuda_threadfence_device)
def resolve_threadfence_block(self, mod):
return types.Function(Cuda_threadfence_block)
def resolve_threadfence_system(self, mod):
return types.Function(Cuda_threadfence_system)
def resolve_syncwarp(self, mod):
return types.Function(Cuda_syncwarp)
def resolve_shfl_sync_intrinsic(self, mod):
return types.Function(Cuda_shfl_sync_intrinsic)
def resolve_vote_sync_intrinsic(self, mod):
return types.Function(Cuda_vote_sync_intrinsic)
def resolve_match_any_sync(self, mod):
return types.Function(Cuda_match_any_sync)
def resolve_match_all_sync(self, mod):
return types.Function(Cuda_match_all_sync)
def resolve_activemask(self, mod):
return types.Function(Cuda_activemask)
def resolve_lanemask_lt(self, mod):
return types.Function(Cuda_lanemask_lt)
def resolve_selp(self, mod):
return types.Function(Cuda_selp)
def resolve_nanosleep(self, mod):
return types.Function(Cuda_nanosleep)
def resolve_atomic(self, mod):
return types.Module(cuda.atomic)
def resolve_fp16(self, mod):
return types.Module(cuda.fp16)
def resolve_const(self, mod):
return types.Module(cuda.const)
def resolve_local(self, mod):
return types.Module(cuda.local)
register_global(cuda, types.Module(cuda))
# NumPy
for func in trigonometric_functions:
register_numpy_ufunc(func, register_global)
for func in comparison_functions:
register_numpy_ufunc(func, register_global)
for func in bit_twiddling_functions:
register_numpy_ufunc(func, register_global)
for func in math_operations:
if func in ('log', 'log2', 'log10'):
register_numpy_ufunc(func, register_global)

View File

@@ -0,0 +1,9 @@
"""CUDA Driver
- Driver API binding
- NVVM API binding
- Device array implementation
"""
from numba.core import config
assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator'

View File

@@ -0,0 +1,904 @@
"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object. If it exists and evaluates to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
"""
import math
import functools
import operator
import copy
from ctypes import c_void_p
import numpy as np
import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices, dummyarray
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.np.numpy_support import numpy_version
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn
try:
lru_cache = getattr(functools, 'lru_cache')(None)
except AttributeError:
# Python 3.1 or lower
def lru_cache(func):
return func
def is_cuda_ndarray(obj):
"Check if an object is a CUDA ndarray"
return getattr(obj, '__cuda_ndarray__', False)
def verify_cuda_ndarray_interface(obj):
"Verify the CUDA ndarray interface for an obj"
require_cuda_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_cuda_ndarray(obj):
"Raises ValueError is is_cuda_ndarray(obj) evaluates False"
if not is_cuda_ndarray(obj):
raise ValueError('require an cuda ndarray object')
class DeviceNDArrayBase(_devicearray.DeviceArray):
"""A on GPU NDArray representation
"""
__cuda_memory__ = True
__cuda_ndarray__ = True # There must be gpu_data attribute
def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as np.dtype coercible object.
stream
cuda stream.
gpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
dtype = np.dtype(dtype)
self.ndim = len(shape)
if len(strides) != self.ndim:
raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = dtype
self.size = int(functools.reduce(operator.mul, self.shape, 1))
# prepare gpu memory
if self.size > 0:
if gpu_data is None:
self.alloc_size = _driver.memory_size_from_info(
self.shape, self.strides, self.dtype.itemsize)
gpu_data = devices.get_context().memalloc(self.alloc_size)
else:
self.alloc_size = _driver.device_memory_size(gpu_data)
else:
# Make NULL pointer for empty allocation
if _driver.USE_NV_BINDING:
null = _driver.binding.CUdeviceptr(0)
else:
null = c_void_p(0)
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
pointer=null, size=0)
self.alloc_size = 0
self.gpu_data = gpu_data
self.stream = stream
@property
def __cuda_array_interface__(self):
if _driver.USE_NV_BINDING:
if self.device_ctypes_pointer is not None:
ptr = int(self.device_ctypes_pointer)
else:
ptr = 0
else:
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0
return {
'shape': tuple(self.shape),
'strides': None if is_contiguous(self) else tuple(self.strides),
'data': (ptr, False),
'typestr': self.dtype.str,
'stream': int(self.stream) if self.stream != 0 else None,
'version': 3,
}
def bind(self, stream=0):
"""Bind a CUDA stream to this object so that all subsequent operation
on this array defaults to the given stream.
"""
clone = copy.copy(self)
clone.stream = stream
return clone
@property
def T(self):
return self.transpose()
def transpose(self, axes=None):
if axes and tuple(axes) == tuple(range(self.ndim)):
return self
elif self.ndim != 2:
msg = "transposing a non-2D DeviceNDArray isn't supported"
raise NotImplementedError(msg)
elif axes is not None and set(axes) != set(range(self.ndim)):
raise ValueError("invalid axes list %r" % (axes,))
else:
from numba.cuda.kernels.transpose import transpose
return transpose(self)
def _default_stream(self, stream):
return self.stream if not stream else stream
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
# Typing considerations:
#
# 1. The preference is to use 'C' or 'F' layout since this enables
# hardcoding stride values into compiled kernels, which is more
# efficient than storing a passed-in value in a register.
#
# 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
# the more likely / common case.
#
# 3. If an array is broadcast then it must be typed as 'A' - using 'C'
# or 'F' does not apply for broadcast arrays, because the strides, some
# of which will be 0, will not match those hardcoded in for 'C' or 'F'
# layouts.
broadcast = 0 in self.strides
if self.flags['C_CONTIGUOUS'] and not broadcast:
layout = 'C'
elif self.flags['F_CONTIGUOUS'] and not broadcast:
layout = 'F'
else:
layout = 'A'
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, layout)
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.gpu_data is None:
if _driver.USE_NV_BINDING:
return _driver.binding.CUdeviceptr(0)
else:
return c_void_p(0)
else:
return self.gpu_data.device_ctypes_pointer
@devices.require_context
def copy_to_device(self, ary, stream=0):
"""Copy `ary` to `self`.
If `ary` is CUDA memory, perform a device-to-device transfer.
Otherwise, perform a host-to-device transfer.
"""
if ary.size == 0:
# Nothing to do
return
sentry_contiguous(self)
stream = self._default_stream(stream)
self_core, ary_core = array_core(self), array_core(ary)
if _driver.is_device_memory(ary):
sentry_contiguous(ary)
check_array_compatibility(self_core, ary_core)
_driver.device_to_device(self, ary, self.alloc_size, stream=stream)
else:
# Ensure same contiguity. Only makes a host-side copy if necessary
# (i.e., in order to materialize a writable strided view)
ary_core = np.array(
ary_core,
order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
subok=True,
copy=(not ary_core.flags['WRITEABLE'])
if numpy_version < (2, 0) else None)
check_array_compatibility(self_core, ary_core)
_driver.host_to_device(self, ary_core, self.alloc_size,
stream=stream)
@devices.require_context
def copy_to_host(self, ary=None, stream=0):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
If a CUDA ``stream`` is given, then the transfer will be made
asynchronously as part of the given stream. Otherwise, the transfer is
synchronous: the function returns after the copy is finished.
Always returns the host array.
Example::
import numpy as np
from numba import cuda
arr = np.arange(1000)
d_arr = cuda.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if any(s < 0 for s in self.strides):
msg = 'D->H copy not implemented for negative strides: {}'
raise NotImplementedError(msg.format(self.strides))
assert self.alloc_size >= 0, "Negative memory size"
stream = self._default_stream(stream)
if ary is None:
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else:
check_array_compatibility(self, ary)
hostary = ary
if self.alloc_size != 0:
_driver.device_to_host(hostary, self, self.alloc_size,
stream=stream)
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
return hostary
def split(self, section, stream=0):
"""Split the array into equal partition of the `section` size.
If the array cannot be equally divided, the last section will be
smaller.
"""
stream = self._default_stream(stream)
if self.ndim != 1:
raise ValueError("only support 1d array")
if self.strides[0] != self.dtype.itemsize:
raise ValueError("only support unit stride")
nsect = int(math.ceil(float(self.size) / section))
strides = self.strides
itemsize = self.dtype.itemsize
for i in range(nsect):
begin = i * section
end = min(begin + section, self.size)
shape = (end - begin,)
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
gpu_data=gpu_data)
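# Illustrative sketch (not part of the original source): split returns views
# over the same allocation, e.g.
#
#     d_arr = cuda.to_device(np.arange(10))
#     parts = list(d_arr.split(4))   # three views of lengths 4, 4 and 2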
def as_cuda_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.gpu_data
def get_ipc_handle(self):
"""
Returns an *IpcArrayHandle* object that is safe to serialize and transfer
to another process to share the local allocation.
Note: this feature is only available on Linux.
"""
ipch = devices.get_context().get_ipc_handle(self.gpu_data)
desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
def squeeze(self, axis=None, stream=0):
"""
Remove axes of size one from the array shape.
Parameters
----------
axis : None or int or tuple of ints, optional
Subset of dimensions to remove. A `ValueError` is raised if an axis
with size greater than one is selected. If `None`, all axes with
size one are removed.
stream : cuda stream or 0, optional
Default stream for the returned view of the array.
Returns
-------
DeviceNDArray
Squeezed view into the array.
"""
new_dummy, _ = self._dummy.squeeze(axis=axis)
return DeviceNDArray(
shape=new_dummy.shape,
strides=new_dummy.strides,
dtype=self.dtype,
stream=self._default_stream(stream),
gpu_data=self.gpu_data,
)
def view(self, dtype):
"""Returns a new object by reinterpretting the dtype without making a
copy of the data.
"""
dtype = np.dtype(dtype)
shape = list(self.shape)
strides = list(self.strides)
if self.dtype.itemsize != dtype.itemsize:
if not self.is_c_contiguous():
raise ValueError(
"To change to a dtype of a different size,"
" the array must be C-contiguous"
)
shape[-1], rem = divmod(
shape[-1] * self.dtype.itemsize,
dtype.itemsize
)
if rem != 0:
raise ValueError(
"When changing to a larger dtype,"
" its size must be a divisor of the total size in bytes"
" of the last axis of the array."
)
strides[-1] = dtype.itemsize
return DeviceNDArray(
shape=shape,
strides=strides,
dtype=dtype,
stream=self.stream,
gpu_data=self.gpu_data,
)
@property
def nbytes(self):
# Note: not using `alloc_size`. `alloc_size` reports memory
# consumption of the allocation, not the size of the array
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
return self.dtype.itemsize * self.size
class DeviceRecord(DeviceNDArrayBase):
'''
An on-GPU record type
'''
def __init__(self, dtype, stream=0, gpu_data=None):
shape = ()
strides = ()
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
gpu_data)
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
return numpy_support.from_dtype(self.dtype)
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
typ, offset = self.dtype.fields[item]
newdata = self.gpu_data.view(offset)
if typ.shape == ():
if typ.names is not None:
return DeviceRecord(dtype=typ, stream=stream,
gpu_data=newdata)
else:
hostary = np.empty(1, dtype=typ)
_driver.device_to_host(dst=hostary, src=newdata,
size=typ.itemsize,
stream=stream)
return hostary[0]
else:
shape, strides, dtype = \
prepare_shape_strides_dtype(typ.shape,
None,
typ.subdtype[0], 'C')
return DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=newdata,
stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the record didn't have a default stream, and the user didn't
# provide a stream, then we will use the default stream for the
# assignment kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
typ, offset = self.dtype.fields[key]
newdata = self.gpu_data.view(offset)
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
# (2) prepare RHS
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
# (3) do the copy
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
if synchronous:
stream.synchronize()
@lru_cache
def _assign_kernel(ndim):
"""
A separate function so we don't need to compile code on every assignment.
:param ndim: We need static array sizes for cuda.local.array, so the
number of dimensions is baked into the kernel
"""
from numba import cuda # circular!
if ndim == 0:
# the (2, ndim) allocation below is not yet supported, so avoid it
@cuda.jit
def kernel(lhs, rhs):
lhs[()] = rhs[()]
return kernel
@cuda.jit
def kernel(lhs, rhs):
location = cuda.grid(1)
n_elements = 1
for i in range(lhs.ndim):
n_elements *= lhs.shape[i]
if location >= n_elements:
# bake n_elements into the kernel, better than passing it in
# as another argument.
return
# [0, :] is the to-index (into `lhs`)
# [1, :] is the from-index (into `rhs`)
idx = cuda.local.array(
shape=(2, ndim),
dtype=types.int64)
for i in range(ndim - 1, -1, -1):
idx[0, i] = location % lhs.shape[i]
idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
location //= lhs.shape[i]
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
return kernel
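# Worked example (added for illustration): for lhs.shape == (2, 3), thread
# location 4 decomposes right-to-left into idx[0] == (1, 1); if rhs was
# broadcast from shape (1, 3), the (rhs.shape[i] > 1) factor zeroes the
# first rhs index, giving idx[1] == (0, 1).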
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-GPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def __array__(self, dtype=None):
"""
:return: a `numpy.ndarray`; this copies the data to the host.
"""
if dtype:
return self.copy_to_host().__array__(dtype)
else:
return self.copy_to_host().__array__()
def __len__(self):
return self.shape[0]
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C', stream=0):
'''
Flattens a contiguous array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
exception.
'''
stream = self._default_stream(stream)
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data,
stream=stream)
else:
raise NotImplementedError("operation requires copying")
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
arr = self._dummy.__getitem__(item)
extents = list(arr.iter_contiguous_extent())
cls = type(self)
if len(extents) == 1:
newdata = self.gpu_data.view(*extents[0])
if not arr.is_array:
# Check for structured array type (record)
if self.dtype.names is not None:
return DeviceRecord(dtype=self.dtype, stream=stream,
gpu_data=newdata)
else:
# Element indexing
hostary = np.empty(1, dtype=self.dtype)
_driver.device_to_host(dst=hostary, src=newdata,
size=self._dummy.itemsize,
stream=stream)
return hostary[0]
else:
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
else:
newdata = self.gpu_data.view(*arr.extent)
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the array didn't have a default stream, and the user didn't provide
# a stream, then we will use the default stream for the assignment
# kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
arr = self._dummy.__getitem__(key)
newdata = self.gpu_data.view(*arr.extent)
if isinstance(arr, dummyarray.Element):
# convert to a 0d array
shape = ()
strides = ()
else:
shape = arr.shape
strides = arr.strides
lhs = type(self)(
shape=shape,
strides=strides,
dtype=self.dtype,
gpu_data=newdata,
stream=stream)
# (2) prepare RHS
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
if rhs.ndim > lhs.ndim:
raise ValueError("Can't assign %s-D array to %s-D self" % (
rhs.ndim,
lhs.ndim))
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
# negative indices would not work if rhs.ndim == 0
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
rhs = rhs.reshape(*rhs_shape)
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
if r != 1 and l != r:
raise ValueError("Can't copy sequence with size %d to array "
"axis %d with dimension %d" % ( r, i, l))
# (3) do the copy
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
if synchronous:
stream.synchronize()
class IpcArrayHandle(object):
"""
An IPC array handle that can be serialized and transferred to another
process on the same machine to share a GPU allocation.
On the destination process, use the *.open()* method to create a new
*DeviceNDArray* object that shares the allocation from the original process.
To release the resources, call the *.close()* method. After that, the
destination can no longer use the shared array object. (Note: the
underlying weakref to the resource is now dead.)
This object implements the context-manager interface that calls the
*.open()* and *.close()* method automatically::
with the_ipc_array_handle as ipc_array:
# use ipc_array here as a normal gpu array object
some_code(ipc_array)
# ipc_array is dead at this point
"""
def __init__(self, ipc_handle, array_desc):
self._array_desc = array_desc
self._ipc_handle = ipc_handle
def open(self):
"""
Returns a new *DeviceNDArray* that shares the allocation from the
original process. Must not be used on the original process.
"""
dptr = self._ipc_handle.open(devices.get_context())
return DeviceNDArray(gpu_data=dptr, **self._array_desc)
def close(self):
"""
Closes the IPC handle to the array.
"""
self._ipc_handle.close()
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA mapped memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA managed memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
def from_array_like(ary, stream=0, gpu_data=None):
"Create a DeviceNDArray object that is like ary."
return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
gpu_data=gpu_data)
def from_record_like(rec, stream=0, gpu_data=None):
"Create a DeviceRecord object that is like rec."
return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
def array_core(ary):
"""
Extract the repeated core of a broadcast array.
Broadcast arrays are by definition non-contiguous due to repeated
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
contiguity and copy the underlying data from such arrays, we must create
a view without the repeated dimensions.
"""
if not ary.strides or not ary.size:
return ary
core_index = []
for stride in ary.strides:
core_index.append(0 if stride == 0 else slice(None))
return ary[tuple(core_index)]
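# Worked example (added for illustration): an array broadcast to shape
# (4, 3) with strides (0, 8) has core index (0, slice(None)), so array_core
# returns the single repeated row of shape (3,).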
def is_contiguous(ary):
"""
Returns True iff `ary` is C-style contiguous while ignoring
broadcasted and 1-sized dimensions.
As opposed to array_core(), it does not call require_context(),
which can be quite expensive.
"""
size = ary.dtype.itemsize
for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
if shape > 1 and stride != 0:
if size != stride:
return False
size *= shape
return True
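# Worked example (added for illustration): for shape (3, 1, 4), itemsize 8
# and strides (32, 999, 8), the size-1 middle dimension is ignored, the last
# stride equals the itemsize and the first equals 8 * 4 == 32, so
# is_contiguous returns True.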
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def sentry_contiguous(ary):
core = array_core(ary)
if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, stream=0, copy=True, user_explicit=False):
"""
Create a DeviceRecord or DeviceArray like obj and optionally copy data from
host to device. If obj already represents device memory, it is returned and
no copy is made.
"""
if _driver.is_device_memory(obj):
return obj, False
elif hasattr(obj, '__cuda_array_interface__'):
return numba.cuda.as_cuda_array(obj), False
else:
if isinstance(obj, np.void):
devobj = from_record_like(obj, stream=stream)
else:
# This allows you to pass non-array objects like constants and
# objects implementing the array interface
# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
# into this function (with no overhead -- no copies -- for `obj`s
# that are already `ndarray`s).
obj = np.array(
obj,
copy=False if numpy_version < (2, 0) else None,
subok=True)
sentry_contiguous(obj)
devobj = from_array_like(obj, stream=stream)
if copy:
if config.CUDA_WARN_ON_IMPLICIT_COPY:
if (
not user_explicit and
(not isinstance(obj, DeviceNDArray)
and isinstance(obj, np.ndarray))
):
msg = ("Host array used in CUDA kernel will incur "
"copy overhead to/from device.")
warn(NumbaPerformanceWarning(msg))
devobj.copy_to_device(obj, stream=stream)
return devobj, True
def check_array_compatibility(ary1, ary2):
ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
if ary1.dtype != ary2.dtype:
raise TypeError('incompatible dtype: %s vs. %s' %
(ary1.dtype, ary2.dtype))
if ary1sq.shape != ary2sq.shape:
raise ValueError('incompatible shape: %s vs. %s' %
(ary1.shape, ary2.shape))
# We check strides only if the size is nonzero, because strides are
# irrelevant (and can differ) for zero-length copies.
if ary1.size and ary1sq.strides != ary2sq.strides:
raise ValueError('incompatible strides: %s vs. %s' %
(ary1.strides, ary2.strides))

View File

@@ -0,0 +1,248 @@
"""
Expose each GPU device directly.
This module implements an API that is like the "CUDA runtime" context manager
for managing the CUDA context stack and cleanup. It relies on thread-local globals
to separate the context stack management of each thread. Contexts are also
shareable among threads. Only the main thread can destroy Contexts.
Note:
- This module must be imported by the main thread.
"""
import functools
import threading
from contextlib import contextmanager
from .driver import driver, USE_NV_BINDING
class _DeviceList(object):
def __getattr__(self, attr):
# First time looking at "lst" attribute.
if attr == "lst":
# Device list is not initialized.
# Query all CUDA devices.
numdev = driver.get_device_count()
gpus = [_DeviceContextManager(driver.get_device(devid))
for devid in range(numdev)]
# Define "lst" to avoid re-initialization
self.lst = gpus
return gpus
# Other attributes
return super(_DeviceList, self).__getattr__(attr)
def __getitem__(self, devnum):
'''
Returns the context manager for device *devnum*.
'''
return self.lst[devnum]
def __str__(self):
return ', '.join([str(d) for d in self.lst])
def __iter__(self):
return iter(self.lst)
def __len__(self):
return len(self.lst)
@property
def current(self):
"""Returns the active device or None if there's no active device
"""
with driver.get_active_context() as ac:
devnum = ac.devnum
if devnum is not None:
return self[devnum]
class _DeviceContextManager(object):
"""
Provides a context manager for executing in the context of the chosen
device. The normal use of instances of this type is from
``numba.cuda.gpus``. For example, to execute on device 2::
with numba.cuda.gpus[2]:
d_a = numba.cuda.to_device(a)
to copy the array *a* onto device 2, referred to by *d_a*.
"""
def __init__(self, device):
self._device = device
def __getattr__(self, item):
return getattr(self._device, item)
def __enter__(self):
_runtime.get_or_create_context(self._device.id)
def __exit__(self, exc_type, exc_val, exc_tb):
# this will verify that we are popping the right device context.
self._device.get_primary_context().pop()
def __str__(self):
return "<Managed Device {self.id}>".format(self=self)
class _Runtime(object):
"""Emulate the CUDA runtime context management.
It owns all Devices and Contexts.
Keeps at most one Context per Device
"""
def __init__(self):
self.gpus = _DeviceList()
# For caching the attached CUDA Context
self._tls = threading.local()
# Remember the main thread
# Only the main thread can *actually* destroy
self._mainthread = threading.current_thread()
# Avoid mutation of runtime state in multithreaded programs
self._lock = threading.RLock()
@contextmanager
def ensure_context(self):
"""Ensure a CUDA context is available inside the context.
On entrance, queries the CUDA driver for an active CUDA context and
attaches it in TLS for subsequent calls so they do not need to query
the CUDA driver again. On exit, detach the CUDA context from the TLS.
This will allow us to pickup thirdparty activated CUDA context in
any top-level Numba CUDA API.
"""
with driver.get_active_context():
oldctx = self._get_attached_context()
newctx = self.get_or_create_context(None)
self._set_attached_context(newctx)
try:
yield
finally:
self._set_attached_context(oldctx)
def get_or_create_context(self, devnum):
"""Returns the primary context and push+create it if needed
for *devnum*. If *devnum* is None, use the active CUDA context (must
be primary) or create a new one with ``devnum=0``.
"""
if devnum is None:
attached_ctx = self._get_attached_context()
if attached_ctx is None:
return self._get_or_create_context_uncached(devnum)
else:
return attached_ctx
else:
if USE_NV_BINDING:
devnum = int(devnum)
return self._activate_context_for(devnum)
def _get_or_create_context_uncached(self, devnum):
"""See also ``get_or_create_context(devnum)``.
This version does not read the cache.
"""
with self._lock:
# Try to get the active context in the CUDA stack or
# activate GPU-0 with the primary context
with driver.get_active_context() as ac:
if not ac:
return self._activate_context_for(0)
else:
# Get primary context for the active device
ctx = self.gpus[ac.devnum].get_primary_context()
# Is active context the primary context?
if USE_NV_BINDING:
ctx_handle = int(ctx.handle)
ac_ctx_handle = int(ac.context_handle)
else:
ctx_handle = ctx.handle.value
ac_ctx_handle = ac.context_handle.value
if ctx_handle != ac_ctx_handle:
msg = ('Numba cannot operate on non-primary'
' CUDA context {:x}')
raise RuntimeError(msg.format(ac_ctx_handle))
# Ensure the context is ready
ctx.prepare_for_use()
return ctx
def _activate_context_for(self, devnum):
with self._lock:
gpu = self.gpus[devnum]
newctx = gpu.get_primary_context()
# Detect unexpected context switch
cached_ctx = self._get_attached_context()
if cached_ctx is not None and cached_ctx is not newctx:
raise RuntimeError('Cannot switch CUDA-context.')
newctx.push()
return newctx
def _get_attached_context(self):
return getattr(self._tls, 'attached_context', None)
def _set_attached_context(self, ctx):
self._tls.attached_context = ctx
def reset(self):
"""Clear all contexts in the thread. Destroy the context if and only
if we are in the main thread.
"""
# Pop all active contexts.
while driver.pop_active_context() is not None:
pass
# If it is the main thread
if threading.current_thread() == self._mainthread:
self._destroy_all_contexts()
def _destroy_all_contexts(self):
# Reset all devices
for gpu in self.gpus:
gpu.reset()
_runtime = _Runtime()
# ================================ PUBLIC API ================================
gpus = _runtime.gpus
def get_context(devnum=None):
"""Get the current device or use a device by device number, and
return the CUDA context.
"""
return _runtime.get_or_create_context(devnum)
def require_context(fn):
"""
A decorator that ensures a CUDA context is available when *fn* is executed.
Note: The function *fn* cannot switch CUDA-context.
"""
@functools.wraps(fn)
def _require_cuda_context(*args, **kws):
with _runtime.ensure_context():
return fn(*args, **kws)
return _require_cuda_context
def reset():
"""Reset the CUDA subsystem for the current thread.
In the main thread:
This removes all CUDA contexts. Only use this at shutdown or for
cleaning up between tests.
In non-main threads:
This clears the CUDA context stack only.
"""
_runtime.reset()
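# Illustrative sketch (not part of the original source): require_context
# typically guards top-level API entry points, e.g.
#
#     @require_context
#     def query_memory():
#         # a context is guaranteed to be active here
#         return get_context().get_memory_info()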

File diff suppressed because it is too large

View File

@@ -0,0 +1,394 @@
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)
from numba.cuda.cudadrv import _extras
cu_device = c_int
cu_device_attribute = c_int # enum
cu_context = c_void_p # an opaque handle
cu_module = c_void_p # an opaque handle
cu_jit_option = c_int # enum
cu_jit_input_type = c_int # enum
cu_function = c_void_p # an opaque handle
cu_device_ptr = c_size_t # defined as unsigned long long
cu_stream = c_void_p # an opaque handle
cu_event = c_void_p
cu_link_state = c_void_p
cu_function_attribute = c_int
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide
cu_uuid = (c_byte * 16) # Device UUID
cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)
cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)
# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
CU_STREAM_DEFAULT = 0
CU_STREAM_LEGACY = 1
CU_STREAM_PER_THREAD = 2
API_PROTOTYPES = {
# CUresult cuInit(unsigned int Flags);
'cuInit' : (c_int, c_uint),
# CUresult cuDriverGetVersion (int* driverVersion )
'cuDriverGetVersion': (c_int, POINTER(c_int)),
# CUresult cuDeviceGetCount(int *count);
'cuDeviceGetCount': (c_int, POINTER(c_int)),
# CUresult cuDeviceGet(CUdevice *device, int ordinal);
'cuDeviceGet': (c_int, POINTER(cu_device), c_int),
# CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),
# CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
# CUdevice dev);
'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
cu_device),
# CUresult cuDeviceComputeCapability(int *major, int *minor,
# CUdevice dev);
'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
cu_device),
# CUresult cuDevicePrimaryCtxGetState(
# CUdevice dev,
# unsigned int* flags,
# int* active)
'cuDevicePrimaryCtxGetState': (c_int,
cu_device, POINTER(c_uint), POINTER(c_int)),
# CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
'cuDevicePrimaryCtxRelease': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
'cuDevicePrimaryCtxReset': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),
# CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),
# CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
# CUdevice dev);
'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),
# CUresult cuCtxGetDevice ( CUdevice * device )
'cuCtxGetDevice': (c_int, POINTER(cu_device)),
# CUresult cuCtxGetCurrent (CUcontext *pctx);
'cuCtxGetCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxPushCurrent (CUcontext pctx);
'cuCtxPushCurrent': (c_int, cu_context),
# CUresult cuCtxPopCurrent (CUcontext *pctx);
'cuCtxPopCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxDestroy(CUcontext pctx);
'cuCtxDestroy': (c_int, cu_context),
# CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
# unsigned int numOptions,
# CUjit_option *options,
# void **optionValues);
'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult cuModuleUnload(CUmodule hmod);
'cuModuleUnload': (c_int, cu_module),
# CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
# const char *name);
'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),
# CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
# hmod, const char* name )
'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
cu_module, c_char_p),
# CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
# CUfunc_cache config);
'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),
# CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),
# CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
# unsigned int flags);
'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),
# CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
# size_t N, CUstream hStream);
'cuMemsetD8Async': (c_int,
cu_device_ptr, c_uint8, c_size_t, cu_stream),
# CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount);
'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),
# CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount, CUstream hStream);
'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount);
'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount);
'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemFree(CUdeviceptr dptr);
'cuMemFree': (c_int, cu_device_ptr),
# CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),
# CUresult cuStreamDestroy(CUstream hStream);
'cuStreamDestroy': (c_int, cu_stream),
# CUresult cuStreamSynchronize(CUstream hStream);
'cuStreamSynchronize': (c_int, cu_stream),
# CUresult cuStreamAddCallback(
# CUstream hStream,
# CUstreamCallback callback,
# void* userData,
# unsigned int flags)
'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
py_object, c_uint),
# CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams,
# void ** extra)
'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p), POINTER(c_void_p)),
# CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams)
'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p)),
# CUresult cuMemHostAlloc ( void ** pp,
# size_t bytesize,
# unsigned int Flags
# )
'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemFreeHost ( void * p )
'cuMemFreeHost': (c_int, c_void_p),
# CUresult cuMemHostRegister(void * p,
# size_t bytesize,
# unsigned int Flags)
'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemHostUnregister(void * p)
'cuMemHostUnregister': (c_int, c_void_p),
# CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
# void * p,
# unsigned int Flags)
'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
c_void_p, c_uint),
# CUresult cuMemGetInfo(size_t * free, size_t * total)
'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)),
# CUresult cuEventCreate ( CUevent * phEvent,
# unsigned int Flags )
'cuEventCreate': (c_int, POINTER(cu_event), c_uint),
# CUresult cuEventDestroy ( CUevent hEvent )
'cuEventDestroy': (c_int, cu_event),
# CUresult cuEventElapsedTime ( float * pMilliseconds,
# CUevent hStart,
# CUevent hEnd )
'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),
# CUresult cuEventQuery ( CUevent hEvent )
'cuEventQuery': (c_int, cu_event),
# CUresult cuEventRecord ( CUevent hEvent,
# CUstream hStream )
'cuEventRecord': (c_int, cu_event, cu_stream),
# CUresult cuEventSynchronize ( CUevent hEvent )
'cuEventSynchronize': (c_int, cu_event),
# CUresult cuStreamWaitEvent ( CUstream hStream,
# CUevent hEvent,
# unsigned int Flags )
'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),
# CUresult cuPointerGetAttribute (
# void *data,
# CUpointer_attribute attribute,
# CUdeviceptr ptr)
'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),
# CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
# size_t * psize,
# CUdeviceptr dptr
# )
'cuMemGetAddressRange': (c_int,
POINTER(cu_device_ptr),
POINTER(c_size_t),
cu_device_ptr),
# CUresult cuMemHostGetFlags ( unsigned int * pFlags,
# void * p )
'cuMemHostGetFlags': (c_int,
POINTER(c_uint),
c_void_p),
# CUresult cuCtxSynchronize ( void )
'cuCtxSynchronize' : (c_int,),
# CUresult
# cuLinkCreate(unsigned int numOptions, CUjit_option *options,
# void **optionValues, CUlinkState *stateOut);
'cuLinkCreate': (c_int,
c_uint, POINTER(cu_jit_option),
POINTER(c_void_p), POINTER(cu_link_state)),
# CUresult
# cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
# size_t size, const char *name, unsigned
# int numOptions, CUjit_option *options,
# void **optionValues);
'cuLinkAddData': (c_int,
cu_link_state, cu_jit_input_type, c_void_p,
c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
POINTER(c_void_p)),
# CUresult
# cuLinkAddFile(CUlinkState state, CUjitInputType type,
# const char *path, unsigned int numOptions,
# CUjit_option *options, void **optionValues);
'cuLinkAddFile': (c_int,
cu_link_state, cu_jit_input_type, c_char_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult CUDAAPI
# cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
'cuLinkComplete': (c_int,
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),
# CUresult CUDAAPI
# cuLinkDestroy(CUlinkState state)
'cuLinkDestroy': (c_int, cu_link_state),
# cuProfilerStart ( void )
'cuProfilerStart': (c_int,),
# cuProfilerStop ( void )
'cuProfilerStop': (c_int,),
# CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
# CUfunction hfunc )
'cuFuncGetAttribute': (c_int,
POINTER(c_int), cu_function_attribute, cu_function),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize);
'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
cu_function, c_size_t,
c_uint),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize,
# unsigned int flags);
'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
POINTER(c_int),
cu_function,
c_size_t, c_uint),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit);
'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
cu_function, cu_occupancy_b2d_size,
c_size_t, c_int),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit,
# unsigned int flags);
'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
POINTER(c_int), cu_function,
cu_occupancy_b2d_size,
c_size_t, c_int, c_uint),
# CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
'cuIpcGetMemHandle': (c_int,
POINTER(cu_ipc_mem_handle), cu_device_ptr),
# CUresult cuIpcOpenMemHandle(
# CUdeviceptr* pdptr,
# CUipcMemHandle handle,
# unsigned int Flags)
'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
c_uint),
# CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
'cuIpcCloseMemHandle': (c_int, cu_device_ptr),
# CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags)
'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),
# CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
# CUdevice dev, CUdevice peerDev )
'cuDeviceCanAccessPeer': (c_int,
POINTER(c_int), cu_device, cu_device),
# CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
}
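# Note (assumption, for illustration): each entry maps a driver symbol to a
# ctypes prototype laid out as (restype, *argtypes); a binding layer can
# apply them along these lines:
#
#     fn = getattr(libcuda, 'cuDriverGetVersion')
#     restype, *argtypes = API_PROTOTYPES['cuDriverGetVersion']
#     fn.restype, fn.argtypes = restype, list(argtypes)
#     version = c_int()
#     err = fn(byref(version))   # CUresult; 0 (CUDA_SUCCESS) on success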

View File

@@ -0,0 +1,452 @@
from collections import namedtuple
import itertools
import functools
import operator
import ctypes
import numpy as np
from numba import _helperlib
Extent = namedtuple("Extent", ["begin", "end"])
attempt_nocopy_reshape = ctypes.CFUNCTYPE(
ctypes.c_int,
ctypes.c_long, # nd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # dims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # strides
ctypes.c_long, # newnd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newdims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides
ctypes.c_long, # itemsize
ctypes.c_int, # is_f_order
)(_helperlib.c_helpers['attempt_nocopy_reshape'])
class Dim(object):
"""A single dimension of the array
Attributes
----------
start:
start offset
stop:
stop offset
size:
number of items
stride:
item stride
"""
__slots__ = 'start', 'stop', 'size', 'stride', 'single'
def __init__(self, start, stop, size, stride, single):
self.start = start
self.stop = stop
self.size = size
self.stride = stride
self.single = single
assert not single or size == 1
def __getitem__(self, item):
if isinstance(item, slice):
start, stop, step = item.indices(self.size)
stride = step * self.stride
start = self.start + start * abs(self.stride)
stop = self.start + stop * abs(self.stride)
if stride == 0:
size = 1
else:
size = _compute_size(start, stop, stride)
ret = Dim(
start=start,
stop=stop,
size=size,
stride=stride,
single=False
)
return ret
else:
sliced = self[item:item + 1] if item != -1 else self[-1:]
if sliced.size != 1:
raise IndexError
return Dim(
start=sliced.start,
stop=sliced.stop,
size=sliced.size,
stride=sliced.stride,
single=True,
)
def get_offset(self, idx):
return self.start + idx * self.stride
def __repr__(self):
strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)"
return strfmt % (self.start, self.stop, self.size, self.stride)
def normalize(self, base):
return Dim(start=self.start - base, stop=self.stop - base,
size=self.size, stride=self.stride, single=self.single)
def copy(self, start=None, stop=None, size=None, stride=None, single=None):
if start is None:
start = self.start
if stop is None:
stop = self.stop
if size is None:
size = self.size
if stride is None:
stride = self.stride
if single is None:
single = self.single
return Dim(start, stop, size, stride, single)
def is_contiguous(self, itemsize):
return self.stride == itemsize
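# Illustrative example of Dim slicing: stepping a 10-item Dim of stride 4
# by 2 doubles the stride:
#
#   >>> d = Dim(start=0, stop=40, size=10, stride=4, single=False)
#   >>> d[2:8:2]
#   Dim(start=8, stop=32, size=3, stride=8)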
def compute_index(indices, dims):
return sum(d.get_offset(i) for i, d in zip(indices, dims))
class Element(object):
is_array = False
def __init__(self, extent):
self.extent = extent
def iter_contiguous_extent(self):
yield self.extent
class Array(object):
"""A dummy numpy array-like object. Consider it an array without the
actual data, but offset from the base data pointer.
Attributes
----------
dims: tuple of Dim
describing each dimension of the array
ndim: int
        number of dimensions
shape: tuple of int
size of each dimension
strides: tuple of int
stride of each dimension
itemsize: int
itemsize
extent: (start, end)
start and end offset containing the memory region
"""
is_array = True
@classmethod
def from_desc(cls, offset, shape, strides, itemsize):
dims = []
for ashape, astride in zip(shape, strides):
dim = Dim(offset, offset + ashape * astride, ashape, astride,
single=False)
dims.append(dim)
offset = 0 # offset only applies to first dimension
return cls(dims, itemsize)
def __init__(self, dims, itemsize):
self.dims = tuple(dims)
self.ndim = len(self.dims)
self.shape = tuple(dim.size for dim in self.dims)
self.strides = tuple(dim.stride for dim in self.dims)
self.itemsize = itemsize
self.size = functools.reduce(operator.mul, self.shape, 1)
self.extent = self._compute_extent()
self.flags = self._compute_layout()
def _compute_layout(self):
# The logic here is based on that in _UpdateContiguousFlags from
# numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit
# 13661ac70).
# https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191
# Records have no dims, and we can treat them as contiguous
if not self.dims:
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# If this is a broadcast array then it is not contiguous
if any([dim.stride == 0 for dim in self.dims]):
return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False}
flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# Check C contiguity
sd = self.itemsize
for dim in reversed(self.dims):
if dim.size == 0:
# Contiguous by definition
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
if dim.size != 1:
if dim.stride != sd:
flags['C_CONTIGUOUS'] = False
sd *= dim.size
# Check F contiguity
sd = self.itemsize
for dim in self.dims:
if dim.size != 1:
if dim.stride != sd:
flags['F_CONTIGUOUS'] = False
return flags
sd *= dim.size
return flags
def _compute_extent(self):
firstidx = [0] * self.ndim
lastidx = [s - 1 for s in self.shape]
start = compute_index(firstidx, self.dims)
stop = compute_index(lastidx, self.dims) + self.itemsize
stop = max(stop, start) # ensure positive extent
return Extent(start, stop)
def __repr__(self):
return '<Array dims=%s itemsize=%s>' % (self.dims, self.itemsize)
def __getitem__(self, item):
if not isinstance(item, tuple):
item = [item]
else:
item = list(item)
nitem = len(item)
ndim = len(self.dims)
if nitem > ndim:
raise IndexError("%d extra indices given" % (nitem - ndim,))
# Add empty slices for missing indices
while len(item) < ndim:
item.append(slice(None, None))
dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)]
newshape = [d.size for d in dims if not d.single]
arr = Array(dims, self.itemsize)
if newshape:
return arr.reshape(*newshape)[0]
else:
return Element(arr.extent)
@property
def is_c_contig(self):
return self.flags['C_CONTIGUOUS']
@property
def is_f_contig(self):
return self.flags['F_CONTIGUOUS']
def iter_contiguous_extent(self):
""" Generates extents
"""
if self.is_c_contig or self.is_f_contig:
yield self.extent
else:
if self.dims[0].stride < self.dims[-1].stride:
innerdim = self.dims[0]
outerdims = self.dims[1:]
outershape = self.shape[1:]
else:
innerdim = self.dims[-1]
outerdims = self.dims[:-1]
outershape = self.shape[:-1]
if innerdim.is_contiguous(self.itemsize):
oslen = [range(s) for s in outershape]
for indices in itertools.product(*oslen):
base = compute_index(indices, outerdims)
yield base + innerdim.start, base + innerdim.stop
else:
oslen = [range(s) for s in self.shape]
for indices in itertools.product(*oslen):
offset = compute_index(indices, self.dims)
yield offset, offset + self.itemsize
def reshape(self, *newdims, **kws):
oldnd = self.ndim
newnd = len(newdims)
if newdims == self.shape:
return self, None
order = kws.pop('order', 'C')
if kws:
raise TypeError('unknown keyword arguments %s' % kws.keys())
if order not in 'CFA':
raise ValueError('order not C|F|A')
# check for exactly one instance of -1 in newdims
# https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501
unknownidx = -1
knownsize = 1
for i, dim in enumerate(newdims):
if dim < 0:
if unknownidx == -1:
unknownidx = i
else:
raise ValueError("can only specify one unknown dimension")
else:
knownsize *= dim
# compute the missing dimension
if unknownidx >= 0:
if knownsize == 0 or self.size % knownsize != 0:
raise ValueError("cannot infer valid shape "
"for unknown dimension")
else:
newdims = newdims[0:unknownidx] \
+ (self.size // knownsize,) \
+ newdims[unknownidx + 1:]
newsize = functools.reduce(operator.mul, newdims, 1)
if order == 'A':
order = 'F' if self.is_f_contig else 'C'
if newsize != self.size:
raise ValueError("reshape changes the size of the array")
if self.is_c_contig or self.is_f_contig:
if order == 'C':
newstrides = list(iter_strides_c_contig(self, newdims))
elif order == 'F':
newstrides = list(iter_strides_f_contig(self, newdims))
else:
raise AssertionError("unreachable")
else:
newstrides = np.empty(newnd, np.ctypeslib.c_intp)
# need to keep these around in variables, not temporaries, so they
# don't get GC'ed before we call into the C code
olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp)
oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp)
newdims = np.array(newdims, dtype=np.ctypeslib.c_intp)
if not attempt_nocopy_reshape(
oldnd,
olddims,
oldstrides,
newnd,
newdims,
newstrides,
self.itemsize,
order == 'F',
):
raise NotImplementedError('reshape would require copy')
ret = self.from_desc(self.extent.begin, shape=newdims,
strides=newstrides, itemsize=self.itemsize)
return ret, list(self.iter_contiguous_extent())
def squeeze(self, axis=None):
newshape, newstrides = [], []
if axis is None:
for length, stride in zip(self.shape, self.strides):
if length != 1:
newshape.append(length)
newstrides.append(stride)
else:
if not isinstance(axis, tuple):
axis = (axis,)
for ax in axis:
if self.shape[ax] != 1:
raise ValueError(
"cannot select an axis to squeeze out which has size "
"not equal to one"
)
for i, (length, stride) in enumerate(zip(self.shape, self.strides)):
if i not in axis:
newshape.append(length)
newstrides.append(stride)
newarr = self.from_desc(
self.extent.begin,
shape=newshape,
strides=newstrides,
itemsize=self.itemsize,
)
return newarr, list(self.iter_contiguous_extent())
def ravel(self, order='C'):
if order not in 'CFA':
raise ValueError('order not C|F|A')
if (order in 'CA' and self.is_c_contig
or order in 'FA' and self.is_f_contig):
newshape = (self.size,)
newstrides = (self.itemsize,)
arr = self.from_desc(self.extent.begin, newshape, newstrides,
self.itemsize)
return arr, list(self.iter_contiguous_extent())
else:
raise NotImplementedError("ravel on non-contiguous array")
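# Illustrative example: a 3x4 C-contiguous layout with itemsize 8 occupies
# bytes [0, 96) and is C- but not F-contiguous:
#
#   >>> a = Array.from_desc(0, shape=(3, 4), strides=(32, 8), itemsize=8)
#   >>> a.is_c_contig, a.is_f_contig
#   (True, False)
#   >>> a.extent
#   Extent(begin=0, end=96)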
def iter_strides_f_contig(arr, shape=None):
    """yields the f-contiguous strides
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    yield itemsize
    accum = 1  # running product of dimension sizes; avoids shadowing sum()
    for s in shape[:-1]:
        accum *= s
        yield accum * itemsize
def iter_strides_c_contig(arr, shape=None):
    """yields the c-contiguous strides
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    def gen():
        yield itemsize
        accum = 1  # running product of dimension sizes; avoids shadowing sum()
        for s in reversed(shape[1:]):
            accum *= s
            yield accum * itemsize
    for i in reversed(list(gen())):
        yield i
def is_element_indexing(item, ndim):
if isinstance(item, slice):
return False
elif isinstance(item, tuple):
if len(item) == ndim:
if not any(isinstance(it, slice) for it in item):
return True
else:
return True
return False
def _compute_size(start, stop, step):
"""Algorithm adapted from cpython rangeobject.c
"""
if step > 0:
lo = start
hi = stop
else:
lo = stop
hi = start
step = -step
if lo >= hi:
return 0
return (hi - lo - 1) // step + 1
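# Illustrative example: range(0, 10, 3) covers 0, 3, 6 and 9, so
# _compute_size(0, 10, 3) == 4; a negative step mirrors the bounds, so
# _compute_size(10, 0, -3) == 4 as well.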

View File

@@ -0,0 +1,607 @@
"""
Enum values for CUDA driver. Information about the values
can be found on the official NVIDIA documentation website.
ref: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
anchor: #group__CUDA__TYPES
"""
# Error codes
CUDA_SUCCESS = 0
CUDA_ERROR_INVALID_VALUE = 1
CUDA_ERROR_OUT_OF_MEMORY = 2
CUDA_ERROR_NOT_INITIALIZED = 3
CUDA_ERROR_DEINITIALIZED = 4
CUDA_ERROR_PROFILER_DISABLED = 5
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
CUDA_ERROR_STUB_LIBRARY = 34
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
CUDA_ERROR_NO_DEVICE = 100
CUDA_ERROR_INVALID_DEVICE = 101
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
CUDA_ERROR_INVALID_IMAGE = 200
CUDA_ERROR_INVALID_CONTEXT = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
CUDA_ERROR_MAP_FAILED = 205
CUDA_ERROR_UNMAP_FAILED = 206
CUDA_ERROR_ARRAY_IS_MAPPED = 207
CUDA_ERROR_ALREADY_MAPPED = 208
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
CUDA_ERROR_ALREADY_ACQUIRED = 210
CUDA_ERROR_NOT_MAPPED = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
CUDA_ERROR_ECC_UNCORRECTABLE = 214
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
CUDA_ERROR_INVALID_PTX = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225
CUDA_ERROR_INVALID_SOURCE = 300
CUDA_ERROR_FILE_NOT_FOUND = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
CUDA_ERROR_OPERATING_SYSTEM = 304
CUDA_ERROR_INVALID_HANDLE = 400
CUDA_ERROR_ILLEGAL_STATE = 401
CUDA_ERROR_NOT_FOUND = 500
CUDA_ERROR_NOT_READY = 600
CUDA_ERROR_ILLEGAL_ADDRESS = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
CUDA_ERROR_LAUNCH_TIMEOUT = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
CUDA_ERROR_ASSERT = 710
CUDA_ERROR_TOO_MANY_PEERS = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
CUDA_ERROR_MISALIGNED_ADDRESS = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
CUDA_ERROR_INVALID_PC = 718
CUDA_ERROR_LAUNCH_FAILED = 719
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
CUDA_ERROR_NOT_PERMITTED = 800
CUDA_ERROR_NOT_SUPPORTED = 801
CUDA_ERROR_SYSTEM_NOT_READY = 802
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
CUDA_ERROR_MPS_RPC_FAILURE = 806
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
CUDA_ERROR_CDP_NOT_SUPPORTED = 811
CUDA_ERROR_CDP_VERSION_MISMATCH = 812
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
CUDA_ERROR_CAPTURED_EVENT = 907
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
CUDA_ERROR_TIMEOUT = 909
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
CUDA_ERROR_EXTERNAL_DEVICE = 911
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
CUDA_ERROR_UNKNOWN = 999
# Function cache configurations
# no preference for shared memory or L1 (default)
CU_FUNC_CACHE_PREFER_NONE = 0x00
# prefer larger shared memory and smaller L1 cache
CU_FUNC_CACHE_PREFER_SHARED = 0x01
# prefer larger L1 cache and smaller shared memory
CU_FUNC_CACHE_PREFER_L1 = 0x02
# prefer equal sized L1 cache and shared memory
CU_FUNC_CACHE_PREFER_EQUAL = 0x03
# Context creation flags
# Automatic scheduling
CU_CTX_SCHED_AUTO = 0x00
# Set spin as default scheduling
CU_CTX_SCHED_SPIN = 0x01
# Set yield as default scheduling
CU_CTX_SCHED_YIELD = 0x02
# Set blocking synchronization as default scheduling
CU_CTX_SCHED_BLOCKING_SYNC = 0x04
CU_CTX_SCHED_MASK = 0x07
# Support mapped pinned allocations
# This flag was deprecated as of CUDA 11.0 and no longer has any effect.
# All contexts as of CUDA 3.2 behave as though the flag is enabled.
CU_CTX_MAP_HOST = 0x08
# Keep local memory allocation after launch
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10
# Trigger coredumps from exceptions in this context
CU_CTX_COREDUMP_ENABLE = 0x20
# Enable user pipe to trigger coredumps in this context
CU_CTX_USER_COREDUMP_ENABLE = 0x40
# Force synchronous blocking on cudaMemcpy/cudaMemset
CU_CTX_SYNC_MEMOPS = 0x80
CU_CTX_FLAGS_MASK = 0xff
# DEFINES
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_DEVICEMAP = 0x02
# If set, host memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 streaming load instruction
# (MOVNTDQA).
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_WRITECOMBINED = 0x04
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_DEVICEMAP = 0x02
# If set, the passed memory pointer is treated as pointing to some
# memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
# On Windows the flag is a no-op. On Linux that memory is marked
# as non cache-coherent for the GPU and is expected
# to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED
# if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older
# Linux kernel versions. On all other platforms, it is not supported
# and CUDA_ERROR_NOT_SUPPORTED is returned.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_IOMEMORY = 0x04
# If set, the passed memory pointer is treated as pointing to memory
# that is considered read-only by the device. On platforms without
# CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
# this flag is required in order to register memory mapped
# to the CPU as read-only. Support for the use of this flag can be
# queried from the device attribute
# CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.
# Using this flag with a current context associated with a device
# that does not have this attribute set will cause cuMemHostRegister
# to error with CUDA_ERROR_NOT_SUPPORTED.
CU_MEMHOSTREGISTER_READ_ONLY = 0x08
# CUDA Mem Attach Flags
# If set, managed memory is accessible from all streams on all devices.
CU_MEM_ATTACH_GLOBAL = 0x01
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory is
# only accessible on the host (unless explicitly attached to a stream
# with cudaStreamAttachMemAsync, in which case it can be used in kernels
# launched on that stream).
CU_MEM_ATTACH_HOST = 0x02
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory accesses
# on the associated device must only be from a single stream.
CU_MEM_ATTACH_SINGLE = 0x04
# Event creation flags
# Default event flag
CU_EVENT_DEFAULT = 0x0
# Event uses blocking synchronization
CU_EVENT_BLOCKING_SYNC = 0x1
# Event will not record timing data
CU_EVENT_DISABLE_TIMING = 0x2
# Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
CU_EVENT_INTERPROCESS = 0x4
# Pointer information
# The CUcontext on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_CONTEXT = 1
# The CUmemorytype describing the physical location of a pointer
CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2
# The address at which a pointer's memory may be accessed on the device
CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3
# The address at which a pointer's memory may be accessed on the host
CU_POINTER_ATTRIBUTE_HOST_POINTER = 4
# A pair of tokens for use with the nv-p2p.h Linux kernel interface
CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5
# Synchronize every synchronous memory operation initiated on this region
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6
# A process-wide unique ID for an allocated memory region
CU_POINTER_ATTRIBUTE_BUFFER_ID = 7
# Indicates if the pointer points to managed memory
CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
# A device ordinal of a device on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
# 1 if this pointer maps to an allocation
# that is suitable for cudaIpcGetMemHandle, 0 otherwise
CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10
# Starting address for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11
# Size of the address range for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12
# 1 if this pointer is in a valid address range
# that is mapped to a backing allocation, 0 otherwise
CU_POINTER_ATTRIBUTE_MAPPED = 13
# Bitmask of allowed CUmemAllocationHandleType for this allocation
CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14
# 1 if the memory this pointer is referencing
# can be used with the GPUDirect RDMA API
CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15
# Returns the access flags the device associated
# with the current context has on the corresponding
# memory referenced by the pointer given
CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16
# Returns the mempool handle for the allocation
# if it was allocated from a mempool. Otherwise returns NULL
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17
# Size of the actual underlying mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18
# The start address of the mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19
# A process-wide unique id corresponding to the
# physical allocation the pointer belongs to
CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20
# Memory types
# Host memory
CU_MEMORYTYPE_HOST = 0x01
# Device memory
CU_MEMORYTYPE_DEVICE = 0x02
# Array memory
CU_MEMORYTYPE_ARRAY = 0x03
# Unified device or host memory
CU_MEMORYTYPE_UNIFIED = 0x04
# Device code formats
# Compiled device-class-specific device code
# Applicable options: none
CU_JIT_INPUT_CUBIN = 0
# PTX source code
# Applicable options: PTX compiler options
CU_JIT_INPUT_PTX = 1
# Bundle of multiple cubins and/or PTX of some device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_FATBINARY = 2
# Host object with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_OBJECT = 3
# Archive of host objects with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_LIBRARY = 4
CU_JIT_NUM_INPUT_TYPES = 6
# Online compiler and linker options
# Max number of registers that a thread may use.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_MAX_REGISTERS = 0
# IN: Specifies minimum number of threads per block to target compilation
# for
# OUT: Returns the number of threads the compiler actually targeted.
# This restricts the resource utilization of the compiler (e.g. max
# registers) such that a block with the given number of threads should be
# able to launch based on register limitations. Note, this option does not
# currently take into account any other resource limitations, such as
# shared memory utilization.
# Cannot be combined with ::CU_JIT_TARGET.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_THREADS_PER_BLOCK = 1
# Overwrites the option value with the total wall clock time, in
# milliseconds, spent in the compiler and linker
# Option type: float
# Applies to: compiler and linker
CU_JIT_WALL_TIME = 2
# Pointer to a buffer in which to print any log messages
# that are informational in nature (the buffer size is specified via
# option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER = 3
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4
# Pointer to a buffer in which to print any log messages that
# reflect errors (the buffer size is specified via option
# ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER = 5
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6
# Level of optimizations to apply to generated code (0 - 4), with 4
# being the default and highest level of optimizations.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_OPTIMIZATION_LEVEL = 7
# No option value required. Determines the target based on the current
# attached context (default)
# Option type: No option value needed
# Applies to: compiler and linker
CU_JIT_TARGET_FROM_CUCONTEXT = 8
# Target is chosen based on supplied ::CUjit_target. Cannot be
# combined with ::CU_JIT_THREADS_PER_BLOCK.
# Option type: unsigned int for enumerated type ::CUjit_target
# Applies to: compiler and linker
CU_JIT_TARGET = 9
# Specifies choice of fallback strategy if matching cubin is not found.
# Choice is based on supplied ::CUjit_fallback.
# Option type: unsigned int for enumerated type ::CUjit_fallback
# Applies to: compiler only
CU_JIT_FALLBACK_STRATEGY = 10
# Specifies whether to create debug information in output (-g)
# (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_GENERATE_DEBUG_INFO = 11
# Generate verbose log messages (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_LOG_VERBOSE = 12
# Generate line number information (-lineinfo) (0: false, default)
# Option type: int
# Applies to: compiler only
CU_JIT_GENERATE_LINE_INFO = 13
# Specifies whether to enable caching explicitly (-dlcm)
# Choice is based on supplied ::CUjit_cacheMode_enum.
# Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum
# Applies to: compiler only
CU_JIT_CACHE_MODE = 14
# CUfunction_attribute
# The maximum number of threads per block, beyond which a launch of the
# function would fail. This number depends on both the function and the
# device on which the function is currently loaded.
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0
# The size in bytes of statically-allocated shared memory required by
# this function. This does not include dynamically-allocated shared
# memory requested by the user at runtime.
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1
# The size in bytes of user-allocated constant memory required by this
# function.
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2
# The size in bytes of local memory used by each thread of this function.
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3
# The number of registers used by each thread of this function.
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
# The PTX virtual architecture version for which the function was
# compiled. This value is the major PTX version * 10 + the minor PTX
# version, so a PTX version 1.3 function would return the value 13.
# Note that this may return the undefined value of 0 for cubins
# compiled prior to CUDA 3.0.
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5
# The binary architecture version for which the function was compiled.
# This value is the major binary version * 10 + the minor binary version,
# so a binary version 1.3 function would return the value 13. Note that
# this will return a value of 10 for legacy cubins that do not have a
# properly-encoded binary architecture version.
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6
# The attribute to indicate whether the function has been compiled
# with user specified option "-Xptxas --dlcm=ca" set
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7
# The maximum size in bytes of dynamically-allocated shared memory
# that can be used by this function. If the user-specified
# dynamic shared memory size is larger than this value,
# the launch will fail. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8
# On devices where the L1 cache and shared memory use the same
# hardware resources, this sets the shared memory carveout preference,
# in percent of the total shared memory. Refer to
# CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
# This is only a hint, and the driver can choose a different ratio
# if required to execute the function.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9
# If this attribute is set, the kernel must launch with a valid cluster
# size specified. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10
# The required cluster width in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11
# The required cluster height in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12
# The required cluster depth in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13
# Whether the function can be launched with non-portable cluster size.
# 1 is allowed, 0 is disallowed. A non-portable cluster size may only
# function on the specific SKUs the program is tested on.
# The launch might fail if the program is run on a different hardware platform.
# For more details, see:
# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES
CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14
# The block scheduling policy of a function.
# The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15
# Device attributes
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_WIDTH = 21
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_WIDTH = 22
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_HEIGHT = 23
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH = 24
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT = 25
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH = 26
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_WIDTH = 27
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_HEIGHT = 28
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_LAYERS = 29
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTI_PROCESSOR = 39
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_WIDTH = 42
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_LAYERS = 43
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_WIDTH = 45
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_HEIGHT = 46
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH_ALT = 47
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT_ALT = 48
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH_ALT = 49
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_WIDTH = 52
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_WIDTH = 53
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_LAYERS = 54
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_WIDTH = 55
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_WIDTH = 56
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_HEIGHT = 57
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_WIDTH = 58
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_HEIGHT = 59
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_DEPTH = 60
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_WIDTH = 61
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_LAYERS = 62
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_WIDTH = 63
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_HEIGHT = 64
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_LAYERS = 65
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_WIDTH = 66
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_WIDTH = 67
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_LAYERS = 68
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LINEAR_WIDTH = 69
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_WIDTH = 70
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_HEIGHT = 71
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_PITCH = 72
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_WIDTH = 73
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_HEIGHT = 74
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_MIPMAPPED_WIDTH = 77
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83
CU_DEVICE_ATTRIBUTE_IS_MULTI_GPU_BOARD = 84
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97

View File

@@ -0,0 +1,36 @@
class CudaDriverError(Exception):
pass
class CudaRuntimeError(Exception):
pass
class CudaSupportError(ImportError):
pass
class NvvmError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvvmSupportError(ImportError):
pass
class NvvmWarning(Warning):
pass
class NvrtcError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvrtcCompilationError(NvrtcError):
pass
class NvrtcSupportError(ImportError):
pass

View File

@@ -0,0 +1,176 @@
"""CUDA Toolkit libraries lookup utilities.
CUDA Toolkit libraries can be available via either:
- the `cuda-nvcc` and `cuda-nvrtc` conda packages for CUDA 12,
- the `cudatoolkit` conda package for CUDA 11,
- a user supplied location from CUDA_HOME,
- a system-wide location,
- package-specific locations (e.g. the Debian NVIDIA packages),
- or can be discovered by the system loader.
"""
import os
import sys
import ctypes
from numba.misc.findlib import find_lib
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
from numba.cuda.cudadrv.error import CudaSupportError
if sys.platform == 'win32':
_dllnamepattern = '%s.dll'
_staticnamepattern = '%s.lib'
elif sys.platform == 'darwin':
_dllnamepattern = 'lib%s.dylib'
_staticnamepattern = 'lib%s.a'
else:
_dllnamepattern = 'lib%s.so'
_staticnamepattern = 'lib%s.a'
def get_libdevice():
d = get_cuda_paths()
paths = d['libdevice'].info
return paths
def open_libdevice():
with open(get_libdevice(), 'rb') as bcfile:
return bcfile.read()
def get_cudalib(lib, static=False):
"""
Find the path of a CUDA library based on a search of known locations. If
the search fails, return a generic filename for the library (e.g.
'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system
loader's search mechanism.
"""
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm'
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
libdir = get_cuda_paths()[dir_type].info
candidates = find_lib(lib, libdir, static=static)
namepattern = _staticnamepattern if static else _dllnamepattern
return max(candidates) if candidates else namepattern % lib
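# Illustrative usage (the path shown is an assumption, not a guarantee):
#
#   >>> get_cudalib('nvrtc')                        # doctest: +SKIP
#   '/usr/local/cuda/lib64/libnvrtc.so'
#
# If the search fails, a bare name such as 'libnvrtc.so' is returned so that
# open_cudalib() can still defer to the system loader.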
def open_cudalib(lib):
path = get_cudalib(lib)
return ctypes.CDLL(path)
def check_static_lib(path):
if not os.path.isfile(path):
raise FileNotFoundError(f'{path} not found')
def _get_source_variable(lib, static=False):
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].by
elif lib == 'libdevice':
return get_cuda_paths()['libdevice'].by
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
return get_cuda_paths()[dir_type].by
def test():
"""Test library lookup. Path info is printed to stdout.
"""
failed = False
# Check for the driver
try:
dlloader, candidates = locate_driver_and_loader()
print('Finding driver from candidates:')
for location in candidates:
print(f'\t{location}')
print(f'Using loader {dlloader}')
print('\tTrying to load driver', end='...')
dll, path = load_driver(dlloader, candidates)
print('\tok')
print(f'\t\tLoaded from {path}')
except CudaSupportError as e:
print(f'\tERROR: failed to open driver: {e}')
failed = True
# Find the absolute location of the driver on Linux. Various driver-related
# issues have been reported by WSL2 users, and it is almost always due to a
# Linux (i.e. non-WSL2) driver being installed in a WSL2 system.
# Providing the absolute location of the driver indicates its version
# number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to
# look up whether the driver was intended for "native" Linux.
if sys.platform == 'linux' and not failed:
pid = os.getpid()
mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps')
try:
with open(mapsfile) as f:
maps = f.read()
        # It's difficult to predict everything that might go wrong when
        # reading the maps file, so we catch OSError in the hope of covering
        # the various error conditions (file not found, not readable, etc.).
except OSError:
# It's helpful to report that this went wrong to the user, but we
# don't set failed to True because this doesn't have any connection
# to actual CUDA functionality.
print(f'\tERROR: Could not open {mapsfile} to determine absolute '
'path to libcuda.so')
else:
# In this case we could read the maps, so we can report the
# relevant ones to the user
locations = set(s for s in maps.split() if 'libcuda.so' in s)
print('\tMapped libcuda.so paths:')
for location in locations:
print(f'\t\t{location}')
# Checks for dynamic libraries
libs = 'nvvm nvrtc cudart'.split()
for lib in libs:
path = get_cudalib(lib)
print('Finding {} from {}'.format(lib, _get_source_variable(lib)))
print('\tLocated at', path)
try:
print('\tTrying to open library', end='...')
open_cudalib(lib)
print('\tok')
except OSError as e:
print('\tERROR: failed to open %s:\n%s' % (lib, e))
failed = True
# Check for cudadevrt (the only static library)
lib = 'cudadevrt'
path = get_cudalib(lib, static=True)
print('Finding {} from {}'.format(lib, _get_source_variable(lib,
static=True)))
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
print('\tERROR: failed to find %s:\n%s' % (lib, e))
failed = True
# Check for libdevice
where = _get_source_variable('libdevice')
print(f'Finding libdevice from {where}')
path = get_libdevice()
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
        print('\tERROR: failed to find libdevice:\n%s' % (e,))
failed = True
return not failed

View File

@@ -0,0 +1,20 @@
from numba.cuda.cudadrv import devices, driver
from numba.core.registry import cpu_target
def _calc_array_sizeof(ndim):
"""
Use the ABI size in the CPU target
"""
ctx = cpu_target.target_context
return ctx.calc_array_sizeof(ndim)
def ndarray_device_allocate_data(ary):
"""
Allocate gpu data buffer
"""
datasize = driver.host_memory_size(ary)
# allocate
gpu_data = devices.get_context().memalloc(datasize)
return gpu_data

View File

@@ -0,0 +1,260 @@
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
from enum import IntEnum
from numba.core import config
from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
NvrtcSupportError)
import functools
import os
import threading
import warnings
# Opaque handle for compilation unit
nvrtc_program = c_void_p
# Result code
nvrtc_result = c_int
class NvrtcResult(IntEnum):
NVRTC_SUCCESS = 0
NVRTC_ERROR_OUT_OF_MEMORY = 1
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
NVRTC_ERROR_INVALID_INPUT = 3
NVRTC_ERROR_INVALID_PROGRAM = 4
NVRTC_ERROR_INVALID_OPTION = 5
NVRTC_ERROR_COMPILATION = 6
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
NVRTC_ERROR_INTERNAL_ERROR = 11
_nvrtc_lock = threading.Lock()
class NvrtcProgram:
"""
A class for managing the lifetime of nvrtcProgram instances. Instances of
the class own an nvrtcProgram; when an instance is deleted, the underlying
nvrtcProgram is destroyed using the appropriate NVRTC API.
"""
def __init__(self, nvrtc, handle):
self._nvrtc = nvrtc
self._handle = handle
@property
def handle(self):
return self._handle
def __del__(self):
if self._handle:
self._nvrtc.destroy_program(self)
class NVRTC:
"""
Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
calls.
The sole instance of this class is a process-wide singleton, similar to the
NVVM interface. Initialization is protected by a lock and uses the standard
(for Numba) open_cudalib function to load the NVRTC library.
"""
_PROTOTYPES = {
# nvrtcResult nvrtcVersion(int *major, int *minor)
'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
# nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
# const char *src,
# const char *name,
# int numHeaders,
# const char * const *headers,
# const char * const *includeNames)
'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p,
c_int, POINTER(c_char_p), POINTER(c_char_p)),
# nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)),
# nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
# int numOptions,
# const char * const *options)
'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int,
POINTER(c_char_p)),
# nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
# size_t *cubinSizeRet);
'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
# size_t *logSizeRet);
'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program,
POINTER(c_size_t)),
# nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p),
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvrtc_lock:
if cls.__INSTANCE is None:
from numba.cuda.cudadrv.libs import open_cudalib
cls.__INSTANCE = inst = object.__new__(cls)
try:
lib = open_cudalib('nvrtc')
except OSError as e:
cls.__INSTANCE = None
raise NvrtcSupportError("NVRTC cannot be loaded") from e
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(lib, name)
func.restype = proto[0]
func.argtypes = proto[1:]
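                    # Bind func and name as default arguments so that each
                    # generated wrapper calls the function resolved on this
                    # loop iteration, not the one bound last by the closure.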
@functools.wraps(func)
def checked_call(*args, func=func, name=name):
error = func(*args)
if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
raise NvrtcCompilationError()
elif error != NvrtcResult.NVRTC_SUCCESS:
try:
error_name = NvrtcResult(error).name
except ValueError:
error_name = ('Unknown nvrtc_result '
f'(error code: {error})')
msg = f'Failed to call {name}: {error_name}'
raise NvrtcError(msg)
setattr(inst, name, checked_call)
return cls.__INSTANCE
def get_version(self):
"""
Get the NVRTC version as a tuple (major, minor).
"""
major = c_int()
minor = c_int()
self.nvrtcVersion(byref(major), byref(minor))
return major.value, minor.value
def create_program(self, src, name):
"""
Create an NVRTC program with managed lifetime.
"""
if isinstance(src, str):
src = src.encode()
if isinstance(name, str):
name = name.encode()
handle = nvrtc_program()
# The final three arguments are for passing the contents of headers -
# this is not supported, so there are 0 headers and the header names
# and contents are null.
self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
return NvrtcProgram(self, handle)
def compile_program(self, program, options):
"""
Compile an NVRTC program. Compilation may fail due to a user error in
the source; this function returns ``True`` if there is a compilation
error and ``False`` on success.
"""
# We hold a list of encoded options to ensure they can't be collected
# prior to the call to nvrtcCompileProgram
encoded_options = [opt.encode() for opt in options]
option_pointers = [c_char_p(opt) for opt in encoded_options]
c_options_type = (c_char_p * len(options))
c_options = c_options_type(*option_pointers)
try:
self.nvrtcCompileProgram(program.handle, len(options), c_options)
return False
except NvrtcCompilationError:
return True
def destroy_program(self, program):
"""
Destroy an NVRTC program.
"""
self.nvrtcDestroyProgram(byref(program.handle))
def get_compile_log(self, program):
"""
Get the compile log as a Python string.
"""
log_size = c_size_t()
self.nvrtcGetProgramLogSize(program.handle, byref(log_size))
log = (c_char * log_size.value)()
self.nvrtcGetProgramLog(program.handle, log)
return log.value.decode()
def get_ptx(self, program):
"""
Get the compiled PTX as a Python string.
"""
ptx_size = c_size_t()
self.nvrtcGetPTXSize(program.handle, byref(ptx_size))
ptx = (c_char * ptx_size.value)()
self.nvrtcGetPTX(program.handle, ptx)
return ptx.value.decode()
def compile(src, name, cc):
"""
Compile a CUDA C/C++ source to PTX for a given compute capability.
:param src: The source code to compile
:type src: str
:param name: The filename of the source (for information only)
:type name: str
:param cc: A tuple ``(major, minor)`` of the compute capability
:type cc: tuple
:return: The compiled PTX and compilation log
:rtype: tuple
"""
nvrtc = NVRTC()
program = nvrtc.create_program(src, name)
# Compilation options:
# - Compile for the current device's compute capability.
# - The CUDA include path is added.
# - Relocatable Device Code (rdc) is needed to prevent device functions
# being optimized away.
major, minor = cc
arch = f'--gpu-architecture=compute_{major}{minor}'
include = f'-I{config.CUDA_INCLUDE_PATH}'
cudadrv_path = os.path.dirname(os.path.abspath(__file__))
numba_cuda_path = os.path.dirname(cudadrv_path)
numba_include = f'-I{numba_cuda_path}'
options = [arch, include, numba_include, '-rdc', 'true']
# Compile the program
compile_error = nvrtc.compile_program(program, options)
# Get log from compilation
log = nvrtc.get_compile_log(program)
# If the compile failed, provide the log in an exception
if compile_error:
msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}')
raise NvrtcError(msg)
# Otherwise, if there's any content in the log, present it as a warning
if log:
msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
warnings.warn(msg)
ptx = nvrtc.get_ptx(program)
return ptx, log
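# Illustrative usage (the source and compute capability are made-up values):
#
#   >>> src = 'extern "C" __device__ int f(int *r, int x) { *r = x; return 0; }'
#   >>> ptx, log = compile(src, 'f.cu', (7, 5))     # doctest: +SKIP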

View File

@@ -0,0 +1,707 @@
"""
This is a direct translation of nvvm.h
"""
import logging
import re
import sys
import warnings
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
c_char)
import threading
from llvmlite import ir
from .error import NvvmError, NvvmSupportError, NvvmWarning
from .libs import get_libdevice, open_libdevice, open_cudalib
from numba.core import cgutils, config
logger = logging.getLogger(__name__)
ADDRSPACE_GENERIC = 0
ADDRSPACE_GLOBAL = 1
ADDRSPACE_SHARED = 3
ADDRSPACE_CONSTANT = 4
ADDRSPACE_LOCAL = 5
# Opaque handle for compilation unit
nvvm_program = c_void_p
# Result code
nvvm_result = c_int
RESULT_CODE_NAMES = '''
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
'''.split()
for i, k in enumerate(RESULT_CODE_NAMES):
setattr(sys.modules[__name__], k, i)
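# The loop above defines module-level constants from the names listed, e.g.
# NVVM_SUCCESS = 0 and NVVM_ERROR_COMPILATION = 9.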
# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
def is_available():
"""
    Return whether libNVVM is available.
"""
try:
NVVM()
except NvvmSupportError:
return False
else:
return True
_nvvm_lock = threading.Lock()
class NVVM(object):
'''Process-wide singleton.
'''
_PROTOTYPES = {
# nvvmResult nvvmVersion(int *major, int *minor)
'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmCreateProgram(nvvmProgram *cu)
'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
# size_t size, const char *name)
'nvvmAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
# const char* buffer,
# size_t size,
# const char *name)
'nvvmLazyAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
# const char **options)
'nvvmCompileProgram': (
nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
# nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetCompiledResultSize': (
nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
# const char** options)
'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
POINTER(c_char_p))
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvvm_lock:
if cls.__INSTANCE is None:
cls.__INSTANCE = inst = object.__new__(cls)
try:
inst.driver = open_cudalib('nvvm')
except OSError as e:
cls.__INSTANCE = None
errmsg = ("libNVVM cannot be found. Do `conda install "
"cudatoolkit`:\n%s")
raise NvvmSupportError(errmsg % e)
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(inst.driver, name)
func.restype = proto[0]
func.argtypes = proto[1:]
setattr(inst, name, func)
return cls.__INSTANCE
def __init__(self):
ir_versions = self.get_ir_version()
self._majorIR = ir_versions[0]
self._minorIR = ir_versions[1]
self._majorDbg = ir_versions[2]
self._minorDbg = ir_versions[3]
self._supported_ccs = get_supported_ccs()
@property
def data_layout(self):
if (self._majorIR, self._minorIR) < (1, 8):
return _datalayout_original
else:
return _datalayout_i128
@property
def supported_ccs(self):
return self._supported_ccs
def get_version(self):
major = c_int()
minor = c_int()
err = self.nvvmVersion(byref(major), byref(minor))
self.check_error(err, 'Failed to get version.')
return major.value, minor.value
def get_ir_version(self):
majorIR = c_int()
minorIR = c_int()
majorDbg = c_int()
minorDbg = c_int()
err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
byref(majorDbg), byref(minorDbg))
self.check_error(err, 'Failed to get IR version.')
return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value
def check_error(self, error, msg, exit=False):
if error:
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
if exit:
print(exc)
sys.exit(1)
else:
raise exc
class CompilationUnit(object):
def __init__(self):
self.driver = NVVM()
self._handle = nvvm_program()
err = self.driver.nvvmCreateProgram(byref(self._handle))
self.driver.check_error(err, 'Failed to create CU')
def __del__(self):
driver = NVVM()
err = driver.nvvmDestroyProgram(byref(self._handle))
driver.check_error(err, 'Failed to destroy CU', exit=True)
def add_module(self, buffer):
"""
        Add a module-level NVVM IR to a compilation unit.
        The buffer should contain an NVVM module IR either in the bitcode
        representation (LLVM 3.0) or in the text representation.
"""
err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def lazy_add_module(self, buffer):
"""
Lazily add an NVVM IR module to a compilation unit.
The buffer should contain NVVM module IR either in the bitcode
representation or in the text representation.
"""
err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def compile(self, **options):
"""Perform Compilation.
Compilation options are accepted as keyword arguments, with the
following considerations:
- Underscores (`_`) in option names are converted to dashes (`-`), to
match NVVM's option name format.
- Options that take a value will be emitted in the form
"-<name>=<value>".
- Booleans passed as option values will be converted to integers.
- Options which take no value (such as `-gen-lto`) should have a value
of `None` passed in and will be emitted in the form "-<name>".
For documentation on NVVM compilation options, see the CUDA Toolkit
Documentation:
https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
"""
def stringify_option(k, v):
k = k.replace('_', '-')
if v is None:
return f'-{k}'
if isinstance(v, bool):
v = int(v)
return f'-{k}={v}'
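        # For example, compile(opt=3, gen_lto=None, fma=True) produces the
        # option strings ['-opt=3', '-gen-lto', '-fma=1'].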
options = [stringify_option(k, v) for k, v in options.items()]
c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
for x in options])
# verify
err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to verify\n')
# compile
err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to compile\n')
# get result
reslen = c_size_t()
err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
self._try_error(err, 'Failed to get size of compiled result.')
output_buffer = (c_char * reslen.value)()
err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
self._try_error(err, 'Failed to get compiled result.')
# get log
self.log = self.get_log()
if self.log:
warnings.warn(self.log, category=NvvmWarning)
return output_buffer[:]
def _try_error(self, err, msg):
self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))
def get_log(self):
reslen = c_size_t()
err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
self.driver.check_error(err, 'Failed to get compilation log size.')
if reslen.value > 1:
logbuf = (c_char * reslen.value)()
err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
self.driver.check_error(err, 'Failed to get compilation log.')
            return logbuf.value.decode('utf8')
return ''
COMPUTE_CAPABILITIES = (
(3, 5), (3, 7),
(5, 0), (5, 2), (5, 3),
(6, 0), (6, 1), (6, 2),
(7, 0), (7, 2), (7, 5),
(8, 0), (8, 6), (8, 7), (8, 9),
(9, 0)
)
# Maps CTK version -> (min supported cc, max supported cc) inclusive
CTK_SUPPORTED = {
(11, 2): ((3, 5), (8, 6)),
(11, 3): ((3, 5), (8, 6)),
(11, 4): ((3, 5), (8, 7)),
(11, 5): ((3, 5), (8, 7)),
(11, 6): ((3, 5), (8, 7)),
(11, 7): ((3, 5), (8, 7)),
(11, 8): ((3, 5), (9, 0)),
(12, 0): ((5, 0), (9, 0)),
(12, 1): ((5, 0), (9, 0)),
(12, 2): ((5, 0), (9, 0)),
(12, 3): ((5, 0), (9, 0)),
(12, 4): ((5, 0), (9, 0)),
}
def ccs_supported_by_ctk(ctk_version):
try:
# For supported versions, we look up the range of supported CCs
min_cc, max_cc = CTK_SUPPORTED[ctk_version]
return tuple([cc for cc in COMPUTE_CAPABILITIES
if min_cc <= cc <= max_cc])
except KeyError:
# For unsupported CUDA toolkit versions, all we can do is assume all
# non-deprecated versions we are aware of are supported.
return tuple([cc for cc in COMPUTE_CAPABILITIES
if cc >= config.CUDA_DEFAULT_PTX_CC])
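# Illustrative examples: ccs_supported_by_ctk((11, 8)) yields every CC from
# (3, 5) through (9, 0), while ccs_supported_by_ctk((12, 0)) starts at (5, 0).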
def get_supported_ccs():
try:
from numba.cuda.cudadrv.runtime import runtime
cudart_version = runtime.get_version()
except: # noqa: E722
# We can't support anything if there's an error getting the runtime
# version (e.g. if it's not present or there's another issue)
_supported_cc = ()
return _supported_cc
# Ensure the minimum CTK version requirement is met
min_cudart = min(CTK_SUPPORTED)
if cudart_version < min_cudart:
_supported_cc = ()
ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
"required version.")
warnings.warn(unsupported_ver)
return _supported_cc
_supported_cc = ccs_supported_by_ctk(cudart_version)
return _supported_cc
def find_closest_arch(mycc):
"""
Given a compute capability, return the closest compute capability supported
by the CUDA toolkit.
:param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
:return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
"""
supported_ccs = NVVM().supported_ccs
if not supported_ccs:
msg = "No supported GPU compute capabilities found. " \
"Please check your cudatoolkit version matches your CUDA version."
raise NvvmSupportError(msg)
for i, cc in enumerate(supported_ccs):
if cc == mycc:
# Matches
return cc
elif cc > mycc:
# Exceeded
if i == 0:
# CC lower than supported
msg = "GPU compute capability %d.%d is not supported" \
"(requires >=%d.%d)" % (mycc + cc)
raise NvvmSupportError(msg)
else:
# return the previous CC
return supported_ccs[i - 1]
# CC higher than supported
return supported_ccs[-1] # Choose the highest
def get_arch_option(major, minor):
"""Matches with the closest architecture option
"""
if config.FORCE_CUDA_CC:
arch = config.FORCE_CUDA_CC
else:
arch = find_closest_arch((major, minor))
return 'compute_%d%d' % arch
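# Illustrative behaviour (assuming NVVM reports a CC list that includes
# (7, 0) and (7, 2), and that FORCE_CUDA_CC is unset): an exact match maps
# directly, while an in-between CC falls back to the closest lower one:
#
#     >>> get_arch_option(7, 0)
#     'compute_70'
#     >>> get_arch_option(7, 1)   # no exact match; previous CC is chosen
#     'compute_70'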
MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
Please ensure you have CUDA Toolkit 11.2 or higher.
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
$ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
For CUDA 11, ``cudatoolkit`` is required:
$ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
'''
class LibDevice(object):
_cache_ = None
def __init__(self):
if self._cache_ is None:
if get_libdevice() is None:
raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
            # Cache on the class so that subsequent instances reuse the
            # loaded bitcode
            type(self)._cache_ = open_libdevice()
self.bc = self._cache_
def get(self):
return self.bc
cas_nvvm = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501
# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%iptr = bitcast {T}* %ptr to {Ti}*
%old2 = load volatile {Ti}, {Ti}* %iptr
br label %attempt
attempt:
%old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
{CAS}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
%result = bitcast {Ti} %old to {T}
ret {T} %result
}}
""" # noqa: E501
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%ptrval = load volatile {T}, {T}* %ptr
; Return early when:
; - For nanmin / nanmax when val is a NaN
; - For min / max when val or ptr is a NaN
%early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
br i1 %early_return, label %done, label %lt_check
lt_check:
%dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
; Continue attempts if dold less or greater than val (depending on whether min or max)
; or if dold is NaN (for nanmin / nanmax)
%cmp = fcmp {OP} {T} %dold, %val
br i1 %cmp, label %attempt, label %done
attempt:
; Attempt to swap in the value
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%new = bitcast {T} %val to {Ti}
{CAS}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
done:
ret {T} %ptrval
}}
""" # noqa: E501
def ir_cas(Ti):
return cas_nvvm.format(Ti=Ti)
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_binary_template.format(**params)
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL,
FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_minmax_template.format(**params)
def ir_numba_atomic_inc(T, Tu):
return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
def ir_numba_atomic_dec(T, Tu):
return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
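# For example, expanding the binary template for a 32-bit float add yields a
# self-contained LLVM function (abridged below) that loops on cmpxchg until
# the compare-and-swap succeeds:
#
#     >>> print(ir_numba_atomic_binary(T='float', Ti='i32',
#     ...                              OP='fadd', FUNC='add'))
#     define internal float @___numba_atomic_float_add(float* %ptr, float %val) alwaysinline {
#     entry:
#     %iptr = bitcast float* %ptr to i32*
#     ...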
def llvm_replace(llvmir):
replacements = [
('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
ir_numba_atomic_inc(T='i64', Tu='u64')),
('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
ir_numba_atomic_dec(T='i64', Tu='u64')),
('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('immarg', '')
]
for decl, fn in replacements:
llvmir = llvmir.replace(decl, fn)
llvmir = llvm140_to_70_ir(llvmir)
return llvmir
def compile_ir(llvmir, **opts):
if isinstance(llvmir, str):
llvmir = [llvmir]
if opts.pop('fastmath', False):
opts.update({
'ftz': True,
'fma': True,
'prec_div': False,
'prec_sqrt': False,
})
cu = CompilationUnit()
libdevice = LibDevice()
for mod in llvmir:
mod = llvm_replace(mod)
cu.add_module(mod.encode('utf8'))
cu.lazy_add_module(libdevice.get())
return cu.compile(**opts)
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
def llvm140_to_70_ir(ir):
"""
Convert LLVM 14.0 IR for LLVM 7.0.
"""
buf = []
for line in ir.splitlines():
if line.startswith('attributes #'):
# Remove function attributes unsupported by LLVM 7.0
m = re_attributes_def.match(line)
attrs = m.group(1).split()
attrs = ' '.join(a for a in attrs if a != 'willreturn')
line = line.replace(m.group(1), attrs)
buf.append(line)
return '\n'.join(buf)
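# For instance, the unsupported `willreturn` attribute is stripped while the
# rest of the attribute group is preserved:
#
#     >>> llvm140_to_70_ir('attributes #0 = { alwaysinline willreturn }')
#     'attributes #0 = { alwaysinline }'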
def set_cuda_kernel(function):
"""
Mark a function as a CUDA kernel. Kernels have the following requirements:
- Metadata that marks them as a kernel.
- Addition to the @llvm.used list, so that they will not be discarded.
- The noinline attribute is not permitted, because this causes NVVM to emit
a warning, which counts as failing IR verification.
Presently it is assumed that there is one kernel per module, which holds
for Numba-jitted functions. If this changes in future or this function is
to be used externally, this function may need modification to add to the
@llvm.used list rather than creating it.
"""
module = function.module
# Add kernel metadata
mdstr = ir.MetaDataString(module, "kernel")
mdvalue = ir.Constant(ir.IntType(32), 1)
md = module.add_metadata((function, mdstr, mdvalue))
nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
nmd.add(md)
# Create the used list
ptrty = ir.IntType(8).as_pointer()
usedty = ir.ArrayType(ptrty, 1)
fnptr = function.bitcast(ptrty)
llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
llvm_used.linkage = 'appending'
llvm_used.section = 'llvm.metadata'
llvm_used.initializer = ir.Constant(usedty, [fnptr])
# Remove 'noinline' if it is present.
function.attributes.discard('noinline')
def add_ir_version(mod):
"""Add NVVM IR version to module"""
# We specify the IR version to match the current NVVM's IR version
i32 = ir.IntType(32)
ir_versions = [i32(v) for v in NVVM().get_ir_version()]
md_ver = mod.add_metadata(ir_versions)
mod.add_named_metadata('nvvmir.version', md_ver)

View File

@@ -0,0 +1,10 @@
"""
Declarations of the Runtime API functions.
"""
from ctypes import c_int, POINTER
API_PROTOTYPES = {
# cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion )
'cudaRuntimeGetVersion': (c_int, POINTER(c_int)),
}

View File

@@ -0,0 +1,142 @@
"""
CUDA Runtime wrapper.
This provides a very minimal set of bindings, since the Runtime API is not
really used in Numba except for querying the Runtime version.
"""
import ctypes
import functools
import sys
from numba.core import config
from numba.cuda.cudadrv.driver import ERROR_MAP, make_logger
from numba.cuda.cudadrv.error import CudaSupportError, CudaRuntimeError
from numba.cuda.cudadrv.libs import open_cudalib
from numba.cuda.cudadrv.rtapi import API_PROTOTYPES
from numba.cuda.cudadrv import enums
class CudaRuntimeAPIError(CudaRuntimeError):
"""
Raised when there is an error accessing a C API from the CUDA Runtime.
"""
def __init__(self, code, msg):
self.code = code
self.msg = msg
super().__init__(code, msg)
def __str__(self):
return "[%s] %s" % (self.code, self.msg)
class Runtime:
"""
Runtime object that lazily binds runtime API functions.
"""
def __init__(self):
self.is_initialized = False
def _initialize(self):
# lazily initialize logger
global _logger
_logger = make_logger()
if config.DISABLE_CUDA:
msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
"in the environment, or because CUDA is unsupported on "
"32-bit systems.")
raise CudaSupportError(msg)
self.lib = open_cudalib('cudart')
self.is_initialized = True
def __getattr__(self, fname):
# First request of a runtime API function
try:
proto = API_PROTOTYPES[fname]
except KeyError:
raise AttributeError(fname)
restype = proto[0]
argtypes = proto[1:]
if not self.is_initialized:
self._initialize()
# Find function in runtime library
libfn = self._find_api(fname)
libfn.restype = restype
libfn.argtypes = argtypes
safe_call = self._wrap_api_call(fname, libfn)
setattr(self, fname, safe_call)
return safe_call
def _wrap_api_call(self, fname, libfn):
@functools.wraps(libfn)
def safe_cuda_api_call(*args):
_logger.debug('call runtime api: %s', libfn.__name__)
retcode = libfn(*args)
self._check_error(fname, retcode)
return safe_cuda_api_call
def _check_error(self, fname, retcode):
if retcode != enums.CUDA_SUCCESS:
errname = ERROR_MAP.get(retcode, "cudaErrorUnknown")
msg = "Call to %s results in %s" % (fname, errname)
_logger.error(msg)
raise CudaRuntimeAPIError(retcode, msg)
def _find_api(self, fname):
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Delay missing function error to use
def absent_function(*args, **kws):
msg = "runtime missing function: %s."
raise CudaRuntimeError(msg % fname)
setattr(self, fname, absent_function)
return absent_function
def get_version(self):
"""
Returns the CUDA Runtime version as a tuple (major, minor).
"""
rtver = ctypes.c_int()
self.cudaRuntimeGetVersion(ctypes.byref(rtver))
# The version is encoded as (1000 * major) + (10 * minor)
major = rtver.value // 1000
minor = (rtver.value - (major * 1000)) // 10
return (major, minor)
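# Worked example of the decoding: cudart reports version 11.2 as the integer
# 11020, so major == 11020 // 1000 == 11 and
# minor == (11020 - 11000) // 10 == 2, giving (11, 2).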
def is_supported_version(self):
"""
Returns True if the CUDA Runtime is a supported version.
"""
return self.get_version() in self.supported_versions
@property
def supported_versions(self):
"""A tuple of all supported CUDA toolkit versions. Versions are given in
the form ``(major_version, minor_version)``."""
if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64:
# Only 64-bit Linux and Windows are supported
return ()
return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
(11, 7))
runtime = Runtime()
def get_version():
"""
Return the runtime version as a tuple of (major, minor)
"""
return runtime.get_version()

File diff suppressed because it is too large

View File

@@ -0,0 +1,140 @@
import math
from numba.core import types
from numba.core.typing.templates import ConcreteTemplate, signature, Registry
registry = Registry()
infer_global = registry.register_global
@infer_global(math.acos)
@infer_global(math.acosh)
@infer_global(math.asin)
@infer_global(math.asinh)
@infer_global(math.atan)
@infer_global(math.atanh)
@infer_global(math.cosh)
@infer_global(math.degrees)
@infer_global(math.erf)
@infer_global(math.erfc)
@infer_global(math.expm1)
@infer_global(math.gamma)
@infer_global(math.lgamma)
@infer_global(math.log1p)
@infer_global(math.radians)
@infer_global(math.sinh)
@infer_global(math.tanh)
@infer_global(math.tan)
class Math_unary(ConcreteTemplate):
cases = [
signature(types.float64, types.int64),
signature(types.float64, types.uint64),
signature(types.float32, types.float32),
signature(types.float64, types.float64),
]
@infer_global(math.sin)
@infer_global(math.cos)
@infer_global(math.ceil)
@infer_global(math.floor)
@infer_global(math.sqrt)
@infer_global(math.log)
@infer_global(math.log2)
@infer_global(math.log10)
@infer_global(math.exp)
@infer_global(math.fabs)
@infer_global(math.trunc)
class Math_unary_with_fp16(ConcreteTemplate):
cases = [
signature(types.float64, types.int64),
signature(types.float64, types.uint64),
signature(types.float32, types.float32),
signature(types.float64, types.float64),
signature(types.float16, types.float16),
]
@infer_global(math.atan2)
class Math_atan2(ConcreteTemplate):
key = math.atan2
cases = [
signature(types.float64, types.int64, types.int64),
signature(types.float64, types.uint64, types.uint64),
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.hypot)
class Math_hypot(ConcreteTemplate):
key = math.hypot
cases = [
signature(types.float64, types.int64, types.int64),
signature(types.float64, types.uint64, types.uint64),
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.copysign)
@infer_global(math.fmod)
class Math_binary(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.remainder)
class Math_remainder(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.pow)
class Math_pow(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
signature(types.float32, types.float32, types.int32),
signature(types.float64, types.float64, types.int32),
]
@infer_global(math.frexp)
class Math_frexp(ConcreteTemplate):
cases = [
signature(types.Tuple([types.float32, types.int32]), types.float32),
signature(types.Tuple([types.float64, types.int32]), types.float64),
]
@infer_global(math.ldexp)
class Math_ldexp(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.int32),
signature(types.float64, types.float64, types.int32),
]
@infer_global(math.isinf)
@infer_global(math.isnan)
@infer_global(math.isfinite)
class Math_isnan(ConcreteTemplate):
cases = [
signature(types.boolean, types.int64),
signature(types.boolean, types.uint64),
signature(types.boolean, types.float32),
signature(types.boolean, types.float64),
]
@infer_global(math.modf)
class Math_modf(ConcreteTemplate):
cases = [
signature(types.UniTuple(types.float64, 2), types.float64),
signature(types.UniTuple(types.float32, 2), types.float32)
]

View File

@@ -0,0 +1,191 @@
from warnings import warn
from numba.core import types, config, sigutils
from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning
from numba.cuda.compiler import declare_device_function
from numba.cuda.dispatcher import CUDADispatcher
from numba.cuda.simulator.kernel import FakeCUDAKernel
_msg_deprecated_signature_arg = ("Deprecated keyword argument `{0}`. "
"Signatures should be passed as the first "
"positional argument.")
def jit(func_or_sig=None, device=False, inline=False, link=None, debug=None,
opt=True, lineinfo=False, cache=False, **kws):
"""
JIT compile a Python function for CUDA GPUs.
:param func_or_sig: A function to JIT compile, or *signatures* of a
function to compile. If a function is supplied, then a
:class:`Dispatcher <numba.cuda.dispatcher.CUDADispatcher>` is returned.
Otherwise, ``func_or_sig`` may be a signature or a list of signatures,
and a function is returned. The returned function accepts another
function, which it will compile and then return a :class:`Dispatcher
<numba.cuda.dispatcher.CUDADispatcher>`. See :ref:`jit-decorator` for
more information about passing signatures.
.. note:: A kernel cannot have any return value.
:param device: Indicates whether this is a device function.
:type device: bool
:param link: A list of files containing PTX or CUDA C/C++ source to link
with the function
:type link: list
:param debug: If True, check for exceptions thrown when executing the
kernel. Since this degrades performance, this should only be used for
debugging purposes. If set to True, then ``opt`` should be set to False.
Defaults to False. (The default value can be overridden by setting
environment variable ``NUMBA_CUDA_DEBUGINFO=1``.)
:param fastmath: When True, enables fastmath optimizations as outlined in
the :ref:`CUDA Fast Math documentation <cuda-fast-math>`.
:param max_registers: Request that the kernel is limited to using at most
this number of registers per thread. The limit may not be respected if
the ABI requires a greater number of registers than that requested.
Useful for increasing occupancy.
:param opt: Whether to compile from LLVM IR to PTX with optimization
enabled. When ``True``, ``-opt=3`` is passed to NVVM. When
``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``.
:type opt: bool
:param lineinfo: If True, generate a line mapping between source code and
assembly code. This enables inspection of the source code in NVIDIA
profiling tools and correlation with program counter sampling.
:type lineinfo: bool
:param cache: If True, enables the file-based cache for this function.
:type cache: bool
"""
if link is None:
link = []
if link and config.ENABLE_CUDASIM:
raise NotImplementedError('Cannot link PTX in the simulator')
if kws.get('boundscheck'):
raise NotImplementedError("bounds checking is not supported for CUDA")
if kws.get('argtypes') is not None:
msg = _msg_deprecated_signature_arg.format('argtypes')
raise DeprecationError(msg)
if kws.get('restype') is not None:
msg = _msg_deprecated_signature_arg.format('restype')
raise DeprecationError(msg)
if kws.get('bind') is not None:
msg = _msg_deprecated_signature_arg.format('bind')
raise DeprecationError(msg)
debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
fastmath = kws.get('fastmath', False)
extensions = kws.get('extensions', [])
if debug and opt:
msg = ("debug=True with opt=True (the default) "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False.")
warn(NumbaInvalidConfigWarning(msg))
if debug and lineinfo:
msg = ("debug and lineinfo are mutually exclusive. Use debug to get "
"full debug info (this disables some optimizations), or "
"lineinfo for line info only with code generation unaffected.")
warn(NumbaInvalidConfigWarning(msg))
    if device and link:
raise ValueError("link keyword invalid for device function")
if sigutils.is_signature(func_or_sig):
signatures = [func_or_sig]
specialized = True
elif isinstance(func_or_sig, list):
signatures = func_or_sig
specialized = False
else:
signatures = None
if signatures is not None:
if config.ENABLE_CUDASIM:
def jitwrapper(func):
return FakeCUDAKernel(func, device=device, fastmath=fastmath)
return jitwrapper
def _jit(func):
targetoptions = kws.copy()
targetoptions['debug'] = debug
targetoptions['lineinfo'] = lineinfo
targetoptions['link'] = link
targetoptions['opt'] = opt
targetoptions['fastmath'] = fastmath
targetoptions['device'] = device
targetoptions['extensions'] = extensions
disp = CUDADispatcher(func, targetoptions=targetoptions)
if cache:
disp.enable_caching()
for sig in signatures:
argtypes, restype = sigutils.normalize_signature(sig)
if restype and not device and restype != types.void:
raise TypeError("CUDA kernel must have void return type.")
if device:
from numba.core import typeinfer
with typeinfer.register_dispatcher(disp):
disp.compile_device(argtypes, restype)
else:
disp.compile(argtypes)
disp._specialized = specialized
disp.disable_compile()
return disp
return _jit
else:
if func_or_sig is None:
if config.ENABLE_CUDASIM:
def autojitwrapper(func):
return FakeCUDAKernel(func, device=device,
fastmath=fastmath)
else:
def autojitwrapper(func):
return jit(func, device=device, debug=debug, opt=opt,
lineinfo=lineinfo, link=link, cache=cache, **kws)
return autojitwrapper
# func_or_sig is a function
else:
if config.ENABLE_CUDASIM:
return FakeCUDAKernel(func_or_sig, device=device,
fastmath=fastmath)
else:
targetoptions = kws.copy()
targetoptions['debug'] = debug
targetoptions['lineinfo'] = lineinfo
targetoptions['opt'] = opt
targetoptions['link'] = link
targetoptions['fastmath'] = fastmath
targetoptions['device'] = device
targetoptions['extensions'] = extensions
disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
if cache:
disp.enable_caching()
return disp
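# A minimal usage sketch of the decorator (assumes a CUDA-capable GPU and
# toolkit; `inc_by_one` is a hypothetical user kernel, not part of this
# module):
#
#     from numba import cuda
#     import numpy as np
#
#     @cuda.jit('void(float32[::1])')
#     def inc_by_one(x):
#         i = cuda.grid(1)
#         if i < x.size:
#             x[i] += 1
#
#     arr = np.zeros(128, dtype=np.float32)
#     inc_by_one[1, 128](arr)   # griddim=1, blockdim=128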
def declare_device(name, sig):
"""
Declare the signature of a foreign function. Returns a descriptor that can
be used to call the function from a Python kernel.
:param name: The name of the foreign function.
:type name: str
:param sig: The Numba signature of the function.
"""
argtypes, restype = sigutils.normalize_signature(sig)
if restype is None:
msg = 'Return type must be provided for device declarations'
raise TypeError(msg)
return declare_device_function(name, restype, argtypes)
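# Typical usage sketch: declare an externally-compiled device function and
# link its source when jitting a kernel. The names 'mul_f32' and 'mul.cu'
# are hypothetical:
#
#     mul = cuda.declare_device('mul_f32', 'float32(float32, float32)')
#
#     @cuda.jit(link=['mul.cu'])
#     def kernel(a, b, out):
#         i = cuda.grid(1)
#         if i < out.size:
#             out[i] = mul(a[i], b[i])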

View File

@@ -0,0 +1,33 @@
from numba.core.descriptors import TargetDescriptor
from numba.core.options import TargetOptions
from .target import CUDATargetContext, CUDATypingContext
class CUDATargetOptions(TargetOptions):
pass
class CUDATarget(TargetDescriptor):
def __init__(self, name):
self.options = CUDATargetOptions
# The typing and target contexts are initialized only when needed -
# this prevents an attempt to load CUDA libraries at import time on
# systems that might not have them present.
self._typingctx = None
self._targetctx = None
super().__init__(name)
@property
def typing_context(self):
if self._typingctx is None:
self._typingctx = CUDATypingContext()
return self._typingctx
@property
def target_context(self):
if self._targetctx is None:
self._targetctx = CUDATargetContext(self._typingctx)
return self._targetctx
cuda_target = CUDATarget('cuda')

View File

@@ -0,0 +1,89 @@
# Re export
import sys
from numba.cuda import cg
from .stubs import (threadIdx, blockIdx, blockDim, gridDim, laneid, warpsize,
syncwarp, shared, local, const, atomic,
shfl_sync_intrinsic, vote_sync_intrinsic, match_any_sync,
match_all_sync, threadfence_block, threadfence_system,
threadfence, selp, popc, brev, clz, ffs, fma, cbrt,
activemask, lanemask_lt, nanosleep, fp16,
_vector_type_stubs)
from .intrinsics import (grid, gridsize, syncthreads, syncthreads_and,
syncthreads_count, syncthreads_or)
from .cudadrv.error import CudaSupportError
from numba.cuda.cudadrv.driver import (BaseCUDAMemoryManager,
HostOnlyCUDAMemoryManager,
GetIpcHandleMixin, MemoryPointer,
MappedMemory, PinnedMemory, MemoryInfo,
IpcHandle, set_memory_manager)
from numba.cuda.cudadrv.runtime import runtime
from .cudadrv import nvvm
from numba.cuda import initialize
from .errors import KernelRuntimeError
from .decorators import jit, declare_device
from .api import *
from .api import _auto_device
from .args import In, Out, InOut
from .intrinsic_wrapper import (all_sync, any_sync, eq_sync, ballot_sync,
shfl_sync, shfl_up_sync, shfl_down_sync,
shfl_xor_sync)
from .kernels import reduction
reduce = Reduce = reduction.Reduce
# Expose vector type constructors and aliases as module level attributes.
for vector_type_stub in _vector_type_stubs:
setattr(sys.modules[__name__], vector_type_stub.__name__, vector_type_stub)
for alias in vector_type_stub.aliases:
setattr(sys.modules[__name__], alias, vector_type_stub)
del vector_type_stub, _vector_type_stubs
def is_available():
"""Returns a boolean to indicate the availability of a CUDA GPU.
This will initialize the driver if it hasn't been initialized.
"""
    # Whilst `driver.is_available` will initialize the driver itself, that
    # initialization may raise. Since `cuda.is_available` is often used as a
    # guard for whether to run a CUDA test, an exception here would break
    # test discovery/orchestration, so the try/except below handles this
    # case.
driver_is_available = False
try:
driver_is_available = driver.driver.is_available
except CudaSupportError:
pass
return driver_is_available and nvvm.is_available()
def is_supported_version():
"""Returns True if the CUDA Runtime is a supported version.
Unsupported versions (e.g. newer versions than those known to Numba)
may still work; this function provides a facility to check whether the
current Numba version is tested and known to work with the current
runtime version. If the current version is unsupported, the caller can
decide how to act. Options include:
- Continuing silently,
- Emitting a warning,
- Generating an error or otherwise preventing the use of CUDA.
"""
return runtime.is_supported_version()
def cuda_error():
"""Returns None if there was no error initializing the CUDA driver.
If there was an error initializing the driver, a string describing the
error is returned.
"""
return driver.driver.initialization_error
initialize.initialize_all()

View File

@@ -0,0 +1,919 @@
"""
Implements custom ufunc dispatch mechanism for non-CPU devices.
"""
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
import operator
import warnings
from functools import reduce
import numpy as np
from numba.np.ufunc.ufuncbuilder import _BaseUFuncBuilder, parse_identity
from numba.core import types, sigutils
from numba.core.typing import signature
from numba.np.ufunc.sigparse import parse_signature
def _broadcast_axis(a, b):
"""
Raises
------
ValueError if broadcast fails
"""
if a == b:
return a
elif a == 1:
return b
elif b == 1:
return a
else:
raise ValueError("failed to broadcast {0} and {1}".format(a, b))
def _pairwise_broadcast(shape1, shape2):
"""
Raises
------
ValueError if broadcast fails
"""
shape1, shape2 = map(tuple, [shape1, shape2])
while len(shape1) < len(shape2):
shape1 = (1,) + shape1
while len(shape1) > len(shape2):
shape2 = (1,) + shape2
return tuple(_broadcast_axis(a, b) for a, b in zip(shape1, shape2))
def _multi_broadcast(*shapelist):
"""
Raises
------
ValueError if broadcast fails
"""
assert shapelist
result = shapelist[0]
others = shapelist[1:]
try:
for i, each in enumerate(others, start=1):
result = _pairwise_broadcast(result, each)
except ValueError:
raise ValueError("failed to broadcast argument #{0}".format(i))
else:
return result
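# These helpers mirror NumPy broadcasting on shape tuples alone, e.g.:
#
#     >>> _pairwise_broadcast((3, 1), (4,))
#     (3, 4)
#     >>> _multi_broadcast((3, 1), (1, 4), (3, 4))
#     (3, 4)
#     >>> _pairwise_broadcast((3,), (4,))
#     Traceback (most recent call last):
#         ...
#     ValueError: failed to broadcast 3 and 4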
class UFuncMechanism(object):
"""
Prepare ufunc arguments for vectorize.
"""
DEFAULT_STREAM = None
SUPPORT_DEVICE_SLICING = False
def __init__(self, typemap, args):
"""Never used directly by user. Invoke by UFuncMechanism.call().
"""
self.typemap = typemap
self.args = args
nargs = len(self.args)
self.argtypes = [None] * nargs
self.scalarpos = []
self.signature = None
self.arrays = [None] * nargs
def _fill_arrays(self):
"""
Get all arguments in array form
"""
for i, arg in enumerate(self.args):
if self.is_device_array(arg):
self.arrays[i] = self.as_device_array(arg)
elif isinstance(arg, (int, float, complex, np.number)):
# Is scalar
self.scalarpos.append(i)
else:
self.arrays[i] = np.asarray(arg)
def _fill_argtypes(self):
"""
Get dtypes
"""
for i, ary in enumerate(self.arrays):
if ary is not None:
dtype = getattr(ary, 'dtype')
if dtype is None:
dtype = np.asarray(ary).dtype
self.argtypes[i] = dtype
def _resolve_signature(self):
"""Resolve signature.
May have ambiguous case.
"""
matches = []
# Resolve scalar args exact match first
if self.scalarpos:
# Try resolve scalar arguments
for formaltys in self.typemap:
match_map = []
for i, (formal, actual) in enumerate(zip(formaltys,
self.argtypes)):
if actual is None:
actual = np.asarray(self.args[i]).dtype
match_map.append(actual == formal)
if all(match_map):
matches.append(formaltys)
# No matching with exact match; try coercing the scalar arguments
if not matches:
matches = []
for formaltys in self.typemap:
all_matches = all(actual is None or formal == actual
for formal, actual in
zip(formaltys, self.argtypes))
if all_matches:
matches.append(formaltys)
if not matches:
raise TypeError("No matching version. GPU ufunc requires array "
"arguments to have the exact types. This behaves "
"like regular ufunc with casting='no'.")
if len(matches) > 1:
raise TypeError("Failed to resolve ufunc due to ambiguous "
"signature. Too many untyped scalars. "
"Use numpy dtype object to type tag.")
# Try scalar arguments
self.argtypes = matches[0]
def _get_actual_args(self):
"""Return the actual arguments
Casts scalar arguments to np.array.
"""
for i in self.scalarpos:
self.arrays[i] = np.array([self.args[i]], dtype=self.argtypes[i])
return self.arrays
def _broadcast(self, arys):
"""Perform numpy ufunc broadcasting
"""
shapelist = [a.shape for a in arys]
shape = _multi_broadcast(*shapelist)
for i, ary in enumerate(arys):
if ary.shape == shape:
pass
else:
if self.is_device_array(ary):
arys[i] = self.broadcast_device(ary, shape)
else:
ax_differs = [ax for ax in range(len(shape))
if ax >= ary.ndim
or ary.shape[ax] != shape[ax]]
missingdim = len(shape) - len(ary.shape)
strides = [0] * missingdim + list(ary.strides)
for ax in ax_differs:
strides[ax] = 0
strided = np.lib.stride_tricks.as_strided(ary,
shape=shape,
strides=strides)
arys[i] = self.force_array_layout(strided)
return arys
def get_arguments(self):
"""Prepare and return the arguments for the ufunc.
Does not call to_device().
"""
self._fill_arrays()
self._fill_argtypes()
self._resolve_signature()
arys = self._get_actual_args()
return self._broadcast(arys)
def get_function(self):
"""Returns (result_dtype, function)
"""
return self.typemap[self.argtypes]
def is_device_array(self, obj):
"""Is the `obj` a device array?
Override in subclass
"""
return False
def as_device_array(self, obj):
"""Convert the `obj` to a device array
Override in subclass
Default implementation is an identity function
"""
return obj
def broadcast_device(self, ary, shape):
"""Handles ondevice broadcasting
Override in subclass to add support.
"""
raise NotImplementedError("broadcasting on device is not supported")
def force_array_layout(self, ary):
"""Ensures array layout met device requirement.
Override in sublcass
"""
return ary
@classmethod
def call(cls, typemap, args, kws):
"""Perform the entire ufunc call mechanism.
"""
# Handle keywords
stream = kws.pop('stream', cls.DEFAULT_STREAM)
out = kws.pop('out', None)
if kws:
warnings.warn("unrecognized keywords: %s" % ', '.join(kws))
# Begin call resolution
cr = cls(typemap, args)
args = cr.get_arguments()
resty, func = cr.get_function()
outshape = args[0].shape
# Adjust output value
if out is not None and cr.is_device_array(out):
out = cr.as_device_array(out)
def attempt_ravel(a):
if cr.SUPPORT_DEVICE_SLICING:
raise NotImplementedError
try:
# Call the `.ravel()` method
return a.ravel()
except NotImplementedError:
# If it is not a device array
if not cr.is_device_array(a):
raise
# For device array, retry ravel on the host by first
# copying it back.
else:
hostary = cr.to_host(a, stream).ravel()
return cr.to_device(hostary, stream)
if args[0].ndim > 1:
args = [attempt_ravel(a) for a in args]
# Prepare argument on the device
devarys = []
any_device = False
for a in args:
if cr.is_device_array(a):
devarys.append(a)
any_device = True
else:
dev_a = cr.to_device(a, stream=stream)
devarys.append(dev_a)
# Launch
shape = args[0].shape
if out is None:
# No output is provided
devout = cr.allocate_device_array(shape, resty, stream=stream)
devarys.extend([devout])
cr.launch(func, shape[0], stream, devarys)
if any_device:
# If any of the arguments are on device,
# Keep output on the device
return devout.reshape(outshape)
else:
# Otherwise, transfer output back to host
return devout.copy_to_host().reshape(outshape)
elif cr.is_device_array(out):
# If output is provided and it is a device array,
# Return device array
if out.ndim > 1:
out = attempt_ravel(out)
devout = out
devarys.extend([devout])
cr.launch(func, shape[0], stream, devarys)
return devout.reshape(outshape)
else:
# If output is provided and it is a host array,
# Return host array
assert out.shape == shape
assert out.dtype == resty
devout = cr.allocate_device_array(shape, resty, stream=stream)
devarys.extend([devout])
cr.launch(func, shape[0], stream, devarys)
return devout.copy_to_host(out, stream=stream).reshape(outshape)
def to_device(self, hostary, stream):
"""Implement to device transfer
Override in subclass
"""
raise NotImplementedError
def to_host(self, devary, stream):
"""Implement to host transfer
Override in subclass
"""
raise NotImplementedError
def allocate_device_array(self, shape, dtype, stream):
"""Implements device allocation
Override in subclass
"""
raise NotImplementedError
def launch(self, func, count, stream, args):
"""Implements device function invocation
Override in subclass
"""
raise NotImplementedError
def to_dtype(ty):
if isinstance(ty, types.EnumMember):
ty = ty.dtype
return np.dtype(str(ty))
class DeviceVectorize(_BaseUFuncBuilder):
def __init__(self, func, identity=None, cache=False, targetoptions=None):
if targetoptions is None:
targetoptions = {}
if cache:
raise TypeError("caching is not supported")
for opt in targetoptions:
if opt == 'nopython':
warnings.warn("nopython kwarg for cuda target is redundant",
RuntimeWarning)
else:
fmt = "Unrecognized options. "
fmt += "cuda vectorize target does not support option: '%s'"
raise KeyError(fmt % opt)
self.py_func = func
self.identity = parse_identity(identity)
# { arg_dtype: (return_dtype), cudakernel }
self.kernelmap = OrderedDict()
@property
def pyfunc(self):
return self.py_func
def add(self, sig=None):
# compile core as device function
args, return_type = sigutils.normalize_signature(sig)
devfnsig = signature(return_type, *args)
funcname = self.pyfunc.__name__
kernelsource = self._get_kernel_source(self._kernel_template,
devfnsig, funcname)
corefn, return_type = self._compile_core(devfnsig)
glbl = self._get_globals(corefn)
sig = signature(types.void, *([a[:] for a in args] + [return_type[:]]))
exec(kernelsource, glbl)
stager = glbl['__vectorized_%s' % funcname]
kernel = self._compile_kernel(stager, sig)
argdtypes = tuple(to_dtype(t) for t in devfnsig.args)
resdtype = to_dtype(return_type)
self.kernelmap[tuple(argdtypes)] = resdtype, kernel
def build_ufunc(self):
raise NotImplementedError
def _get_kernel_source(self, template, sig, funcname):
args = ['a%d' % i for i in range(len(sig.args))]
fmts = dict(name=funcname,
args=', '.join(args),
argitems=', '.join('%s[__tid__]' % i for i in args))
return template.format(**fmts)
def _compile_core(self, sig):
raise NotImplementedError
def _get_globals(self, corefn):
raise NotImplementedError
def _compile_kernel(self, fnobj, sig):
raise NotImplementedError
class DeviceGUFuncVectorize(_BaseUFuncBuilder):
def __init__(
self,
func,
sig,
identity=None,
cache=False,
targetoptions=None,
writable_args=(),
):
if targetoptions is None:
targetoptions = {}
if cache:
raise TypeError("caching is not supported")
if writable_args:
raise TypeError("writable_args are not supported")
# Allow nopython flag to be set.
if not targetoptions.pop('nopython', True):
raise TypeError("nopython flag must be True")
# Are there any more target options?
if targetoptions:
opts = ', '.join([repr(k) for k in targetoptions.keys()])
fmt = "The following target options are not supported: {0}"
raise TypeError(fmt.format(opts))
self.py_func = func
self.identity = parse_identity(identity)
self.signature = sig
self.inputsig, self.outputsig = parse_signature(self.signature)
# Maps from a tuple of input_dtypes to (output_dtypes, kernel)
self.kernelmap = OrderedDict()
@property
def pyfunc(self):
return self.py_func
def add(self, sig=None):
indims = [len(x) for x in self.inputsig]
outdims = [len(x) for x in self.outputsig]
args, return_type = sigutils.normalize_signature(sig)
# It is only valid to specify types.none as a return type, or to not
# specify the return type (where the "Python None" is the return type)
valid_return_type = return_type in (types.none, None)
if not valid_return_type:
raise TypeError('guvectorized functions cannot return values: '
f'signature {sig} specifies {return_type} return '
'type')
funcname = self.py_func.__name__
src = expand_gufunc_template(self._kernel_template, indims,
outdims, funcname, args)
glbls = self._get_globals(sig)
exec(src, glbls)
fnobj = glbls['__gufunc_{name}'.format(name=funcname)]
outertys = list(_determine_gufunc_outer_types(args, indims + outdims))
kernel = self._compile_kernel(fnobj, sig=tuple(outertys))
nout = len(outdims)
dtypes = [np.dtype(str(t.dtype)) for t in outertys]
indtypes = tuple(dtypes[:-nout])
outdtypes = tuple(dtypes[-nout:])
self.kernelmap[indtypes] = outdtypes, kernel
def _compile_kernel(self, fnobj, sig):
raise NotImplementedError
def _get_globals(self, sig):
raise NotImplementedError
def _determine_gufunc_outer_types(argtys, dims):
for at, nd in zip(argtys, dims):
if isinstance(at, types.Array):
yield at.copy(ndim=nd + 1)
else:
if nd > 0:
raise ValueError("gufunc signature mismatch: ndim>0 for scalar")
yield types.Array(dtype=at, ndim=1, layout='A')
def expand_gufunc_template(template, indims, outdims, funcname, argtypes):
"""Expand gufunc source template
"""
argdims = indims + outdims
argnames = ["arg{0}".format(i) for i in range(len(argdims))]
checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a)
for a in argnames]))
inputs = [_gen_src_for_indexing(aref, adims, atype)
for aref, adims, atype in zip(argnames, indims, argtypes)]
outputs = [_gen_src_for_indexing(aref, adims, atype)
for aref, adims, atype in zip(argnames[len(indims):], outdims,
argtypes[len(indims):])]
argitems = inputs + outputs
src = template.format(name=funcname, args=', '.join(argnames),
checkedarg=checkedarg,
argitems=', '.join(argitems))
return src
def _gen_src_for_indexing(aref, adims, atype):
return "{aref}[{sliced}]".format(aref=aref,
sliced=_gen_src_index(adims, atype))
def _gen_src_index(adims, atype):
if adims > 0:
return ','.join(['__tid__'] + [':'] * adims)
elif isinstance(atype, types.Array) and atype.ndim - 1 == adims:
# Special case for 0-nd in shape-signature but
# 1d array in type signature.
# Slice it so that the result has the same dimension.
return '__tid__:(__tid__ + 1)'
else:
return '__tid__'
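# Examples of the generated index expressions (derivable from the code
# above): a 2-d core dimension indexes the loop axis and slices the core
# axes, while a true scalar indexes directly:
#
#     >>> _gen_src_index(2, None)
#     '__tid__,:,:'
#     >>> _gen_src_index(0, types.float32)
#     '__tid__'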
class GUFuncEngine(object):
'''Determine how to broadcast and execute a gufunc
    based on input shapes and signature.
'''
@classmethod
def from_signature(cls, signature):
return cls(*parse_signature(signature))
def __init__(self, inputsig, outputsig):
# signatures
self.sin = inputsig
self.sout = outputsig
# argument count
self.nin = len(self.sin)
self.nout = len(self.sout)
def schedule(self, ishapes):
if len(ishapes) != self.nin:
            raise TypeError('invalid number of input arguments')
# associate symbol values for input signature
symbolmap = {}
outer_shapes = []
inner_shapes = []
for argn, (shape, symbols) in enumerate(zip(ishapes, self.sin)):
argn += 1 # start from 1 for human
inner_ndim = len(symbols)
if len(shape) < inner_ndim:
fmt = "arg #%d: insufficient inner dimension"
raise ValueError(fmt % (argn,))
if inner_ndim:
inner_shape = shape[-inner_ndim:]
outer_shape = shape[:-inner_ndim]
else:
inner_shape = ()
outer_shape = shape
for axis, (dim, sym) in enumerate(zip(inner_shape, symbols)):
axis += len(outer_shape)
if sym in symbolmap:
if symbolmap[sym] != dim:
fmt = "arg #%d: shape[%d] mismatch argument"
raise ValueError(fmt % (argn, axis))
symbolmap[sym] = dim
outer_shapes.append(outer_shape)
inner_shapes.append(inner_shape)
# solve output shape
oshapes = []
for outsig in self.sout:
oshape = []
for sym in outsig:
oshape.append(symbolmap[sym])
oshapes.append(tuple(oshape))
# find the biggest outershape as looping dimension
sizes = [reduce(operator.mul, s, 1) for s in outer_shapes]
largest_i = np.argmax(sizes)
loopdims = outer_shapes[largest_i]
pinned = [False] * self.nin # same argument for each iteration
for i, d in enumerate(outer_shapes):
if d != loopdims:
if d == (1,) or d == ():
pinned[i] = True
else:
fmt = "arg #%d: outer dimension mismatch"
raise ValueError(fmt % (i + 1,))
return GUFuncSchedule(self, inner_shapes, oshapes, loopdims, pinned)
class GUFuncSchedule(object):
def __init__(self, parent, ishapes, oshapes, loopdims, pinned):
self.parent = parent
# core shapes
self.ishapes = ishapes
self.oshapes = oshapes
# looping dimension
self.loopdims = loopdims
self.loopn = reduce(operator.mul, loopdims, 1)
# flags
self.pinned = pinned
self.output_shapes = [loopdims + s for s in oshapes]
def __str__(self):
import pprint
attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned'
values = [(k, getattr(self, k)) for k in attrs]
return pprint.pformat(dict(values))
class GeneralizedUFunc(object):
def __init__(self, kernelmap, engine):
self.kernelmap = kernelmap
self.engine = engine
self.max_blocksize = 2 ** 30
def __call__(self, *args, **kws):
callsteps = self._call_steps(self.engine.nin, self.engine.nout,
args, kws)
indtypes, schedule, outdtypes, kernel = self._schedule(
callsteps.inputs, callsteps.outputs)
callsteps.adjust_input_types(indtypes)
outputs = callsteps.prepare_outputs(schedule, outdtypes)
inputs = callsteps.prepare_inputs()
parameters = self._broadcast(schedule, inputs, outputs)
callsteps.launch_kernel(kernel, schedule.loopn, parameters)
return callsteps.post_process_outputs(outputs)
def _schedule(self, inputs, outs):
input_shapes = [a.shape for a in inputs]
schedule = self.engine.schedule(input_shapes)
# find kernel
indtypes = tuple(i.dtype for i in inputs)
try:
outdtypes, kernel = self.kernelmap[indtypes]
except KeyError:
# No exact match, then use the first compatible.
# This does not match the numpy dispatching exactly.
# Later, we may just jit a new version for the missing signature.
indtypes = self._search_matching_signature(indtypes)
# Select kernel
outdtypes, kernel = self.kernelmap[indtypes]
# check output
for sched_shape, out in zip(schedule.output_shapes, outs):
if out is not None and sched_shape != out.shape:
raise ValueError('output shape mismatch')
return indtypes, schedule, outdtypes, kernel
def _search_matching_signature(self, idtypes):
"""
Given the input types in `idtypes`, return a compatible sequence of
types that is defined in `kernelmap`.
Note: Ordering is guaranteed by `kernelmap` being a OrderedDict
"""
for sig in self.kernelmap.keys():
if all(np.can_cast(actual, desired)
for actual, desired in zip(sig, idtypes)):
return sig
else:
raise TypeError("no matching signature")
def _broadcast(self, schedule, params, retvals):
assert schedule.loopn > 0, "zero looping dimension"
odim = 1 if not schedule.loopdims else schedule.loopn
newparams = []
for p, cs in zip(params, schedule.ishapes):
if not cs and p.size == 1:
# Broadcast scalar input
devary = self._broadcast_scalar_input(p, odim)
newparams.append(devary)
else:
# Broadcast vector input
newparams.append(self._broadcast_array(p, odim, cs))
newretvals = []
for retval, oshape in zip(retvals, schedule.oshapes):
newretvals.append(retval.reshape(odim, *oshape))
return tuple(newparams) + tuple(newretvals)
def _broadcast_array(self, ary, newdim, innerdim):
newshape = (newdim,) + innerdim
# No change in shape
if ary.shape == newshape:
return ary
# Creating new dimension
elif len(ary.shape) < len(newshape):
assert newshape[-len(ary.shape):] == ary.shape, \
"cannot add dim and reshape at the same time"
return self._broadcast_add_axis(ary, newshape)
# Collapsing dimension
else:
return ary.reshape(*newshape)
def _broadcast_add_axis(self, ary, newshape):
raise NotImplementedError("cannot add new axis")
def _broadcast_scalar_input(self, ary, shape):
raise NotImplementedError
class GUFuncCallSteps(metaclass=ABCMeta):
"""
Implements memory management and kernel launch operations for GUFunc calls.
One instance of this class is instantiated for each call, and the instance
is specific to the arguments given to the GUFunc call.
The base class implements the overall logic; subclasses provide
target-specific implementations of individual functions.
"""
# The base class uses these slots; subclasses may provide additional slots.
__slots__ = [
'outputs',
'inputs',
'_copy_result_to_host',
]
@abstractmethod
def launch_kernel(self, kernel, nelem, args):
"""Implement the kernel launch"""
@abstractmethod
def is_device_array(self, obj):
"""
Return True if `obj` is a device array for this target, False
otherwise.
"""
@abstractmethod
def as_device_array(self, obj):
"""
Return `obj` as a device array on this target.
May return `obj` directly if it is already on the target.
"""
@abstractmethod
def to_device(self, hostary):
"""
Copy `hostary` to the device and return the device array.
"""
@abstractmethod
def allocate_device_array(self, shape, dtype):
"""
Allocate a new uninitialized device array with the given shape and
dtype.
"""
def __init__(self, nin, nout, args, kwargs):
outputs = kwargs.get('out')
# Ensure the user has passed a correct number of arguments
if outputs is None and len(args) not in (nin, (nin + nout)):
def pos_argn(n):
return f'{n} positional argument{"s" * (n != 1)}'
msg = (f'This gufunc accepts {pos_argn(nin)} (when providing '
f'input only) or {pos_argn(nin + nout)} (when providing '
f'input and output). Got {pos_argn(len(args))}.')
raise TypeError(msg)
if outputs is not None and len(args) > nin:
raise ValueError("cannot specify argument 'out' as both positional "
"and keyword")
else:
# If the user did not pass outputs either in the out kwarg or as
# positional arguments, then we need to generate an initial list of
# "placeholder" outputs using None as a sentry value
outputs = [outputs] * nout
# Ensure all output device arrays are Numba device arrays - for
# example, any output passed in that supports the CUDA Array Interface
# is converted to a Numba CUDA device array; others are left untouched.
all_user_outputs_are_host = True
self.outputs = []
for output in outputs:
if self.is_device_array(output):
self.outputs.append(self.as_device_array(output))
all_user_outputs_are_host = False
else:
self.outputs.append(output)
all_host_arrays = not any([self.is_device_array(a) for a in args])
# - If any of the arguments are device arrays, we leave the output on
# the device.
self._copy_result_to_host = (all_host_arrays and
all_user_outputs_are_host)
# Normalize arguments - ensure they are either device- or host-side
# arrays (as opposed to lists, tuples, etc).
def normalize_arg(a):
if self.is_device_array(a):
convert = self.as_device_array
else:
convert = np.asarray
return convert(a)
normalized_args = [normalize_arg(a) for a in args]
self.inputs = normalized_args[:nin]
# Check if there are extra arguments for outputs.
unused_inputs = normalized_args[nin:]
if unused_inputs:
self.outputs = unused_inputs
def adjust_input_types(self, indtypes):
"""
Attempt to cast the inputs to the required types if necessary
and if they are not device arrays.
Side effect: Only affects the elements of `inputs` that require
a type cast.
"""
for i, (ity, val) in enumerate(zip(indtypes, self.inputs)):
if ity != val.dtype:
if not hasattr(val, 'astype'):
msg = ("compatible signature is possible by casting but "
"{0} does not support .astype()").format(type(val))
raise TypeError(msg)
# Cast types
self.inputs[i] = val.astype(ity)
def prepare_outputs(self, schedule, outdtypes):
"""
Returns a list of output parameters that all reside on the target
device.
Outputs that were passed-in to the GUFunc are used if they reside on the
device; other outputs are allocated as necessary.
"""
outputs = []
for shape, dtype, output in zip(schedule.output_shapes, outdtypes,
self.outputs):
if output is None or self._copy_result_to_host:
output = self.allocate_device_array(shape, dtype)
outputs.append(output)
return outputs
def prepare_inputs(self):
"""
Returns a list of input parameters that all reside on the target device.
"""
def ensure_device(parameter):
if self.is_device_array(parameter):
convert = self.as_device_array
else:
convert = self.to_device
return convert(parameter)
return [ensure_device(p) for p in self.inputs]
def post_process_outputs(self, outputs):
"""
Moves the given output(s) to the host if necessary.
Returns a single value (e.g. an array) if there was one output, or a
tuple of arrays if there were multiple. Although this feels a little
jarring, it is consistent with the behavior of GUFuncs in general.
"""
if self._copy_result_to_host:
outputs = [self.to_host(output, self_output)
for output, self_output in zip(outputs, self.outputs)]
elif self.outputs[0] is not None:
outputs = self.outputs
if len(outputs) == 1:
return outputs[0]
else:
return tuple(outputs)

File diff suppressed because it is too large

View File

@@ -0,0 +1,59 @@
import numbers
from numba.core.errors import LoweringError
class KernelRuntimeError(RuntimeError):
def __init__(self, msg, tid=None, ctaid=None):
self.tid = tid
self.ctaid = ctaid
self.msg = msg
t = ("An exception was raised in thread=%s block=%s\n"
"\t%s")
msg = t % (self.tid, self.ctaid, self.msg)
super(KernelRuntimeError, self).__init__(msg)
class CudaLoweringError(LoweringError):
pass
_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/"
"kernels.html#kernel-invocation")
missing_launch_config_msg = """
Kernel launch configuration was not specified. Use the syntax:
kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn)
See {} for help.
""".format(_launch_help_url)
def normalize_kernel_dimensions(griddim, blockdim):
"""
Normalize and validate the user-supplied kernel dimensions.
"""
def check_dim(dim, name):
if not isinstance(dim, (tuple, list)):
dim = [dim]
else:
dim = list(dim)
if len(dim) > 3:
raise ValueError('%s must be a sequence of 1, 2 or 3 integers, '
'got %r' % (name, dim))
for v in dim:
if not isinstance(v, numbers.Integral):
raise TypeError('%s must be a sequence of integers, got %r'
% (name, dim))
while len(dim) < 3:
dim.append(1)
return tuple(dim)
if None in (griddim, blockdim):
raise ValueError(missing_launch_config_msg)
griddim = check_dim(griddim, 'griddim')
blockdim = check_dim(blockdim, 'blockdim')
return griddim, blockdim
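# For example, scalars are promoted to sequences and padded to 3-tuples:
#
#     >>> normalize_kernel_dimensions(4, 128)
#     ((4, 1, 1), (128, 1, 1))
#     >>> normalize_kernel_dimensions((2, 3), (8, 8, 4))
#     ((2, 3, 1), (8, 8, 4))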

View File

@@ -0,0 +1,7 @@
"""
Added for symmetry with the core API
"""
from numba.core.extending import intrinsic as _intrinsic
intrinsic = _intrinsic(target='cuda')

View File

@@ -0,0 +1,13 @@
def initialize_all():
# Import models to register them with the data model manager
import numba.cuda.models # noqa: F401
from numba.cuda.decorators import jit
from numba.cuda.dispatcher import CUDADispatcher
from numba.core.target_extension import (target_registry,
dispatcher_registry,
jit_registry)
cuda_target = target_registry["cuda"]
jit_registry[cuda_target] = jit
dispatcher_registry[cuda_target] = CUDADispatcher

View File

@@ -0,0 +1,77 @@
from .decorators import jit
import numba
@jit(device=True)
def all_sync(mask, predicate):
"""
If for all threads in the masked warp the predicate is true, then
a non-zero value is returned, otherwise 0 is returned.
"""
return numba.cuda.vote_sync_intrinsic(mask, 0, predicate)[1]
@jit(device=True)
def any_sync(mask, predicate):
"""
If for any thread in the masked warp the predicate is true, then
a non-zero value is returned, otherwise 0 is returned.
"""
return numba.cuda.vote_sync_intrinsic(mask, 1, predicate)[1]
@jit(device=True)
def eq_sync(mask, predicate):
"""
If for all threads in the masked warp the boolean predicate is the same,
then a non-zero value is returned, otherwise 0 is returned.
"""
return numba.cuda.vote_sync_intrinsic(mask, 2, predicate)[1]
@jit(device=True)
def ballot_sync(mask, predicate):
"""
Returns a mask of all threads in the warp whose predicate is true,
and are within the given mask.
"""
return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
@jit(device=True)
def shfl_sync(mask, value, src_lane):
"""
Shuffles value across the masked warp and returns the value
from src_lane. If this is outside the warp, then the
given value is returned.
"""
return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
@jit(device=True)
def shfl_up_sync(mask, value, delta):
"""
Shuffles value across the masked warp and returns the value
from (laneid - delta). If this is outside the warp, then the
given value is returned.
"""
return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
@jit(device=True)
def shfl_down_sync(mask, value, delta):
"""
Shuffles value across the masked warp and returns the value
from (laneid + delta). If this is outside the warp, then the
given value is returned.
"""
return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
@jit(device=True)
def shfl_xor_sync(mask, value, lane_mask):
"""
Shuffles value across the masked warp and returns the value
from (laneid ^ lane_mask).
"""
return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
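# A butterfly warp-sum sketch built on shfl_xor_sync (assumes all 32 lanes
# of the warp are active; `warp_sum` is a hypothetical user kernel, not
# part of this module):
#
#     from numba import cuda
#
#     @cuda.jit
#     def warp_sum(vals, out):
#         v = vals[cuda.grid(1)]
#         offset = 16
#         while offset > 0:
#             v += cuda.shfl_xor_sync(0xffffffff, v, offset)
#             offset //= 2
#         if cuda.laneid == 0:
#             out[cuda.blockIdx.x] = v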

View File

@@ -0,0 +1,198 @@
from llvmlite import ir
from numba import cuda, types
from numba.core import cgutils
from numba.core.errors import RequireLiteralValue, NumbaValueError
from numba.core.typing import signature
from numba.core.extending import overload_attribute
from numba.cuda import nvvmutils
from numba.cuda.extending import intrinsic
#-------------------------------------------------------------------------------
# Grid functions
def _type_grid_function(ndim):
val = ndim.literal_value
if val == 1:
restype = types.int64
elif val in (2, 3):
restype = types.UniTuple(types.int64, val)
else:
        raise NumbaValueError('argument can only be 1, 2 or 3')
return signature(restype, types.int32)
@intrinsic
def grid(typingctx, ndim):
'''grid(ndim)
Return the absolute position of the current thread in the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel. If *ndim* is 1, a single integer is returned.
If *ndim* is 2 or 3, a tuple of the given number of integers is returned.
Computation of the first integer is as follows::
cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
'''
if not isinstance(ndim, types.IntegerLiteral):
raise RequireLiteralValue(ndim)
sig = _type_grid_function(ndim)
def codegen(context, builder, sig, args):
restype = sig.return_type
if restype == types.int64:
return nvvmutils.get_global_id(builder, dim=1)
elif isinstance(restype, types.UniTuple):
ids = nvvmutils.get_global_id(builder, dim=restype.count)
return cgutils.pack_array(builder, ids)
return sig, codegen
@intrinsic
def gridsize(typingctx, ndim):
'''gridsize(ndim)
Return the absolute size (or shape) in threads of the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel. If *ndim* is 1, a single integer is returned.
If *ndim* is 2 or 3, a tuple of the given number of integers is returned.
Computation of the first integer is as follows::
cuda.blockDim.x * cuda.gridDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
'''
if not isinstance(ndim, types.IntegerLiteral):
raise RequireLiteralValue(ndim)
sig = _type_grid_function(ndim)
def _nthreads_for_dim(builder, dim):
i64 = ir.IntType(64)
ntid = nvvmutils.call_sreg(builder, f"ntid.{dim}")
nctaid = nvvmutils.call_sreg(builder, f"nctaid.{dim}")
return builder.mul(builder.sext(ntid, i64), builder.sext(nctaid, i64))
def codegen(context, builder, sig, args):
restype = sig.return_type
nx = _nthreads_for_dim(builder, 'x')
if restype == types.int64:
return nx
elif isinstance(restype, types.UniTuple):
ny = _nthreads_for_dim(builder, 'y')
if restype.count == 2:
return cgutils.pack_array(builder, (nx, ny))
elif restype.count == 3:
nz = _nthreads_for_dim(builder, 'z')
return cgutils.pack_array(builder, (nx, ny, nz))
return sig, codegen
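# Editor's usage sketch (assumed example, not part of the original file):
# gridsize() as the step of a grid-stride loop, so one fixed launch
# configuration covers inputs of any length.
@cuda.jit
def _example_grid_stride_double(x):
    start = cuda.grid(1)
    step = cuda.gridsize(1)
    for i in range(start, x.size, step):
        x[i] *= 2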
@intrinsic
def _warpsize(typingctx):
sig = signature(types.int32)
def codegen(context, builder, sig, args):
return nvvmutils.call_sreg(builder, 'warpsize')
return sig, codegen
@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
def cuda_warpsize(mod):
'''
The size of a warp. All architectures implemented to date have a warp size
of 32.
'''
def get(mod):
return _warpsize()
return get
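# Editor's usage sketch (assumed example): cuda.warpsize reads like any
# other module attribute inside a kernel; deriving the lane index from it
# matches the dedicated laneid register on current hardware.
@cuda.jit
def _example_lane_index(out):
    tid = cuda.grid(1)
    if tid < out.size:
        out[tid] = tid % cuda.warpsize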
#-------------------------------------------------------------------------------
# syncthreads
@intrinsic
def syncthreads(typingctx):
'''
Synchronize all threads in the same thread block. This function implements
the same pattern as barriers in traditional multi-threaded programming: it
waits until all threads in the block call it, at which point it
returns control to all its callers.
'''
sig = signature(types.none)
def codegen(context, builder, sig, args):
fname = 'llvm.nvvm.barrier0'
lmod = builder.module
fnty = ir.FunctionType(ir.VoidType(), ())
sync = cgutils.get_or_insert_function(lmod, fnty, fname)
builder.call(sync, ())
return context.get_dummy_value()
return sig, codegen
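# Editor's usage sketch (assumed example, not part of the original file):
# the shared-memory pattern that syncthreads() exists for -- each thread
# writes one slot, the barrier makes every write visible block-wide, then
# threads read neighbouring slots. Assumes a one-block launch of exactly
# 128 threads.
@cuda.jit
def _example_reverse_block(x):
    sm = cuda.shared.array(128, dtype=types.int32)
    tid = cuda.threadIdx.x
    sm[tid] = x[tid]
    cuda.syncthreads()
    x[tid] = sm[cuda.blockDim.x - 1 - tid]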
def _syncthreads_predicate(typingctx, predicate, fname):
if not isinstance(predicate, types.Integer):
return None
sig = signature(types.i4, types.i4)
def codegen(context, builder, sig, args):
fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32),))
sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
return builder.call(sync, args)
return sig, codegen
@intrinsic
def syncthreads_count(typingctx, predicate):
'''
syncthreads_count(predicate)
An extension to numba.cuda.syncthreads where the return value is a count
of the threads where predicate is true.
'''
fname = 'llvm.nvvm.barrier0.popc'
return _syncthreads_predicate(typingctx, predicate, fname)
@intrinsic
def syncthreads_and(typingctx, predicate):
'''
syncthreads_and(predicate)
An extension to numba.cuda.syncthreads where 1 is returned if predicate is
true for all threads or 0 otherwise.
'''
fname = 'llvm.nvvm.barrier0.and'
return _syncthreads_predicate(typingctx, predicate, fname)
@intrinsic
def syncthreads_or(typingctx, predicate):
'''
syncthreads_or(predicate)
An extension to numba.cuda.syncthreads where 1 is returned if predicate is
true for any thread or 0 otherwise.
'''
fname = 'llvm.nvvm.barrier0.or'
return _syncthreads_predicate(typingctx, predicate, fname)
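# Editor's usage sketch (assumed example, not part of the original file):
# the predicated barriers in use. syncthreads_count tallies how many
# threads in the block saw a true predicate; syncthreads_and and
# syncthreads_or reduce it to all / any. Assumes a launch of exactly one
# block of x.size threads, since every thread must reach the barrier.
@cuda.jit
def _example_count_positive(x, out):
    i = cuda.grid(1)
    pred = 1 if x[i] > 0 else 0
    n = cuda.syncthreads_count(pred)
    if i == 0:
        out[0] = n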

View File

@@ -0,0 +1,262 @@
"""
A library written in CUDA Python for generating reduction kernels
"""
from numba.np.numpy_support import from_dtype
_WARPSIZE = 32
_NUMWARPS = 4
def _gpu_reduce_factory(fn, nbtype):
from numba import cuda
reduce_op = cuda.jit(device=True)(fn)
inner_sm_size = _WARPSIZE + 1 # plus one to avoid shared memory bank conflicts
max_blocksize = _NUMWARPS * _WARPSIZE
@cuda.jit(device=True)
def inner_warp_reduction(sm_partials, init):
"""
Compute reduction within a single warp
"""
tid = cuda.threadIdx.x
warpid = tid // _WARPSIZE
laneid = tid % _WARPSIZE
sm_this = sm_partials[warpid, :]
sm_this[laneid] = init
cuda.syncwarp()
width = _WARPSIZE // 2
while width:
if laneid < width:
old = sm_this[laneid]
sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
cuda.syncwarp()
width //= 2
@cuda.jit(device=True)
def device_reduce_full_block(arr, partials, sm_partials):
"""
Partially reduce `arr` into `partials` using `sm_partials` as working
space. The algorithm goes like:
array chunks of 128:  | 0 | 128 | 256 | 384 | 512 |
block-0:              | x |     |     |  x  |     |
block-1:              |   |  x  |     |     |  x  |
block-2:              |   |     |  x  |     |     |
The array is divided into chunks of 128 (the size of a threadblock).
The threadblocks consume the chunks in round-robin scheduling.
First, a threadblock loads a chunk into temp memory. Then, all
subsequent chunks are combined into the temp memory.
Once all chunks are processed, an inner-block reduction is performed
on the temp memory, so that there is just one scalar result
per block. The result from each block is stored to `partials` at
its dedicated slot.
"""
tid = cuda.threadIdx.x
blkid = cuda.blockIdx.x
blksz = cuda.blockDim.x
gridsz = cuda.gridDim.x
# block strided loop to compute the reduction
start = tid + blksz * blkid
stop = arr.size
step = blksz * gridsz
# load first value
tmp = arr[start]
# loop over all values in block-stride
for i in range(start + step, stop, step):
tmp = reduce_op(tmp, arr[i])
cuda.syncthreads()
# inner-warp reduction
inner_warp_reduction(sm_partials, tmp)
cuda.syncthreads()
# at this point, only the first slot for each warp in sm_partials
# is valid.
# finish up block reduction
# warning: this is assuming 4 warps.
# assert numwarps == 4
if tid < 2:
sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
sm_partials[tid + 2, 0])
cuda.syncwarp()
if tid == 0:
partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
@cuda.jit(device=True)
def device_reduce_partial_block(arr, partials, sm_partials):
"""
This computes a reduction on `arr`.
This device function must be used by one threadblock only.
The blocksize must match `arr.size` and must not be greater than 128.
"""
tid = cuda.threadIdx.x
blkid = cuda.blockIdx.x
blksz = cuda.blockDim.x
warpid = tid // _WARPSIZE
laneid = tid % _WARPSIZE
size = arr.size
# load first value
value = arr[tid]
sm_partials[warpid, laneid] = value
cuda.syncthreads()
if (warpid + 1) * _WARPSIZE < size:
# fully populated warps
inner_warp_reduction(sm_partials, value)
else:
# partially populated warps
# NOTE: this uses a very inefficient sequential algorithm
if laneid == 0:
sm_this = sm_partials[warpid, :]
base = warpid * _WARPSIZE
for i in range(1, size - base):
sm_this[0] = reduce_op(sm_this[0], sm_this[i])
cuda.syncthreads()
# finish up
if tid == 0:
num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE
result = sm_partials[0, 0]
for i in range(1, num_active_warps):
result = reduce_op(result, sm_partials[i, 0])
partials[blkid] = result
def gpu_reduce_block_strided(arr, partials, init, use_init):
"""
Perform a reduction on *arr*, writing the partial reduction results
into *partials*. The length of *partials* is determined by the
number of threadblocks. The initial value is set with *init*.
Launch config:
The blocksize must be a multiple of the warpsize, and is limited to 4 warps.
"""
tid = cuda.threadIdx.x
sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
dtype=nbtype)
if cuda.blockDim.x == max_blocksize:
device_reduce_full_block(arr, partials, sm_partials)
else:
device_reduce_partial_block(arr, partials, sm_partials)
# deal with the initializer
if use_init and tid == 0 and cuda.blockIdx.x == 0:
partials[0] = reduce_op(partials[0], init)
return cuda.jit(gpu_reduce_block_strided)
class Reduce(object):
"""Create a reduction object that reduces values using a given binary
function. The binary function is compiled once and cached inside this
object. Keeping this object alive will prevent re-compilation.
"""
_cache = {}
def __init__(self, functor):
"""
:param functor: A function implementing a binary operation for
reduction. It will be compiled as a CUDA device
function using ``cuda.jit(device=True)``.
"""
self._functor = functor
def _compile(self, dtype):
key = self._functor, dtype
if key in self._cache:
kernel = self._cache[key]
else:
kernel = _gpu_reduce_factory(self._functor, from_dtype(dtype))
self._cache[key] = kernel
return kernel
def __call__(self, arr, size=None, res=None, init=0, stream=0):
"""Performs a full reduction.
:param arr: A host or device array.
:param size: Optional integer specifying the number of elements in
``arr`` to reduce. If this parameter is not specified, the
entire array is reduced.
:param res: Optional device array into which to write the reduction
result. The result is written into the first element of
this array. If this parameter is specified, then no
communication of the reduction output takes place from the
device to the host.
:param init: Optional initial value for the reduction, the type of which
must match ``arr.dtype``.
:param stream: Optional CUDA stream in which to perform the reduction.
If no stream is specified, the default stream of 0 is
used.
:return: If ``res`` is specified, ``None`` is returned. Otherwise, the
result of the reduction is returned.
"""
from numba import cuda
# ensure 1d array
if arr.ndim != 1:
raise TypeError("only support 1D array")
# adjust array size
if size is not None:
arr = arr[:size]
init = arr.dtype.type(init) # ensure the right type
# return `init` if `arr` is empty
if arr.size < 1:
return init
kernel = self._compile(arr.dtype)
# Perform the reduction on the GPU
blocksize = _NUMWARPS * _WARPSIZE
size_full = (arr.size // blocksize) * blocksize
size_partial = arr.size - size_full
full_blockct = min(size_full // blocksize, _WARPSIZE * 2)
# allocate size of partials array
partials_size = full_blockct
if size_partial:
partials_size += 1
partials = cuda.device_array(shape=partials_size, dtype=arr.dtype)
if size_full:
# kernel for the fully populated threadblocks
kernel[full_blockct, blocksize, stream](arr[:size_full],
partials[:full_blockct],
init,
True)
if size_partial:
# kernel for partially populated threadblocks
kernel[1, size_partial, stream](arr[size_full:],
partials[full_blockct:],
init,
not full_blockct)
if partials.size > 1:
# finish up
kernel[1, partials_size, stream](partials, partials, init, False)
# handle return value
if res is not None:
res[:1].copy_to_device(partials[:1], stream=stream)
return
else:
return partials[0]
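# Editor's usage sketch (not part of the original file): Reduce is exposed
# publicly as numba.cuda.Reduce. The functor is compiled once per dtype and
# cached on the instance, so reusing one Reduce object avoids recompilation.
if __name__ == '__main__':
    import numpy as np
    from numba import cuda
    sum_reduce = cuda.Reduce(lambda a, b: a + b)
    A = np.arange(1024, dtype=np.float64)
    print(sum_reduce(A), A.sum())  # should agree up to rounding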

View File

@@ -0,0 +1,65 @@
from numba import cuda
from numba.cuda.cudadrv.driver import driver
import math
from numba.np import numpy_support as nps
def transpose(a, b=None):
"""Compute the transpose of 'a' and store it into 'b', if given,
and return it. If 'b' is not given, allocate a new array
and return that.
This implements the algorithm documented in
http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/
:param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
the device its stream will be used to perform the transpose (and to copy
`b` to the device if necessary).
"""
# prefer `a`'s stream if it has one
stream = getattr(a, 'stream', 0)
if b is None:
cols, rows = a.shape
strides = a.dtype.itemsize * cols, a.dtype.itemsize
b = cuda.cudadrv.devicearray.DeviceNDArray(
(rows, cols),
strides,
dtype=a.dtype,
stream=stream)
dt = nps.from_dtype(a.dtype)
tpb = driver.get_device().MAX_THREADS_PER_BLOCK
# we need to factor the available threads into x and y axes
tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
tile_height = int(tpb / tile_width)
tile_shape = (tile_height, tile_width + 1)
@cuda.jit
def kernel(input, output):
tile = cuda.shared.array(shape=tile_shape, dtype=dt)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x * cuda.blockDim.x
by = cuda.blockIdx.y * cuda.blockDim.y
x = by + tx
y = bx + ty
if by + ty < input.shape[0] and bx + tx < input.shape[1]:
tile[ty, tx] = input[by + ty, bx + tx]
cuda.syncthreads()
if y < output.shape[0] and x < output.shape[1]:
output[y, x] = tile[tx, ty]
# one block per tile, plus one for remainders
blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
# one thread per tile element
threads = tile_height, tile_width
kernel[blocks, threads, stream](a, b)
return b
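# Editor's usage sketch (assumed example, not part of the original file):
# round-trip a small host matrix through the device transpose defined above.
if __name__ == '__main__':
    import numpy as np
    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    d_at = transpose(cuda.to_device(a))  # allocates the (4, 3) output
    assert (d_at.copy_to_host() == a.T).all()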

File diff suppressed because it is too large

View File

@@ -0,0 +1,17 @@
from numba.cuda import libdevice, libdevicefuncs
from numba.core.typing.templates import ConcreteTemplate, Registry
registry = Registry()
register_global = registry.register_global
def libdevice_declare(func, retty, args):
class Libdevice_function(ConcreteTemplate):
cases = [libdevicefuncs.create_signature(retty, args)]
pyfunc = getattr(libdevice, func[5:])
register_global(pyfunc)(Libdevice_function)
for func, (retty, args) in libdevicefuncs.functions.items():
libdevice_declare(func, retty, args)
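# Editor's usage sketch (assumed example, not part of the original file):
# once registered, each libdevice function is callable from kernels with the
# leading '__nv_' stripped, e.g. __nv_fast_sinf -> libdevice.fast_sinf.
from numba import cuda

@cuda.jit
def _example_fast_sin(x, out):
    i = cuda.grid(1)
    if i < x.size:
        out[i] = libdevice.fast_sinf(x[i])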

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff