Videre

2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions
--- a/linedance-app/venv/lib/python3.12/site-packages/numba/cuda/api.py
+++ b/linedance-app/venv/lib/python3.12/site-packages/numba/cuda/api.py
@@ -0,0 +1,525 @@
+"""
+API that are reported to numba.cuda
+"""
+
+
+import contextlib
+import os
+
+import numpy as np
+
+from .cudadrv import devicearray, devices, driver
+from numba.core import config
+from numba.cuda.api_util import prepare_shape_strides_dtype
+
+# NDarray device helper
+
+require_context = devices.require_context
+current_context = devices.get_context
+gpus = devices.gpus
+
+
+@require_context
+def from_cuda_array_interface(desc, owner=None, sync=True):
+    """Create a DeviceNDArray from a cuda-array-interface description.
+    The ``owner`` is the owner of the underlying memory.
+    The resulting DeviceNDArray will acquire a reference from it.
+
+    If ``sync`` is ``True``, then the imported stream (if present) will be
+    synchronized.
+    """
+    version = desc.get('version')
+    # Mask introduced in version 1
+    if 1 <= version:
+        mask = desc.get('mask')
+        # Would ideally be better to detect if the mask is all valid
+        if mask is not None:
+            raise NotImplementedError('Masked arrays are not supported')
+
+    shape = desc['shape']
+    strides = desc.get('strides')
+    dtype = np.dtype(desc['typestr'])
+
+    shape, strides, dtype = prepare_shape_strides_dtype(
+        shape, strides, dtype, order='C')
+    size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
+
+    devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
+    data = driver.MemoryPointer(
+        current_context(), devptr, size=size, owner=owner)
+    stream_ptr = desc.get('stream', None)
+    if stream_ptr is not None:
+        stream = external_stream(stream_ptr)
+        if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
+            stream.synchronize()
+    else:
+        stream = 0 # No "Numba default stream", not the CUDA default stream
+    da = devicearray.DeviceNDArray(shape=shape, strides=strides,
+                                   dtype=dtype, gpu_data=data,
+                                   stream=stream)
+    return da
+
+
+def as_cuda_array(obj, sync=True):
+    """Create a DeviceNDArray from any object that implements
+    the :ref:`cuda array interface <cuda-array-interface>`.
+
+    A view of the underlying GPU buffer is created.  No copying of the data
+    is done.  The resulting DeviceNDArray will acquire a reference from `obj`.
+
+    If ``sync`` is ``True``, then the imported stream (if present) will be
+    synchronized.
+    """
+    if not is_cuda_array(obj):
+        raise TypeError("*obj* doesn't implement the cuda array interface.")
+    else:
+        return from_cuda_array_interface(obj.__cuda_array_interface__,
+                                         owner=obj, sync=sync)
+
+
+def is_cuda_array(obj):
+    """Test if the object has defined the `__cuda_array_interface__` attribute.
+
+    Does not verify the validity of the interface.
+    """
+    return hasattr(obj, '__cuda_array_interface__')
+
+
+def is_float16_supported():
+    """Whether 16-bit floats are supported.
+
+    float16 is always supported in current versions of Numba - returns True.
+    """
+    return True
+
+
+@require_context
+def to_device(obj, stream=0, copy=True, to=None):
+    """to_device(obj, stream=0, copy=True, to=None)
+
+    Allocate and transfer a numpy ndarray or structured scalar to the device.
+
+    To copy host->device a numpy array::
+
+        ary = np.arange(10)
+        d_ary = cuda.to_device(ary)
+
+    To enqueue the transfer to a stream::
+
+        stream = cuda.stream()
+        d_ary = cuda.to_device(ary, stream=stream)
+
+    The resulting ``d_ary`` is a ``DeviceNDArray``.
+
+    To copy device->host::
+
+        hary = d_ary.copy_to_host()
+
+    To copy device->host to an existing array::
+
+        ary = np.empty(shape=d_ary.shape, dtype=d_ary.dtype)
+        d_ary.copy_to_host(ary)
+
+    To enqueue the transfer to a stream::
+
+        hary = d_ary.copy_to_host(stream=stream)
+    """
+    if to is None:
+        to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
+                                          user_explicit=True)
+        return to
+    if copy:
+        to.copy_to_device(obj, stream=stream)
+    return to
+
+
+@require_context
+def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
+    """device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)
+
+    Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
+    """
+    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
+                                                        order)
+    return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
+                                     stream=stream)
+
+
+@require_context
+def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
+                  attach_global=True):
+    """managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
+                     attach_global=True)
+
+    Allocate a np.ndarray with a buffer that is managed.
+    Similar to np.empty().
+
+    Managed memory is supported on Linux / x86 and PowerPC, and is considered
+    experimental on Windows and Linux / AArch64.
+
+    :param attach_global: A flag indicating whether to attach globally. Global
+                          attachment implies that the memory is accessible from
+                          any stream on any device. If ``False``, attachment is
+                          *host*, and memory is only accessible by devices
+                          with Compute Capability 6.0 and later.
+    """
+    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
+                                                        order)
+    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
+    buffer = current_context().memallocmanaged(bytesize,
+                                               attach_global=attach_global)
+    npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
+                       buffer=buffer)
+    managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
+    managedview.device_setup(buffer, stream=stream)
+    return managedview
+
+
+@require_context
+def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
+    """pinned_array(shape, dtype=np.float64, strides=None, order='C')
+
+    Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
+    (pagelocked).  Similar to :func:`np.empty() <numpy.empty>`.
+    """
+    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
+                                                        order)
+    bytesize = driver.memory_size_from_info(shape, strides,
+                                            dtype.itemsize)
+    buffer = current_context().memhostalloc(bytesize)
+    return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
+                      buffer=buffer)
+
+
+@require_context
+def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
+                 portable=False, wc=False):
+    """mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
+                    portable=False, wc=False)
+
+    Allocate a mapped ndarray with a buffer that is pinned and mapped on
+    to the device. Similar to np.empty()
+
+    :param portable: a boolean flag to allow the allocated device memory to be
+              usable in multiple devices.
+    :param wc: a boolean flag to enable writecombined allocation which is faster
+        to write by the host and to read by the device, but slower to
+        write by the host and slower to write by the device.
+    """
+    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
+                                                        order)
+    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
+    buffer = current_context().memhostalloc(bytesize, mapped=True)
+    npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
+                       buffer=buffer)
+    mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
+    mappedview.device_setup(buffer, stream=stream)
+    return mappedview
+
+
+@contextlib.contextmanager
+@require_context
+def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
+    """
+    A context manager that opens a IPC *handle* (*CUipcMemHandle*) that is
+    represented as a sequence of bytes (e.g. *bytes*, tuple of int)
+    and represent it as an array of the given *shape*, *strides* and *dtype*.
+    The *strides* can be omitted.  In that case, it is assumed to be a 1D
+    C contiguous array.
+
+    Yields a device array.
+
+    The IPC handle is closed automatically when context manager exits.
+    """
+    dtype = np.dtype(dtype)
+    # compute size
+    size = np.prod(shape) * dtype.itemsize
+    # manually recreate the IPC mem handle
+    if driver.USE_NV_BINDING:
+        driver_handle = driver.binding.CUipcMemHandle()
+        driver_handle.reserved = handle
+    else:
+        driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle)
+    # use *IpcHandle* to open the IPC memory
+    ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
+    yield ipchandle.open_array(current_context(), shape=shape,
+                               strides=strides, dtype=dtype)
+    ipchandle.close()
+
+
+def synchronize():
+    "Synchronize the current context."
+    return current_context().synchronize()
+
+
+def _contiguous_strides_like_array(ary):
+    """
+    Given an array, compute strides for a new contiguous array of the same
+    shape.
+    """
+    # Don't recompute strides if the default strides will be sufficient to
+    # create a contiguous array.
+    if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
+        return None
+
+    # Otherwise, we need to compute new strides using an algorithm adapted from
+    # NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
+    # core/src/multiarray/ctors.c. We permute the strides in ascending order
+    # then compute the stride for the dimensions with the same permutation.
+
+    # Stride permutation. E.g. a stride array (4, -2, 12) becomes
+    # [(1, -2), (0, 4), (2, 12)]
+    strideperm = [ x for x in enumerate(ary.strides) ]
+    strideperm.sort(key=lambda x: x[1])
+
+    # Compute new strides using permutation
+    strides = [0] * len(ary.strides)
+    stride = ary.dtype.itemsize
+    for i_perm, _ in strideperm:
+        strides[i_perm] = stride
+        stride *= ary.shape[i_perm]
+    return tuple(strides)
+
+
+def _order_like_array(ary):
+    if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
+        return 'F'
+    else:
+        return 'C'
+
+
+def device_array_like(ary, stream=0):
+    """
+    Call :func:`device_array() <numba.cuda.device_array>` with information from
+    the array.
+    """
+    strides = _contiguous_strides_like_array(ary)
+    order = _order_like_array(ary)
+    return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
+                        order=order, stream=stream)
+
+
+def mapped_array_like(ary, stream=0, portable=False, wc=False):
+    """
+    Call :func:`mapped_array() <numba.cuda.mapped_array>` with the information
+    from the array.
+    """
+    strides = _contiguous_strides_like_array(ary)
+    order = _order_like_array(ary)
+    return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
+                        order=order, stream=stream, portable=portable, wc=wc)
+
+
+def pinned_array_like(ary):
+    """
+    Call :func:`pinned_array() <numba.cuda.pinned_array>` with the information
+    from the array.
+    """
+    strides = _contiguous_strides_like_array(ary)
+    order = _order_like_array(ary)
+    return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
+                        order=order)
+
+
+# Stream helper
+@require_context
+def stream():
+    """
+    Create a CUDA stream that represents a command queue for the device.
+    """
+    return current_context().create_stream()
+
+
+@require_context
+def default_stream():
+    """
+    Get the default CUDA stream. CUDA semantics in general are that the default
+    stream is either the legacy default stream or the per-thread default stream
+    depending on which CUDA APIs are in use. In Numba, the APIs for the legacy
+    default stream are always the ones in use, but an option to use APIs for
+    the per-thread default stream may be provided in future.
+    """
+    return current_context().get_default_stream()
+
+
+@require_context
+def legacy_default_stream():
+    """
+    Get the legacy default CUDA stream.
+    """
+    return current_context().get_legacy_default_stream()
+
+
+@require_context
+def per_thread_default_stream():
+    """
+    Get the per-thread default CUDA stream.
+    """
+    return current_context().get_per_thread_default_stream()
+
+
+@require_context
+def external_stream(ptr):
+    """Create a Numba stream object for a stream allocated outside Numba.
+
+    :param ptr: Pointer to the external stream to wrap in a Numba Stream
+    :type ptr: int
+    """
+    return current_context().create_external_stream(ptr)
+
+
+# Page lock
+@require_context
+@contextlib.contextmanager
+def pinned(*arylist):
+    """A context manager for temporary pinning a sequence of host ndarrays.
+    """
+    pmlist = []
+    for ary in arylist:
+        pm = current_context().mempin(ary, driver.host_pointer(ary),
+                                      driver.host_memory_size(ary),
+                                      mapped=False)
+        pmlist.append(pm)
+    yield
+
+
+@require_context
+@contextlib.contextmanager
+def mapped(*arylist, **kws):
+    """A context manager for temporarily mapping a sequence of host ndarrays.
+    """
+    assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
+    stream = kws.get('stream', 0)
+    pmlist = []
+    devarylist = []
+    for ary in arylist:
+        pm = current_context().mempin(ary, driver.host_pointer(ary),
+                                      driver.host_memory_size(ary),
+                                      mapped=True)
+        pmlist.append(pm)
+        devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
+        devarylist.append(devary)
+    try:
+        if len(devarylist) == 1:
+            yield devarylist[0]
+        else:
+            yield devarylist
+    finally:
+        # When exiting from `with cuda.mapped(*arrs) as mapped_arrs:`, the name
+        # `mapped_arrs` stays in scope, blocking automatic unmapping based on
+        # reference count. We therefore invoke the finalizer manually.
+        for pm in pmlist:
+            pm.free()
+
+
+def event(timing=True):
+    """
+    Create a CUDA event. Timing data is only recorded by the event if it is
+    created with ``timing=True``.
+    """
+    evt = current_context().create_event(timing=timing)
+    return evt
+
+
+event_elapsed_time = driver.event_elapsed_time
+
+
+# Device selection
+
+def select_device(device_id):
+    """
+    Make the context associated with device *device_id* the current context.
+
+    Returns a Device instance.
+
+    Raises exception on error.
+    """
+    context = devices.get_context(device_id)
+    return context.device
+
+
+def get_current_device():
+    "Get current device associated with the current thread"
+    return current_context().device
+
+
+def list_devices():
+    "Return a list of all detected devices"
+    return devices.gpus
+
+
+def close():
+    """
+    Explicitly clears all contexts in the current thread, and destroys all
+    contexts if the current thread is the main thread.
+    """
+    devices.reset()
+
+
+def _auto_device(ary, stream=0, copy=True):
+    return devicearray.auto_device(ary, stream=stream, copy=copy)
+
+
+def detect():
+    """
+    Detect supported CUDA hardware and print a summary of the detected hardware.
+
+    Returns a boolean indicating whether any supported devices were detected.
+    """
+    devlist = list_devices()
+    print('Found %d CUDA devices' % len(devlist))
+    supported_count = 0
+    for dev in devlist:
+        attrs = []
+        cc = dev.compute_capability
+        kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
+        tcc = dev.TCC_DRIVER
+        fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
+        attrs += [('Compute Capability', '%d.%d' % cc)]
+        attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
+        attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
+        attrs += [('UUID', dev.uuid)]
+        attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
+        if os.name == "nt":
+            attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
+        attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
+        if cc < (3, 5):
+            support = '[NOT SUPPORTED: CC < 3.5]'
+        elif cc < (5, 0):
+            support = '[SUPPORTED (DEPRECATED)]'
+            supported_count += 1
+        else:
+            support = '[SUPPORTED]'
+            supported_count += 1
+
+        print('id %d    %20s %40s' % (dev.id, dev.name, support))
+        for key, val in attrs:
+            print('%40s: %s' % (key, val))
+
+    print('Summary:')
+    print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
+    return supported_count > 0
+
+
+@contextlib.contextmanager
+def defer_cleanup():
+    """
+    Temporarily disable memory deallocation.
+    Use this to prevent resource deallocation breaking asynchronous execution.
+
+    For example::
+
+        with defer_cleanup():
+            # all cleanup is deferred in here
+            do_speed_critical_code()
+        # cleanup can occur here
+
+    Note: this context manager can be nested.
+    """
+    with current_context().defer_cleanup():
+        yield
+
+
+profiling = require_context(driver.profiling)
+profile_start = require_context(driver.profile_start)
+profile_stop = require_context(driver.profile_stop)