2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,22 @@
from numba import runtests
from numba.core import config
if config.ENABLE_CUDASIM:
from .simulator_init import *
else:
from .device_init import *
from .device_init import _auto_device
from numba.cuda.compiler import (compile, compile_for_current_device,
compile_ptx, compile_ptx_for_current_device)
# Are we the numba.cuda built in to upstream Numba, or the out-of-tree
# NVIDIA-maintained target?
implementation = "Built-in"
def test(*args, **kwargs):
if not is_available():
raise cuda_error()
return runtests.main("numba.cuda.tests", *args, **kwargs)
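# Illustrative usage (not part of this module; requires a CUDA-capable
# machine):
#
# >>> from numba import cuda
# >>> cuda.test()    # extra args/kwargs are forwarded to runtests.main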

View File

@@ -0,0 +1,525 @@
"""
APIs that are exported to numba.cuda
"""
import contextlib
import os
import numpy as np
from .cudadrv import devicearray, devices, driver
from numba.core import config
from numba.cuda.api_util import prepare_shape_strides_dtype
# NDarray device helper
require_context = devices.require_context
current_context = devices.get_context
gpus = devices.gpus
@require_context
def from_cuda_array_interface(desc, owner=None, sync=True):
"""Create a DeviceNDArray from a cuda-array-interface description.
The ``owner`` is the owner of the underlying memory.
The resulting DeviceNDArray will acquire a reference from it.
If ``sync`` is ``True``, then the imported stream (if present) will be
synchronized.
"""
version = desc.get('version')
# Mask introduced in version 1
if 1 <= version:
mask = desc.get('mask')
# Would ideally be better to detect if the mask is all valid
if mask is not None:
raise NotImplementedError('Masked arrays are not supported')
shape = desc['shape']
strides = desc.get('strides')
dtype = np.dtype(desc['typestr'])
shape, strides, dtype = prepare_shape_strides_dtype(
shape, strides, dtype, order='C')
size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
data = driver.MemoryPointer(
current_context(), devptr, size=size, owner=owner)
stream_ptr = desc.get('stream', None)
if stream_ptr is not None:
stream = external_stream(stream_ptr)
if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
stream.synchronize()
else:
        stream = 0  # Use Numba's default stream (not the CUDA default stream)
da = devicearray.DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=data,
stream=stream)
return da
def as_cuda_array(obj, sync=True):
"""Create a DeviceNDArray from any object that implements
the :ref:`cuda array interface <cuda-array-interface>`.
A view of the underlying GPU buffer is created. No copying of the data
is done. The resulting DeviceNDArray will acquire a reference from `obj`.
If ``sync`` is ``True``, then the imported stream (if present) will be
synchronized.
"""
if not is_cuda_array(obj):
raise TypeError("*obj* doesn't implement the cuda array interface.")
else:
return from_cuda_array_interface(obj.__cuda_array_interface__,
owner=obj, sync=sync)
def is_cuda_array(obj):
"""Test if the object has defined the `__cuda_array_interface__` attribute.
Does not verify the validity of the interface.
"""
return hasattr(obj, '__cuda_array_interface__')
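# Illustrative sketch (not part of this module): any object exposing
# ``__cuda_array_interface__`` can be viewed without a copy. A DeviceNDArray
# itself exposes the interface, so it can serve as the exporting object here.
#
# >>> import numpy as np
# >>> from numba import cuda
# >>> d_ary = cuda.to_device(np.arange(4))
# >>> cuda.is_cuda_array(d_ary)
# True
# >>> view = cuda.as_cuda_array(d_ary)  # shares d_ary's buffer, no copy
# >>> view.copy_to_host()
# array([0, 1, 2, 3])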
def is_float16_supported():
"""Whether 16-bit floats are supported.
float16 is always supported in current versions of Numba - returns True.
"""
return True
@require_context
def to_device(obj, stream=0, copy=True, to=None):
"""to_device(obj, stream=0, copy=True, to=None)
    Allocate and transfer a numpy ndarray or structured scalar to the device.
    To copy a numpy array host->device::
ary = np.arange(10)
d_ary = cuda.to_device(ary)
To enqueue the transfer to a stream::
stream = cuda.stream()
d_ary = cuda.to_device(ary, stream=stream)
The resulting ``d_ary`` is a ``DeviceNDArray``.
To copy device->host::
hary = d_ary.copy_to_host()
To copy device->host to an existing array::
ary = np.empty(shape=d_ary.shape, dtype=d_ary.dtype)
d_ary.copy_to_host(ary)
To enqueue the transfer to a stream::
hary = d_ary.copy_to_host(stream=stream)
"""
if to is None:
to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
user_explicit=True)
return to
if copy:
to.copy_to_device(obj, stream=stream)
return to
@require_context
def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
"""device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)
Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
stream=stream)
@require_context
def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
attach_global=True):
"""managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
attach_global=True)
Allocate a np.ndarray with a buffer that is managed.
Similar to np.empty().
Managed memory is supported on Linux / x86 and PowerPC, and is considered
experimental on Windows and Linux / AArch64.
:param attach_global: A flag indicating whether to attach globally. Global
attachment implies that the memory is accessible from
any stream on any device. If ``False``, attachment is
*host*, and memory is only accessible by devices
with Compute Capability 6.0 and later.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
buffer = current_context().memallocmanaged(bytesize,
attach_global=attach_global)
npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buffer)
managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
managedview.device_setup(buffer, stream=stream)
return managedview
@require_context
def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
"""pinned_array(shape, dtype=np.float64, strides=None, order='C')
Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
(pagelocked). Similar to :func:`np.empty() <numpy.empty>`.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = driver.memory_size_from_info(shape, strides,
dtype.itemsize)
buffer = current_context().memhostalloc(bytesize)
return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buffer)
@require_context
def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
portable=False, wc=False):
"""mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
portable=False, wc=False)
Allocate a mapped ndarray with a buffer that is pinned and mapped on
to the device. Similar to np.empty()
    :param portable: a boolean flag to allow the allocated device memory to be
                     usable from multiple devices.
    :param wc: a boolean flag to enable write-combined allocation, which is
               faster for the host to write and the device to read, but slower
               for the host to read and the device to write.
"""
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
buffer = current_context().memhostalloc(bytesize, mapped=True)
npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buffer)
mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
mappedview.device_setup(buffer, stream=stream)
return mappedview
@contextlib.contextmanager
@require_context
def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
"""
    A context manager that opens an IPC *handle* (*CUipcMemHandle*) that is
    represented as a sequence of bytes (e.g. *bytes*, tuple of int)
    and represents it as an array of the given *shape*, *strides* and *dtype*.
The *strides* can be omitted. In that case, it is assumed to be a 1D
C contiguous array.
Yields a device array.
The IPC handle is closed automatically when context manager exits.
"""
dtype = np.dtype(dtype)
# compute size
size = np.prod(shape) * dtype.itemsize
# manually recreate the IPC mem handle
if driver.USE_NV_BINDING:
driver_handle = driver.binding.CUipcMemHandle()
driver_handle.reserved = handle
else:
driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle)
# use *IpcHandle* to open the IPC memory
ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
    try:
        yield ipchandle.open_array(current_context(), shape=shape,
                                   strides=strides, dtype=dtype)
    finally:
        # Ensure the handle is closed even if the body raises an exception.
        ipchandle.close()
def synchronize():
"Synchronize the current context."
return current_context().synchronize()
def _contiguous_strides_like_array(ary):
"""
Given an array, compute strides for a new contiguous array of the same
shape.
"""
# Don't recompute strides if the default strides will be sufficient to
# create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
return None
# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.
# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = [ x for x in enumerate(ary.strides) ]
strideperm.sort(key=lambda x: x[1])
# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
return tuple(strides)
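# Worked example (illustrative, not part of this module): a sliced and
# transposed array is neither C nor F contiguous, so packed strides are
# computed that preserve its axis ordering in memory.
#
# >>> import numpy as np
# >>> a = np.zeros((4, 6))[::2, ::2].T  # shape (3, 2), strides (16, 96)
# >>> _contiguous_strides_like_array(a)
# (8, 24)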
def _order_like_array(ary):
if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
return 'F'
else:
return 'C'
def device_array_like(ary, stream=0):
"""
Call :func:`device_array() <numba.cuda.device_array>` with information from
the array.
"""
strides = _contiguous_strides_like_array(ary)
order = _order_like_array(ary)
return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
order=order, stream=stream)
def mapped_array_like(ary, stream=0, portable=False, wc=False):
"""
Call :func:`mapped_array() <numba.cuda.mapped_array>` with the information
from the array.
"""
strides = _contiguous_strides_like_array(ary)
order = _order_like_array(ary)
return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
order=order, stream=stream, portable=portable, wc=wc)
def pinned_array_like(ary):
"""
Call :func:`pinned_array() <numba.cuda.pinned_array>` with the information
from the array.
"""
strides = _contiguous_strides_like_array(ary)
order = _order_like_array(ary)
return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
order=order)
# Stream helper
@require_context
def stream():
"""
Create a CUDA stream that represents a command queue for the device.
"""
return current_context().create_stream()
@require_context
def default_stream():
"""
Get the default CUDA stream. CUDA semantics in general are that the default
stream is either the legacy default stream or the per-thread default stream
depending on which CUDA APIs are in use. In Numba, the APIs for the legacy
default stream are always the ones in use, but an option to use APIs for
the per-thread default stream may be provided in future.
"""
return current_context().get_default_stream()
@require_context
def legacy_default_stream():
"""
Get the legacy default CUDA stream.
"""
return current_context().get_legacy_default_stream()
@require_context
def per_thread_default_stream():
"""
Get the per-thread default CUDA stream.
"""
return current_context().get_per_thread_default_stream()
@require_context
def external_stream(ptr):
"""Create a Numba stream object for a stream allocated outside Numba.
:param ptr: Pointer to the external stream to wrap in a Numba Stream
:type ptr: int
"""
return current_context().create_external_stream(ptr)
# Page lock
@require_context
@contextlib.contextmanager
def pinned(*arylist):
"""A context manager for temporary pinning a sequence of host ndarrays.
"""
pmlist = []
for ary in arylist:
pm = current_context().mempin(ary, driver.host_pointer(ary),
driver.host_memory_size(ary),
mapped=False)
pmlist.append(pm)
yield
@require_context
@contextlib.contextmanager
def mapped(*arylist, **kws):
"""A context manager for temporarily mapping a sequence of host ndarrays.
"""
assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
stream = kws.get('stream', 0)
pmlist = []
devarylist = []
for ary in arylist:
pm = current_context().mempin(ary, driver.host_pointer(ary),
driver.host_memory_size(ary),
mapped=True)
pmlist.append(pm)
devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
devarylist.append(devary)
try:
if len(devarylist) == 1:
yield devarylist[0]
else:
yield devarylist
finally:
# When exiting from `with cuda.mapped(*arrs) as mapped_arrs:`, the name
# `mapped_arrs` stays in scope, blocking automatic unmapping based on
# reference count. We therefore invoke the finalizer manually.
for pm in pmlist:
pm.free()
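# Illustrative sketch (hypothetical kernel ``increment``): map a host array
# into the device address space for the duration of the block.
#
# >>> import numpy as np
# >>> ary = np.zeros(16)
# >>> with mapped(ary) as d_ary:
# ...     increment[1, 16](d_ary)  # kernel defined elsewhere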
def event(timing=True):
"""
Create a CUDA event. Timing data is only recorded by the event if it is
created with ``timing=True``.
"""
evt = current_context().create_event(timing=timing)
return evt
event_elapsed_time = driver.event_elapsed_time
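# Illustrative timing sketch (hypothetical kernel ``fn`` and launch
# configuration): record events around a launch and measure the elapsed time
# in milliseconds.
#
# >>> start, end = event(), event()
# >>> start.record()
# >>> fn[blocks, threads](args)
# >>> end.record()
# >>> end.synchronize()
# >>> event_elapsed_time(start, end)  # float, in milliseconds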
# Device selection
def select_device(device_id):
"""
Make the context associated with device *device_id* the current context.
Returns a Device instance.
Raises exception on error.
"""
context = devices.get_context(device_id)
return context.device
def get_current_device():
"Get current device associated with the current thread"
return current_context().device
def list_devices():
"Return a list of all detected devices"
return devices.gpus
def close():
"""
Explicitly clears all contexts in the current thread, and destroys all
contexts if the current thread is the main thread.
"""
devices.reset()
def _auto_device(ary, stream=0, copy=True):
return devicearray.auto_device(ary, stream=stream, copy=copy)
def detect():
"""
Detect supported CUDA hardware and print a summary of the detected hardware.
Returns a boolean indicating whether any supported devices were detected.
"""
devlist = list_devices()
print('Found %d CUDA devices' % len(devlist))
supported_count = 0
for dev in devlist:
attrs = []
cc = dev.compute_capability
kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
tcc = dev.TCC_DRIVER
fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
attrs += [('Compute Capability', '%d.%d' % cc)]
attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
attrs += [('UUID', dev.uuid)]
attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
if os.name == "nt":
attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
if cc < (3, 5):
support = '[NOT SUPPORTED: CC < 3.5]'
elif cc < (5, 0):
support = '[SUPPORTED (DEPRECATED)]'
supported_count += 1
else:
support = '[SUPPORTED]'
supported_count += 1
print('id %d %20s %40s' % (dev.id, dev.name, support))
for key, val in attrs:
print('%40s: %s' % (key, val))
print('Summary:')
print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
return supported_count > 0
@contextlib.contextmanager
def defer_cleanup():
"""
Temporarily disable memory deallocation.
Use this to prevent resource deallocation breaking asynchronous execution.
For example::
with defer_cleanup():
# all cleanup is deferred in here
do_speed_critical_code()
# cleanup can occur here
Note: this context manager can be nested.
"""
with current_context().defer_cleanup():
yield
profiling = require_context(driver.profiling)
profile_start = require_context(driver.profile_start)
profile_stop = require_context(driver.profile_stop)

View File

@@ -0,0 +1,30 @@
import numpy as np
def prepare_shape_strides_dtype(shape, strides, dtype, order):
dtype = np.dtype(dtype)
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
else:
strides = strides or _fill_stride_by_order(shape, dtype, order)
return shape, strides, dtype
def _fill_stride_by_order(shape, dtype, order):
nd = len(shape)
if nd == 0:
return ()
strides = [0] * nd
if order == 'C':
strides[-1] = dtype.itemsize
for d in reversed(range(nd - 1)):
strides[d] = strides[d + 1] * shape[d + 1]
elif order == 'F':
strides[0] = dtype.itemsize
for d in range(1, nd):
strides[d] = strides[d - 1] * shape[d - 1]
else:
        raise ValueError("order must be one of 'C' or 'F'")
return tuple(strides)
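# Illustrative examples (not part of this module): strides for a 2x3 float64
# array.
#
# >>> _fill_stride_by_order((2, 3), np.dtype(np.float64), 'C')
# (24, 8)
# >>> _fill_stride_by_order((2, 3), np.dtype(np.float64), 'F')
# (8, 16)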

View File

@@ -0,0 +1,77 @@
"""
Hints to wrap Kernel arguments to indicate how to manage host-device
memory transfers before & after the kernel call.
"""
import abc
from numba.core.typing.typeof import typeof, Purpose
class ArgHint(metaclass=abc.ABCMeta):
def __init__(self, value):
self.value = value
@abc.abstractmethod
def to_device(self, retr, stream=0):
"""
:param stream: a stream to use when copying data
:param retr:
a list of clean-up work to do after the kernel's been run.
Append 0-arg lambdas to it!
        :return: a value (usually a `DeviceNDArray`) to be passed to
            the kernel
"""
pass
@property
def _numba_type_(self):
return typeof(self.value, Purpose.argument)
class In(ArgHint):
def to_device(self, retr, stream=0):
from .cudadrv.devicearray import auto_device
devary, _ = auto_device(
self.value,
stream=stream)
# A dummy writeback functor to keep devary alive until the kernel
# is called.
retr.append(lambda: devary)
return devary
class Out(ArgHint):
def to_device(self, retr, stream=0):
from .cudadrv.devicearray import auto_device
devary, conv = auto_device(
self.value,
copy=False,
stream=stream)
if conv:
retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
return devary
class InOut(ArgHint):
def to_device(self, retr, stream=0):
from .cudadrv.devicearray import auto_device
devary, conv = auto_device(
self.value,
stream=stream)
if conv:
retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
return devary
def wrap_arg(value, default=InOut):
return value if isinstance(value, ArgHint) else default(value)
__all__ = [
'In',
'Out',
'InOut',
'ArgHint',
'wrap_arg',
]
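# Illustrative sketch (hypothetical kernel ``axpy`` and arrays ``x``, ``y``,
# ``result``): hints control which transfers happen around a launch. ``In``
# copies host->device only; ``Out`` allocates on the device and copies back
# to the host after the kernel completes.
#
# >>> from numba import cuda
# >>> axpy[blocks, threads](cuda.In(x), cuda.In(y), cuda.Out(result))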

View File

@@ -0,0 +1,62 @@
from numba.core import types
from numba.core.extending import overload, overload_method
from numba.core.typing import signature
from numba.cuda import nvvmutils
from numba.cuda.extending import intrinsic
from numba.cuda.types import grid_group, GridGroup as GridGroupClass
class GridGroup:
"""A cooperative group representing the entire grid"""
    def sync(self) -> None:
        """Synchronize this grid group"""
def this_grid() -> GridGroup:
"""Get the current grid group."""
return GridGroup()
@intrinsic
def _this_grid(typingctx):
sig = signature(grid_group)
def codegen(context, builder, sig, args):
one = context.get_constant(types.int32, 1)
mod = builder.module
return builder.call(
nvvmutils.declare_cudaCGGetIntrinsicHandle(mod),
(one,))
return sig, codegen
@overload(this_grid, target='cuda')
def _ol_this_grid():
def impl():
return _this_grid()
return impl
@intrinsic
def _grid_group_sync(typingctx, group):
sig = signature(types.int32, group)
def codegen(context, builder, sig, args):
flags = context.get_constant(types.int32, 0)
mod = builder.module
return builder.call(
nvvmutils.declare_cudaCGSynchronize(mod),
(*args, flags))
return sig, codegen
@overload_method(GridGroupClass, 'sync', target='cuda')
def _ol_grid_group_sync(group):
def impl(group):
return _grid_group_sync(group)
return impl
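# Illustrative sketch (not part of this module): grid-wide synchronization
# from within a kernel. Note that ``grid.sync()`` requires a cooperative
# launch.
#
# >>> from numba import cuda
# >>> @cuda.jit
# ... def kernel(x):
# ...     grid = cuda.cg.this_grid()
# ...     x[cuda.grid(1)] += 1
# ...     grid.sync()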

View File

@@ -0,0 +1,378 @@
from llvmlite import ir
from numba.core import config, serialize
from numba.core.codegen import Codegen, CodeLibrary
from .cudadrv import devices, driver, nvvm, runtime
from numba.cuda.cudadrv.libs import get_cudalib
import os
import subprocess
import tempfile
CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
def run_nvdisasm(cubin, flags):
# nvdisasm only accepts input from a file, so we need to write out to a
# temp file and clean up afterwards.
fd = None
fname = None
try:
fd, fname = tempfile.mkstemp()
with open(fname, 'wb') as f:
f.write(cubin)
try:
cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
except FileNotFoundError as e:
msg = ("nvdisasm has not been found. You may need "
"to install the CUDA toolkit and ensure that "
"it is available on your PATH.\n")
raise RuntimeError(msg) from e
return cp.stdout.decode('utf-8')
finally:
if fd is not None:
os.close(fd)
if fname is not None:
os.unlink(fname)
def disassemble_cubin(cubin):
# Request lineinfo in disassembly
flags = ['-gi']
return run_nvdisasm(cubin, flags)
def disassemble_cubin_for_cfg(cubin):
# Request control flow graph in disassembly
flags = ['-cfg']
return run_nvdisasm(cubin, flags)
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
"""
    The CUDACodeLibrary generates PTX, SASS, and cubins for multiple different
    compute capabilities. It also loads cubins onto multiple devices (via
    get_cufunc), which may be of different compute capabilities.
"""
def __init__(self, codegen, name, entry_name=None, max_registers=None,
nvvm_options=None):
"""
codegen:
Codegen object.
name:
Name of the function in the source.
entry_name:
Name of the kernel function in the binary, if this is a global
kernel and not a device function.
max_registers:
The maximum register usage to aim for when linking.
nvvm_options:
Dict of options to pass to NVVM.
"""
super().__init__(codegen, name)
# The llvmlite module for this library.
self._module = None
# CodeLibrary objects that will be "linked" into this library. The
# modules within them are compiled from NVVM IR to PTX along with the
# IR from this module - in that sense they are "linked" by NVVM at PTX
# generation time, rather than at link time.
self._linking_libraries = set()
# Files to link with the generated PTX. These are linked using the
# Driver API at link time.
self._linking_files = set()
# Should we link libcudadevrt?
self.needs_cudadevrt = False
# Cache the LLVM IR string
self._llvm_strs = None
# Maps CC -> PTX string
self._ptx_cache = {}
# Maps CC -> LTO-IR
self._ltoir_cache = {}
# Maps CC -> cubin
self._cubin_cache = {}
# Maps CC -> linker info output for cubin
self._linkerinfo_cache = {}
# Maps Device numeric ID -> cufunc
self._cufunc_cache = {}
self._max_registers = max_registers
if nvvm_options is None:
nvvm_options = {}
self._nvvm_options = nvvm_options
self._entry_name = entry_name
@property
def llvm_strs(self):
if self._llvm_strs is None:
self._llvm_strs = [str(mod) for mod in self.modules]
return self._llvm_strs
def get_llvm_str(self):
return "\n\n".join(self.llvm_strs)
def _ensure_cc(self, cc):
if cc is not None:
return cc
device = devices.get_context().device
return device.compute_capability
def get_asm_str(self, cc=None):
cc = self._ensure_cc(cc)
ptxes = self._ptx_cache.get(cc, None)
if ptxes:
return ptxes
arch = nvvm.get_arch_option(*cc)
options = self._nvvm_options.copy()
options['arch'] = arch
irs = self.llvm_strs
ptx = nvvm.compile_ir(irs, **options)
# Sometimes the result from NVVM contains trailing whitespace and
# nulls, which we strip so that the assembly dump looks a little
# tidier.
ptx = ptx.decode().strip('\x00').strip()
if config.DUMP_ASSEMBLY:
print(("ASSEMBLY %s" % self._name).center(80, '-'))
print(ptx)
print('=' * 80)
self._ptx_cache[cc] = ptx
return ptx
def get_ltoir(self, cc=None):
cc = self._ensure_cc(cc)
ltoir = self._ltoir_cache.get(cc, None)
if ltoir is not None:
return ltoir
arch = nvvm.get_arch_option(*cc)
options = self._nvvm_options.copy()
options['arch'] = arch
options['gen-lto'] = None
irs = self.llvm_strs
ltoir = nvvm.compile_ir(irs, **options)
self._ltoir_cache[cc] = ltoir
return ltoir
def get_cubin(self, cc=None):
cc = self._ensure_cc(cc)
cubin = self._cubin_cache.get(cc, None)
if cubin:
return cubin
linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)
if linker.lto:
ltoir = self.get_ltoir(cc=cc)
linker.add_ltoir(ltoir)
else:
ptx = self.get_asm_str(cc=cc)
linker.add_ptx(ptx.encode())
for path in self._linking_files:
linker.add_file_guess_ext(path)
if self.needs_cudadevrt:
linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
cubin = linker.complete()
self._cubin_cache[cc] = cubin
self._linkerinfo_cache[cc] = linker.info_log
return cubin
def get_cufunc(self):
if self._entry_name is None:
msg = "Missing entry_name - are you trying to get the cufunc " \
"for a device function?"
raise RuntimeError(msg)
ctx = devices.get_context()
device = ctx.device
cufunc = self._cufunc_cache.get(device.id, None)
if cufunc:
return cufunc
cubin = self.get_cubin(cc=device.compute_capability)
module = ctx.create_module_image(cubin)
# Load
cufunc = module.get_function(self._entry_name)
# Populate caches
self._cufunc_cache[device.id] = cufunc
return cufunc
def get_linkerinfo(self, cc):
try:
return self._linkerinfo_cache[cc]
except KeyError:
raise KeyError(f'No linkerinfo for CC {cc}')
def get_sass(self, cc=None):
return disassemble_cubin(self.get_cubin(cc=cc))
def get_sass_cfg(self, cc=None):
return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))
def add_ir_module(self, mod):
self._raise_if_finalized()
if self._module is not None:
raise RuntimeError('CUDACodeLibrary only supports one module')
self._module = mod
def add_linking_library(self, library):
library._ensure_finalized()
# We don't want to allow linking more libraries in after finalization
# because our linked libraries are modified by the finalization, and we
# won't be able to finalize again after adding new ones
self._raise_if_finalized()
self._linking_libraries.add(library)
def add_linking_file(self, filepath):
self._linking_files.add(filepath)
def get_function(self, name):
for fn in self._module.functions:
if fn.name == name:
return fn
raise KeyError(f'Function {name} not found')
@property
def modules(self):
return [self._module] + [mod for lib in self._linking_libraries
for mod in lib.modules]
@property
def linking_libraries(self):
# Libraries we link to may link to other libraries, so we recursively
# traverse the linking libraries property to build up a list of all
# linked libraries.
libs = []
for lib in self._linking_libraries:
libs.extend(lib.linking_libraries)
libs.append(lib)
return libs
def finalize(self):
# Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
# we only adjust the linkage of functions. Global kernels (with
# external linkage) have their linkage untouched. Device functions are
# set linkonce_odr to prevent them appearing in the PTX.
self._raise_if_finalized()
# Note in-place modification of the linkage of functions in linked
# libraries. This presently causes no issues as only device functions
# are shared across code libraries, so they would always need their
# linkage set to linkonce_odr. If in a future scenario some code
# libraries require linkonce_odr linkage of functions in linked
# modules, and another code library requires another linkage, each code
# library will need to take its own private copy of its linked modules.
#
# See also discussion on PR #890:
# https://github.com/numba/numba/pull/890
for library in self._linking_libraries:
for mod in library.modules:
for fn in mod.functions:
if not fn.is_declaration:
fn.linkage = 'linkonce_odr'
self._finalized = True
def _reduce_states(self):
"""
Reduce the instance for serialization. We retain the PTX and cubins,
but loaded functions are discarded. They are recreated when needed
after deserialization.
"""
if self._linking_files:
msg = 'Cannot pickle CUDACodeLibrary with linking files'
raise RuntimeError(msg)
if not self._finalized:
raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
return dict(
codegen=None,
name=self.name,
entry_name=self._entry_name,
llvm_strs=self.llvm_strs,
ptx_cache=self._ptx_cache,
cubin_cache=self._cubin_cache,
linkerinfo_cache=self._linkerinfo_cache,
max_registers=self._max_registers,
nvvm_options=self._nvvm_options,
needs_cudadevrt=self.needs_cudadevrt
)
@classmethod
def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
needs_cudadevrt):
"""
Rebuild an instance.
"""
instance = cls(codegen, name, entry_name=entry_name)
instance._llvm_strs = llvm_strs
instance._ptx_cache = ptx_cache
instance._cubin_cache = cubin_cache
instance._linkerinfo_cache = linkerinfo_cache
instance._max_registers = max_registers
instance._nvvm_options = nvvm_options
instance.needs_cudadevrt = needs_cudadevrt
instance._finalized = True
return instance
class JITCUDACodegen(Codegen):
"""
This codegen implementation for CUDA only generates optimized LLVM IR.
Generation of PTX code is done separately (see numba.cuda.compiler).
"""
_library_class = CUDACodeLibrary
def __init__(self, module_name):
pass
def _create_empty_module(self, name):
ir_module = ir.Module(name)
ir_module.triple = CUDA_TRIPLE
ir_module.data_layout = nvvm.NVVM().data_layout
nvvm.add_ir_version(ir_module)
return ir_module
def _add_module(self, module):
pass
def magic_tuple(self):
"""
Return a tuple unambiguously describing the codegen behaviour.
"""
ctx = devices.get_context()
cc = ctx.device.compute_capability
return (runtime.runtime.get_version(), cc)

View File

@@ -0,0 +1,422 @@
from llvmlite import ir
from numba.core.typing.templates import ConcreteTemplate
from numba.core import types, typing, funcdesc, config, compiler, sigutils
from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
DefaultPassBuilder, Flags, Option,
CompileResult)
from numba.core.compiler_lock import global_compiler_lock
from numba.core.compiler_machinery import (LoweringPass,
PassManager, register_pass)
from numba.core.errors import NumbaInvalidConfigWarning
from numba.core.typed_passes import (IRLegalization, NativeLowering,
AnnotateTypes)
from warnings import warn
from numba.cuda.api import get_current_device
from numba.cuda.target import CUDACABICallConv
def _nvvm_options_type(x):
if x is None:
return None
else:
assert isinstance(x, dict)
return x
class CUDAFlags(Flags):
nvvm_options = Option(
type=_nvvm_options_type,
default=None,
doc="NVVM options",
)
compute_capability = Option(
type=tuple,
default=None,
doc="Compute Capability",
)
# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
# id. This is because the entry point is used as a key into a dict of
# overloads by the base dispatcher. The id of the CCR is the only small and
# unique property of a CompileResult in the CUDA target (cf. the CPU target,
# which uses its entry_point, which is a pointer value).
#
# This does feel a little hackish, and there are two ways in which this could
# be improved:
#
# 1. We could change the core of Numba so that each CompileResult has its own
# unique ID that can be used as a key - e.g. a count, similar to the way in
# which types have unique counts.
# 2. At some future time when kernel launch uses a compiled function, the entry
# point will no longer need to be a synthetic value, but will instead be a
# pointer to the compiled function as in the CPU target.
class CUDACompileResult(CompileResult):
@property
def entry_point(self):
return id(self)
def cuda_compile_result(**entries):
entries = sanitize_compile_result_entries(entries)
return CUDACompileResult(**entries)
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDABackend(LoweringPass):
_name = "cuda_backend"
def __init__(self):
LoweringPass.__init__(self)
def run_pass(self, state):
"""
Back-end: Packages lowering output in a compile result
"""
lowered = state['cr']
signature = typing.signature(state.return_type, *state.args)
state.cr = cuda_compile_result(
typing_context=state.typingctx,
target_context=state.targetctx,
typing_error=state.status.fail_reason,
type_annotation=state.type_annotation,
library=state.library,
call_helper=lowered.call_helper,
signature=signature,
fndesc=lowered.fndesc,
)
return True
@register_pass(mutates_CFG=False, analysis_only=False)
class CreateLibrary(LoweringPass):
"""
Create a CUDACodeLibrary for the NativeLowering pass to populate. The
NativeLowering pass will create a code library if none exists, but we need
to set it up with nvvm_options from the flags if they are present.
"""
_name = "create_library"
def __init__(self):
LoweringPass.__init__(self)
def run_pass(self, state):
codegen = state.targetctx.codegen()
name = state.func_id.func_qualname
nvvm_options = state.flags.nvvm_options
state.library = codegen.create_library(name, nvvm_options=nvvm_options)
# Enable object caching upfront so that the library can be serialized.
state.library.enable_object_caching()
return True
class CUDACompiler(CompilerBase):
def define_pipelines(self):
dpb = DefaultPassBuilder
pm = PassManager('cuda')
untyped_passes = dpb.define_untyped_pipeline(self.state)
pm.passes.extend(untyped_passes.passes)
typed_passes = dpb.define_typed_pipeline(self.state)
pm.passes.extend(typed_passes.passes)
lowering_passes = self.define_cuda_lowering_pipeline(self.state)
pm.passes.extend(lowering_passes.passes)
pm.finalize()
return [pm]
def define_cuda_lowering_pipeline(self, state):
pm = PassManager('cuda_lowering')
# legalise
pm.add_pass(IRLegalization,
"ensure IR is legal prior to lowering")
pm.add_pass(AnnotateTypes, "annotate types")
# lower
pm.add_pass(CreateLibrary, "create library")
pm.add_pass(NativeLowering, "native lowering")
pm.add_pass(CUDABackend, "cuda backend")
pm.finalize()
return pm
@global_compiler_lock
def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
inline=False, fastmath=False, nvvm_options=None,
cc=None):
if cc is None:
raise ValueError('Compute Capability must be supplied')
from .descriptor import cuda_target
typingctx = cuda_target.typing_context
targetctx = cuda_target.target_context
flags = CUDAFlags()
# Do not compile (generate native code), just lower (to LLVM)
flags.no_compile = True
flags.no_cpython_wrapper = True
flags.no_cfunc_wrapper = True
# Both debug and lineinfo turn on debug information in the compiled code,
# but we keep them separate arguments in case we later want to overload
# some other behavior on the debug flag. In particular, -opt=3 is not
# supported with debug enabled, and enabling only lineinfo should not
# affect the error model.
if debug or lineinfo:
flags.debuginfo = True
if lineinfo:
flags.dbg_directives_only = True
if debug:
flags.error_model = 'python'
else:
flags.error_model = 'numpy'
if inline:
flags.forceinline = True
if fastmath:
flags.fastmath = True
if nvvm_options:
flags.nvvm_options = nvvm_options
flags.compute_capability = cc
# Run compilation pipeline
from numba.core.target_extension import target_override
with target_override('cuda'):
cres = compiler.compile_extra(typingctx=typingctx,
targetctx=targetctx,
func=pyfunc,
args=args,
return_type=return_type,
flags=flags,
locals={},
pipeline_class=CUDACompiler)
library = cres.library
library.finalize()
return cres
def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
nvvm_options):
"""
Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.
The C ABI wrapper will have the same name as the source Python function.
"""
# The wrapper will be contained in a new library that links to the wrapped
# function's library
library = lib.codegen.create_library(f'{lib.name}_function_',
entry_name=wrapper_function_name,
nvvm_options=nvvm_options)
library.add_linking_library(lib)
# Determine the caller (C ABI) and wrapper (Numba ABI) function types
argtypes = fndesc.argtypes
restype = fndesc.restype
c_call_conv = CUDACABICallConv(context)
wrapfnty = c_call_conv.get_function_type(restype, argtypes)
fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)
# Create a new module and declare the callee
wrapper_module = context.create_module("cuda.cabi.wrapper")
func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)
# Define the caller - populate it with a call to the callee and return
# its return value
wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
builder = ir.IRBuilder(wrapfn.append_basic_block(''))
arginfo = context.get_arg_packer(argtypes)
callargs = arginfo.from_arguments(builder, wrapfn.args)
# We get (status, return_value), but we ignore the status since we
# can't propagate it through the C ABI anyway
_, return_value = context.call_conv.call_function(
builder, func, restype, argtypes, callargs)
builder.ret(return_value)
library.add_ir_module(wrapper_module)
library.finalize()
return library
@global_compiler_lock
def compile(pyfunc, sig, debug=False, lineinfo=False, device=True,
fastmath=False, cc=None, opt=True, abi="c", abi_info=None,
output='ptx'):
"""Compile a Python function to PTX or LTO-IR for a given set of argument
types.
:param pyfunc: The Python function to compile.
:param sig: The signature representing the function's input and output
types. If this is a tuple of argument types without a return
type, the inferred return type is returned by this function. If
a signature including a return type is passed, the compiled code
will include a cast from the inferred return type to the
specified return type, and this function will return the
specified return type.
:param debug: Whether to include debug info in the compiled code.
:type debug: bool
:param lineinfo: Whether to include a line mapping from the compiled code
to the source code. Usually this is used with optimized
code (since debug mode would automatically include this),
so we want debug info in the LLVM IR but only the line
mapping in the final output.
:type lineinfo: bool
:param device: Whether to compile a device function.
:type device: bool
    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
                     prec_div=0, and fma=1)
:type fastmath: bool
:param cc: Compute capability to compile for, as a tuple
``(MAJOR, MINOR)``. Defaults to ``(5, 0)``.
:type cc: tuple
:param opt: Enable optimizations. Defaults to ``True``.
:type opt: bool
:param abi: The ABI for a compiled function - either ``"numba"`` or
``"c"``. Note that the Numba ABI is not considered stable.
The C ABI is only supported for device functions at present.
:type abi: str
:param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
one option, ``"abi_name"``, for providing the wrapper
function's name. The ``"numba"`` ABI has no options.
:type abi_info: dict
:param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
:type output: str
:return: (code, resty): The compiled code and inferred return type
:rtype: tuple
"""
if abi not in ("numba", "c"):
raise NotImplementedError(f'Unsupported ABI: {abi}')
if abi == 'c' and not device:
raise NotImplementedError('The C ABI is not supported for kernels')
if output not in ("ptx", "ltoir"):
raise NotImplementedError(f'Unsupported output type: {output}')
if debug and opt:
msg = ("debug=True with opt=True (the default) "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False.")
warn(NumbaInvalidConfigWarning(msg))
lto = (output == 'ltoir')
abi_info = abi_info or dict()
nvvm_options = {
'fastmath': fastmath,
'opt': 3 if opt else 0
}
if lto:
nvvm_options['gen-lto'] = None
args, return_type = sigutils.normalize_signature(sig)
cc = cc or config.CUDA_DEFAULT_PTX_CC
cres = compile_cuda(pyfunc, return_type, args, debug=debug,
lineinfo=lineinfo, fastmath=fastmath,
nvvm_options=nvvm_options, cc=cc)
resty = cres.signature.return_type
if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")
tgt = cres.target_context
if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get('abi_name', pyfunc.__name__)
lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
nvvm_options)
else:
code = pyfunc.__code__
filename = code.co_filename
linenum = code.co_firstlineno
lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
lineinfo, nvvm_options, filename,
linenum)
if lto:
code = lib.get_ltoir(cc=cc)
else:
code = lib.get_asm_str(cc=cc)
return code, resty
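# Illustrative sketch (not part of this module): compile a device function to
# PTX for an arbitrarily-chosen compute capability.
#
# >>> from numba import float32
# >>> def add(x, y):
# ...     return x + y
# >>> ptx, resty = compile(add, (float32, float32), cc=(7, 5))
# >>> resty
# float32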
def compile_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
device=True, fastmath=False, opt=True,
abi="c", abi_info=None, output='ptx'):
"""Compile a Python function to PTX or LTO-IR for a given signature for the
    current device's compute capability. This calls :func:`compile` with an
appropriate ``cc`` value for the current device."""
cc = get_current_device().compute_capability
return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
fastmath=fastmath, cc=cc, opt=opt, abi=abi,
abi_info=abi_info, output=output)
def compile_ptx(pyfunc, sig, debug=False, lineinfo=False, device=False,
fastmath=False, cc=None, opt=True, abi="numba", abi_info=None):
"""Compile a Python function to PTX for a given signature. See
:func:`compile`. The defaults for this function are to compile a kernel
with the Numba ABI, rather than :func:`compile`'s default of compiling a
device function with the C ABI."""
return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
fastmath=fastmath, cc=cc, opt=opt, abi=abi,
abi_info=abi_info, output='ptx')
def compile_ptx_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
device=False, fastmath=False, opt=True,
abi="numba", abi_info=None):
"""Compile a Python function to PTX for a given signature for the current
    device's compute capability. See :func:`compile_ptx`."""
cc = get_current_device().compute_capability
return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo,
device=device, fastmath=fastmath, cc=cc, opt=opt,
abi=abi, abi_info=abi_info)
def declare_device_function(name, restype, argtypes):
return declare_device_function_template(name, restype, argtypes).key
def declare_device_function_template(name, restype, argtypes):
from .descriptor import cuda_target
typingctx = cuda_target.typing_context
targetctx = cuda_target.target_context
sig = typing.signature(restype, *argtypes)
extfn = ExternFunction(name, sig)
class device_function_template(ConcreteTemplate):
key = extfn
cases = [sig]
fndesc = funcdesc.ExternalFunctionDescriptor(
name=name, restype=restype, argtypes=argtypes)
typingctx.insert_user_function(extfn, device_function_template)
targetctx.insert_user_function(extfn, fndesc)
return device_function_template
class ExternFunction(object):
def __init__(self, name, sig):
self.name = name
self.sig = sig
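# Illustrative sketch (not part of this module): the public
# ``cuda.declare_device`` builds on ``declare_device_function`` so that an
# externally-defined device function can be called from kernels; the matching
# definition must be linked in separately (e.g. a hypothetical "mul.cu"
# passed via ``@cuda.jit(link=...)``).
#
# >>> from numba import cuda, float32
# >>> mul = cuda.declare_device('mul', float32(float32, float32))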

View File

@@ -0,0 +1,47 @@
#include "cuda_fp16.h"
#define FNDEF(fname) __numba_wrapper_ ## fname
#define UNARY_FUNCTION(fname) extern "C" __device__ int\
FNDEF(fname)( \
short* return_value,\
short x\
)\
{\
__half retval = fname(__short_as_half (x));\
\
*return_value = __half_as_short (retval);\
/* Signal that no Python exception occurred */ \
return 0;\
}\
extern "C" __device__ int
FNDEF(hdiv)(
short* return_value,
short x,
short y
)
{
__half retval = __hdiv(__short_as_half (x), __short_as_half (y));
*return_value = __half_as_short (retval);
// Signal that no Python exception occurred
return 0;
}
UNARY_FUNCTION(hsin)
UNARY_FUNCTION(hcos)
UNARY_FUNCTION(hlog)
UNARY_FUNCTION(hlog10)
UNARY_FUNCTION(hlog2)
UNARY_FUNCTION(hexp)
UNARY_FUNCTION(hexp10)
UNARY_FUNCTION(hexp2)
UNARY_FUNCTION(hsqrt)
UNARY_FUNCTION(hrsqrt)
UNARY_FUNCTION(hfloor)
UNARY_FUNCTION(hceil)
UNARY_FUNCTION(hrcp)
UNARY_FUNCTION(hrint)
UNARY_FUNCTION(htrunc)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,258 @@
import sys
import re
import os
from collections import namedtuple
from numba.core.config import IS_WIN32
from numba.misc.findlib import find_lib, find_file
_env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
def _find_valid_path(options):
"""Find valid path from *options*, which is a list of 2-tuple of
(name, path). Return first pair where *path* is not None.
If no valid path is found, return ('<unknown>', None)
"""
for by, data in options:
if data is not None:
return by, data
else:
return '<unknown>', None
def _get_libdevice_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_libdevice_ctk()),
('CUDA_HOME', get_cuda_home('nvvm', 'libdevice')),
('System', get_system_ctk('nvvm', 'libdevice')),
('Debian package', get_debian_pkg_libdevice()),
]
by, libdir = _find_valid_path(options)
return by, libdir
def _nvvm_lib_dir():
if IS_WIN32:
return 'nvvm', 'bin'
else:
return 'nvvm', 'lib64'
def _get_nvvm_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_nvvm_ctk()),
('CUDA_HOME', get_cuda_home(*_nvvm_lib_dir())),
('System', get_system_ctk(*_nvvm_lib_dir())),
]
by, path = _find_valid_path(options)
return by, path
def _get_libdevice_paths():
by, libdir = _get_libdevice_path_decision()
# Search for pattern
pat = r'libdevice(\.\d+)*\.bc$'
candidates = find_file(re.compile(pat), libdir)
# Keep only the max (most recent version) of the bitcode files.
out = max(candidates, default=None)
return _env_path_tuple(by, out)
def _cudalib_path():
if IS_WIN32:
return 'bin'
else:
return 'lib64'
def _cuda_home_static_cudalib_path():
if IS_WIN32:
return ('lib', 'x64')
else:
return ('lib64',)
def _get_cudalib_dir_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_cudalib_ctk()),
('CUDA_HOME', get_cuda_home(_cudalib_path())),
('System', get_system_ctk(_cudalib_path())),
]
by, libdir = _find_valid_path(options)
return by, libdir
def _get_static_cudalib_dir_path_decision():
options = [
('Conda environment', get_conda_ctk()),
('Conda environment (NVIDIA package)', get_nvidia_static_cudalib_ctk()),
('CUDA_HOME', get_cuda_home(*_cuda_home_static_cudalib_path())),
('System', get_system_ctk(_cudalib_path())),
]
by, libdir = _find_valid_path(options)
return by, libdir
def _get_cudalib_dir():
by, libdir = _get_cudalib_dir_path_decision()
return _env_path_tuple(by, libdir)
def _get_static_cudalib_dir():
by, libdir = _get_static_cudalib_dir_path_decision()
return _env_path_tuple(by, libdir)
def get_system_ctk(*subdirs):
"""Return path to system-wide cudatoolkit; or, None if it doesn't exist.
"""
# Linux?
if sys.platform.startswith('linux'):
# Is cuda alias to /usr/local/cuda?
# We are intentionally not getting versioned cuda installation.
base = '/usr/local/cuda'
if os.path.exists(base):
return os.path.join(base, *subdirs)
def get_conda_ctk():
"""Return path to directory containing the shared libraries of cudatoolkit.
"""
is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
if not is_conda_env:
return
# Assume the existence of NVVM to imply cudatoolkit installed
paths = find_lib('nvvm')
if not paths:
return
# Use the directory name of the max path
return os.path.dirname(max(paths))
def get_nvidia_nvvm_ctk():
"""Return path to directory containing the NVVM shared library.
"""
is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
if not is_conda_env:
return
# Assume the existence of NVVM in the conda env implies that a CUDA toolkit
# conda package is installed.
# First, try the location used on Linux and the Windows 11.x packages
libdir = os.path.join(sys.prefix, 'nvvm', _cudalib_path())
if not os.path.exists(libdir) or not os.path.isdir(libdir):
# If that fails, try the location used for Windows 12.x packages
libdir = os.path.join(sys.prefix, 'Library', 'nvvm', _cudalib_path())
if not os.path.exists(libdir) or not os.path.isdir(libdir):
# If that doesn't exist either, assume we don't have the NVIDIA
# conda package
return
paths = find_lib('nvvm', libdir=libdir)
if not paths:
return
# Use the directory name of the max path
return os.path.dirname(max(paths))
def get_nvidia_libdevice_ctk():
"""Return path to directory containing the libdevice library.
"""
nvvm_ctk = get_nvidia_nvvm_ctk()
if not nvvm_ctk:
return
nvvm_dir = os.path.dirname(nvvm_ctk)
return os.path.join(nvvm_dir, 'libdevice')
def get_nvidia_cudalib_ctk():
"""Return path to directory containing the shared libraries of cudatoolkit.
"""
nvvm_ctk = get_nvidia_nvvm_ctk()
if not nvvm_ctk:
return
env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
subdir = 'bin' if IS_WIN32 else 'lib'
return os.path.join(env_dir, subdir)
def get_nvidia_static_cudalib_ctk():
"""Return path to directory containing the static libraries of cudatoolkit.
"""
nvvm_ctk = get_nvidia_nvvm_ctk()
if not nvvm_ctk:
return
if IS_WIN32 and ("Library" not in nvvm_ctk):
# Location specific to CUDA 11.x packages on Windows
dirs = ('Lib', 'x64')
else:
# Linux, or Windows with CUDA 12.x packages
dirs = ('lib',)
env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
return os.path.join(env_dir, *dirs)
def get_cuda_home(*subdirs):
"""Get paths of CUDA_HOME.
If *subdirs* are the subdirectory name to be appended in the resulting
path.
"""
cuda_home = os.environ.get('CUDA_HOME')
if cuda_home is None:
# Try Windows CUDA installation without Anaconda
cuda_home = os.environ.get('CUDA_PATH')
if cuda_home is not None:
return os.path.join(cuda_home, *subdirs)
def _get_nvvm_path():
by, path = _get_nvvm_path_decision()
candidates = find_lib('nvvm', path)
path = max(candidates) if candidates else None
return _env_path_tuple(by, path)
def get_cuda_paths():
"""Returns a dictionary mapping component names to a 2-tuple
of (source_variable, info).
The returned dictionary will have the following keys and infos:
- "nvvm": file_path
- "libdevice": List[Tuple[arch, file_path]]
- "cudalib_dir": directory_path
Note: The result of the function is cached.
"""
# Check cache
if hasattr(get_cuda_paths, '_cached_result'):
return get_cuda_paths._cached_result
else:
# Not in cache
d = {
'nvvm': _get_nvvm_path(),
'libdevice': _get_libdevice_paths(),
'cudalib_dir': _get_cudalib_dir(),
'static_cudalib_dir': _get_static_cudalib_dir(),
}
# Cache result
get_cuda_paths._cached_result = d
return d
def get_debian_pkg_libdevice():
"""
Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
exists.
"""
pkg_libdevice_location = '/usr/lib/nvidia-cuda-toolkit/libdevice'
if not os.path.exists(pkg_libdevice_location):
return None
return pkg_libdevice_location
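# Illustrative sketch (not part of this module): inspect which source each
# component was resolved from. The concrete paths vary by system.
#
# >>> paths = get_cuda_paths()
# >>> paths['nvvm'].by
# 'Conda environment'
# >>> paths['nvvm'].info
# '/.../lib/libnvvm.so'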

View File

@@ -0,0 +1,806 @@
import operator
from numba.core import types
from numba.core.typing.npydecl import (parse_dtype, parse_shape,
register_number_classes,
register_numpy_ufunc,
trigonometric_functions,
comparison_functions,
math_operations,
bit_twiddling_functions)
from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
AbstractTemplate, CallableTemplate,
signature, Registry)
from numba.cuda.types import dim3
from numba.core.typeconv import Conversion
from numba import cuda
from numba.cuda.compiler import declare_device_function_template
registry = Registry()
register = registry.register
register_attr = registry.register_attr
register_global = registry.register_global
register_number_classes(register_global)
class Cuda_array_decl(CallableTemplate):
def generic(self):
def typer(shape, dtype):
# Only integer literals and tuples of integer literals are valid
# shapes
if isinstance(shape, types.Integer):
if not isinstance(shape, types.IntegerLiteral):
return None
elif isinstance(shape, (types.Tuple, types.UniTuple)):
if any([not isinstance(s, types.IntegerLiteral)
for s in shape]):
return None
else:
return None
ndim = parse_shape(shape)
nb_dtype = parse_dtype(dtype)
if nb_dtype is not None and ndim is not None:
return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
return typer
@register
class Cuda_shared_array(Cuda_array_decl):
key = cuda.shared.array
@register
class Cuda_local_array(Cuda_array_decl):
key = cuda.local.array
@register
class Cuda_const_array_like(CallableTemplate):
key = cuda.const.array_like
def generic(self):
def typer(ndarray):
return ndarray
return typer
@register
class Cuda_threadfence_device(ConcreteTemplate):
key = cuda.threadfence
cases = [signature(types.none)]
@register
class Cuda_threadfence_block(ConcreteTemplate):
key = cuda.threadfence_block
cases = [signature(types.none)]
@register
class Cuda_threadfence_system(ConcreteTemplate):
key = cuda.threadfence_system
cases = [signature(types.none)]
@register
class Cuda_syncwarp(ConcreteTemplate):
key = cuda.syncwarp
cases = [signature(types.none), signature(types.none, types.i4)]
@register
class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
key = cuda.shfl_sync_intrinsic
cases = [
signature(types.Tuple((types.i4, types.b1)),
types.i4, types.i4, types.i4, types.i4, types.i4),
signature(types.Tuple((types.i8, types.b1)),
types.i4, types.i4, types.i8, types.i4, types.i4),
signature(types.Tuple((types.f4, types.b1)),
types.i4, types.i4, types.f4, types.i4, types.i4),
signature(types.Tuple((types.f8, types.b1)),
types.i4, types.i4, types.f8, types.i4, types.i4),
]
@register
class Cuda_vote_sync_intrinsic(ConcreteTemplate):
key = cuda.vote_sync_intrinsic
cases = [signature(types.Tuple((types.i4, types.b1)),
types.i4, types.i4, types.b1)]
@register
class Cuda_match_any_sync(ConcreteTemplate):
key = cuda.match_any_sync
cases = [
signature(types.i4, types.i4, types.i4),
signature(types.i4, types.i4, types.i8),
signature(types.i4, types.i4, types.f4),
signature(types.i4, types.i4, types.f8),
]
@register
class Cuda_match_all_sync(ConcreteTemplate):
key = cuda.match_all_sync
cases = [
signature(types.Tuple((types.i4, types.b1)), types.i4, types.i4),
signature(types.Tuple((types.i4, types.b1)), types.i4, types.i8),
signature(types.Tuple((types.i4, types.b1)), types.i4, types.f4),
signature(types.Tuple((types.i4, types.b1)), types.i4, types.f8),
]
@register
class Cuda_activemask(ConcreteTemplate):
key = cuda.activemask
cases = [signature(types.uint32)]
@register
class Cuda_lanemask_lt(ConcreteTemplate):
key = cuda.lanemask_lt
cases = [signature(types.uint32)]
@register
class Cuda_popc(ConcreteTemplate):
"""
Supported types from `llvm.popc`
[here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
"""
key = cuda.popc
cases = [
signature(types.int8, types.int8),
signature(types.int16, types.int16),
signature(types.int32, types.int32),
signature(types.int64, types.int64),
signature(types.uint8, types.uint8),
signature(types.uint16, types.uint16),
signature(types.uint32, types.uint32),
signature(types.uint64, types.uint64),
]
@register
class Cuda_fma(ConcreteTemplate):
"""
Supported types from `llvm.fma`
[here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics)
"""
key = cuda.fma
cases = [
signature(types.float32, types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64, types.float64),
]
@register
class Cuda_hfma(ConcreteTemplate):
key = cuda.fp16.hfma
cases = [
signature(types.float16, types.float16, types.float16, types.float16)
]
@register
class Cuda_cbrt(ConcreteTemplate):
key = cuda.cbrt
cases = [
signature(types.float32, types.float32),
signature(types.float64, types.float64),
]
@register
class Cuda_brev(ConcreteTemplate):
key = cuda.brev
cases = [
signature(types.uint32, types.uint32),
signature(types.uint64, types.uint64),
]
@register
class Cuda_clz(ConcreteTemplate):
"""
Supported types from `llvm.ctlz`
[here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
"""
key = cuda.clz
cases = [
signature(types.int8, types.int8),
signature(types.int16, types.int16),
signature(types.int32, types.int32),
signature(types.int64, types.int64),
signature(types.uint8, types.uint8),
signature(types.uint16, types.uint16),
signature(types.uint32, types.uint32),
signature(types.uint64, types.uint64),
]
@register
class Cuda_ffs(ConcreteTemplate):
"""
Supported types from `llvm.cttz`
[here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
"""
key = cuda.ffs
cases = [
signature(types.uint32, types.int8),
signature(types.uint32, types.int16),
signature(types.uint32, types.int32),
signature(types.uint32, types.int64),
signature(types.uint32, types.uint8),
signature(types.uint32, types.uint16),
signature(types.uint32, types.uint32),
signature(types.uint32, types.uint64),
]
@register
class Cuda_selp(AbstractTemplate):
key = cuda.selp
def generic(self, args, kws):
assert not kws
test, a, b = args
# per docs
# http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
supported_types = (types.float64, types.float32,
types.int16, types.uint16,
types.int32, types.uint32,
types.int64, types.uint64)
if a != b or a not in supported_types:
return
return signature(a, test, a, a)
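# Illustrative sketch (not part of this module): ``selp`` selects between two
# values of the same supported type based on a predicate, without branching.
#
# >>> @cuda.jit
# ... def clamp_negative(x):
# ...     i = cuda.grid(1)
# ...     if i < x.size:
# ...         x[i] = cuda.selp(x[i] > 0.0, x[i], 0.0)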
def _genfp16_unary(l_key):
@register
class Cuda_fp16_unary(ConcreteTemplate):
key = l_key
cases = [signature(types.float16, types.float16)]
return Cuda_fp16_unary
def _genfp16_unary_operator(l_key):
@register_global(l_key)
class Cuda_fp16_unary(AbstractTemplate):
key = l_key
def generic(self, args, kws):
assert not kws
if len(args) == 1 and args[0] == types.float16:
return signature(types.float16, types.float16)
return Cuda_fp16_unary
def _genfp16_binary(l_key):
@register
class Cuda_fp16_binary(ConcreteTemplate):
key = l_key
cases = [signature(types.float16, types.float16, types.float16)]
return Cuda_fp16_binary
@register_global(float)
class Float(AbstractTemplate):
def generic(self, args, kws):
assert not kws
[arg] = args
if arg == types.float16:
return signature(arg, arg)
def _genfp16_binary_comparison(l_key):
@register
class Cuda_fp16_cmp(ConcreteTemplate):
key = l_key
cases = [
signature(types.b1, types.float16, types.float16)
]
return Cuda_fp16_cmp
# If multiple ConcreteTemplates provide typing for a single function, then
# function resolution will pick the first compatible typing it finds even if it
# involves inserting a cast that would be considered undesirable (in this
# specific case, float16s could be cast to float32s for comparisons).
#
# To work around this, we instead use an AbstractTemplate that implements
# exactly the casting logic that we desire. The AbstractTemplate gets
# considered in preference to ConcreteTemplates during typing.
#
# This is tracked as Issue #7863 (https://github.com/numba/numba/issues/7863) -
# once this is resolved it should be possible to replace this AbstractTemplate
# with a ConcreteTemplate to simplify the logic.
def _fp16_binary_operator(l_key, retty):
@register_global(l_key)
class Cuda_fp16_operator(AbstractTemplate):
key = l_key
def generic(self, args, kws):
assert not kws
if len(args) == 2 and \
(args[0] == types.float16 or args[1] == types.float16):
if (args[0] == types.float16):
convertible = self.context.can_convert(args[1], args[0])
else:
convertible = self.context.can_convert(args[0], args[1])
# We allow three cases here (conversion is from the other operand type
# to fp16):
#
# 1. fp16 to fp16 - Conversion.exact
# 2. fp16 to other types fp16 can be promoted to - Conversion.promote
# 3. int8 to fp16 (safe conversion) - Conversion.safe
if (convertible == Conversion.exact) or \
(convertible == Conversion.promote) or \
(convertible == Conversion.safe):
return signature(retty, types.float16, types.float16)
return Cuda_fp16_operator
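# Illustrative sketch (not part of the original source): with the template
# above, an expression such as `h + numpy.int8(1)` where `h` is a float16
# types as float16 + float16 -> float16, because int8 converts to float16
# with one of the allowed Conversion kinds; an operand that would need an
# unsafe cast yields no signature here and falls through to other templates.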
def _genfp16_comparison_operator(op):
return _fp16_binary_operator(op, types.b1)
def _genfp16_binary_operator(op):
return _fp16_binary_operator(op, types.float16)
Cuda_hadd = _genfp16_binary(cuda.fp16.hadd)
Cuda_add = _genfp16_binary_operator(operator.add)
Cuda_iadd = _genfp16_binary_operator(operator.iadd)
Cuda_hsub = _genfp16_binary(cuda.fp16.hsub)
Cuda_sub = _genfp16_binary_operator(operator.sub)
Cuda_isub = _genfp16_binary_operator(operator.isub)
Cuda_hmul = _genfp16_binary(cuda.fp16.hmul)
Cuda_mul = _genfp16_binary_operator(operator.mul)
Cuda_imul = _genfp16_binary_operator(operator.imul)
Cuda_hmax = _genfp16_binary(cuda.fp16.hmax)
Cuda_hmin = _genfp16_binary(cuda.fp16.hmin)
Cuda_hneg = _genfp16_unary(cuda.fp16.hneg)
Cuda_neg = _genfp16_unary_operator(operator.neg)
Cuda_habs = _genfp16_unary(cuda.fp16.habs)
Cuda_abs = _genfp16_unary_operator(abs)
Cuda_heq = _genfp16_binary_comparison(cuda.fp16.heq)
_genfp16_comparison_operator(operator.eq)
Cuda_hne = _genfp16_binary_comparison(cuda.fp16.hne)
_genfp16_comparison_operator(operator.ne)
Cuda_hge = _genfp16_binary_comparison(cuda.fp16.hge)
_genfp16_comparison_operator(operator.ge)
Cuda_hgt = _genfp16_binary_comparison(cuda.fp16.hgt)
_genfp16_comparison_operator(operator.gt)
Cuda_hle = _genfp16_binary_comparison(cuda.fp16.hle)
_genfp16_comparison_operator(operator.le)
Cuda_hlt = _genfp16_binary_comparison(cuda.fp16.hlt)
_genfp16_comparison_operator(operator.lt)
_genfp16_binary_operator(operator.truediv)
_genfp16_binary_operator(operator.itruediv)
def _resolve_wrapped_unary(fname):
decl = declare_device_function_template(f'__numba_wrapper_{fname}',
types.float16,
(types.float16,))
return types.Function(decl)
def _resolve_wrapped_binary(fname):
decl = declare_device_function_template(f'__numba_wrapper_{fname}',
types.float16,
(types.float16, types.float16,))
return types.Function(decl)
hsin_device = _resolve_wrapped_unary('hsin')
hcos_device = _resolve_wrapped_unary('hcos')
hlog_device = _resolve_wrapped_unary('hlog')
hlog10_device = _resolve_wrapped_unary('hlog10')
hlog2_device = _resolve_wrapped_unary('hlog2')
hexp_device = _resolve_wrapped_unary('hexp')
hexp10_device = _resolve_wrapped_unary('hexp10')
hexp2_device = _resolve_wrapped_unary('hexp2')
hsqrt_device = _resolve_wrapped_unary('hsqrt')
hrsqrt_device = _resolve_wrapped_unary('hrsqrt')
hfloor_device = _resolve_wrapped_unary('hfloor')
hceil_device = _resolve_wrapped_unary('hceil')
hrcp_device = _resolve_wrapped_unary('hrcp')
hrint_device = _resolve_wrapped_unary('hrint')
htrunc_device = _resolve_wrapped_unary('htrunc')
hdiv_device = _resolve_wrapped_binary('hdiv')
# generate atomic operations
def _gen(l_key, supported_types):
@register
class Cuda_atomic(AbstractTemplate):
key = l_key
def generic(self, args, kws):
assert not kws
ary, idx, val = args
if ary.dtype not in supported_types:
return
if ary.ndim == 1:
return signature(ary.dtype, ary, types.intp, ary.dtype)
elif ary.ndim > 1:
return signature(ary.dtype, ary, idx, ary.dtype)
return Cuda_atomic
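# Illustrative sketch (not part of the original source): the template above
# types calls such as cuda.atomic.add(ary, idx, val), indexing a 1-D array
# with an intp and an n-D array with a matching index tuple, e.g.
#
#     @cuda.jit
#     def histogram(bins, data):
#         i = cuda.grid(1)
#         if i < data.size:
#             cuda.atomic.add(bins, data[i], 1)   # data holds integer bins
#
# The return value (discarded here) is typed as the array dtype, i.e. the
# value held before the atomic update.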
all_numba_types = (types.float64, types.float32,
types.int32, types.uint32,
types.int64, types.uint64)
integer_numba_types = (types.int32, types.uint32,
types.int64, types.uint64)
unsigned_int_numba_types = (types.uint32, types.uint64)
Cuda_atomic_add = _gen(cuda.atomic.add, all_numba_types)
Cuda_atomic_sub = _gen(cuda.atomic.sub, all_numba_types)
Cuda_atomic_max = _gen(cuda.atomic.max, all_numba_types)
Cuda_atomic_min = _gen(cuda.atomic.min, all_numba_types)
Cuda_atomic_nanmax = _gen(cuda.atomic.nanmax, all_numba_types)
Cuda_atomic_nanmin = _gen(cuda.atomic.nanmin, all_numba_types)
Cuda_atomic_and = _gen(cuda.atomic.and_, integer_numba_types)
Cuda_atomic_or = _gen(cuda.atomic.or_, integer_numba_types)
Cuda_atomic_xor = _gen(cuda.atomic.xor, integer_numba_types)
Cuda_atomic_inc = _gen(cuda.atomic.inc, unsigned_int_numba_types)
Cuda_atomic_dec = _gen(cuda.atomic.dec, unsigned_int_numba_types)
Cuda_atomic_exch = _gen(cuda.atomic.exch, integer_numba_types)
@register
class Cuda_atomic_compare_and_swap(AbstractTemplate):
key = cuda.atomic.compare_and_swap
def generic(self, args, kws):
assert not kws
ary, old, val = args
dty = ary.dtype
if dty in integer_numba_types and ary.ndim == 1:
return signature(dty, ary, dty, dty)
@register
class Cuda_atomic_cas(AbstractTemplate):
key = cuda.atomic.cas
def generic(self, args, kws):
assert not kws
ary, idx, old, val = args
dty = ary.dtype
if dty not in integer_numba_types:
return
if ary.ndim == 1:
return signature(dty, ary, types.intp, dty, dty)
elif ary.ndim > 1:
return signature(dty, ary, idx, dty, dty)
@register
class Cuda_nanosleep(ConcreteTemplate):
key = cuda.nanosleep
cases = [signature(types.void, types.uint32)]
@register_attr
class Dim3_attrs(AttributeTemplate):
key = dim3
def resolve_x(self, mod):
return types.int32
def resolve_y(self, mod):
return types.int32
def resolve_z(self, mod):
return types.int32
@register_attr
class CudaSharedModuleTemplate(AttributeTemplate):
key = types.Module(cuda.shared)
def resolve_array(self, mod):
return types.Function(Cuda_shared_array)
@register_attr
class CudaConstModuleTemplate(AttributeTemplate):
key = types.Module(cuda.const)
def resolve_array_like(self, mod):
return types.Function(Cuda_const_array_like)
@register_attr
class CudaLocalModuleTemplate(AttributeTemplate):
key = types.Module(cuda.local)
def resolve_array(self, mod):
return types.Function(Cuda_local_array)
@register_attr
class CudaAtomicTemplate(AttributeTemplate):
key = types.Module(cuda.atomic)
def resolve_add(self, mod):
return types.Function(Cuda_atomic_add)
def resolve_sub(self, mod):
return types.Function(Cuda_atomic_sub)
def resolve_and_(self, mod):
return types.Function(Cuda_atomic_and)
def resolve_or_(self, mod):
return types.Function(Cuda_atomic_or)
def resolve_xor(self, mod):
return types.Function(Cuda_atomic_xor)
def resolve_inc(self, mod):
return types.Function(Cuda_atomic_inc)
def resolve_dec(self, mod):
return types.Function(Cuda_atomic_dec)
def resolve_exch(self, mod):
return types.Function(Cuda_atomic_exch)
def resolve_max(self, mod):
return types.Function(Cuda_atomic_max)
def resolve_min(self, mod):
return types.Function(Cuda_atomic_min)
def resolve_nanmin(self, mod):
return types.Function(Cuda_atomic_nanmin)
def resolve_nanmax(self, mod):
return types.Function(Cuda_atomic_nanmax)
def resolve_compare_and_swap(self, mod):
return types.Function(Cuda_atomic_compare_and_swap)
def resolve_cas(self, mod):
return types.Function(Cuda_atomic_cas)
@register_attr
class CudaFp16Template(AttributeTemplate):
key = types.Module(cuda.fp16)
def resolve_hadd(self, mod):
return types.Function(Cuda_hadd)
def resolve_hsub(self, mod):
return types.Function(Cuda_hsub)
def resolve_hmul(self, mod):
return types.Function(Cuda_hmul)
def resolve_hdiv(self, mod):
return hdiv_device
def resolve_hneg(self, mod):
return types.Function(Cuda_hneg)
def resolve_habs(self, mod):
return types.Function(Cuda_habs)
def resolve_hfma(self, mod):
return types.Function(Cuda_hfma)
def resolve_hsin(self, mod):
return hsin_device
def resolve_hcos(self, mod):
return hcos_device
def resolve_hlog(self, mod):
return hlog_device
def resolve_hlog10(self, mod):
return hlog10_device
def resolve_hlog2(self, mod):
return hlog2_device
def resolve_hexp(self, mod):
return hexp_device
def resolve_hexp10(self, mod):
return hexp10_device
def resolve_hexp2(self, mod):
return hexp2_device
def resolve_hfloor(self, mod):
return hfloor_device
def resolve_hceil(self, mod):
return hceil_device
def resolve_hsqrt(self, mod):
return hsqrt_device
def resolve_hrsqrt(self, mod):
return hrsqrt_device
def resolve_hrcp(self, mod):
return hrcp_device
def resolve_hrint(self, mod):
return hrint_device
def resolve_htrunc(self, mod):
return htrunc_device
def resolve_heq(self, mod):
return types.Function(Cuda_heq)
def resolve_hne(self, mod):
return types.Function(Cuda_hne)
def resolve_hge(self, mod):
return types.Function(Cuda_hge)
def resolve_hgt(self, mod):
return types.Function(Cuda_hgt)
def resolve_hle(self, mod):
return types.Function(Cuda_hle)
def resolve_hlt(self, mod):
return types.Function(Cuda_hlt)
def resolve_hmax(self, mod):
return types.Function(Cuda_hmax)
def resolve_hmin(self, mod):
return types.Function(Cuda_hmin)
@register_attr
class CudaModuleTemplate(AttributeTemplate):
key = types.Module(cuda)
def resolve_cg(self, mod):
return types.Module(cuda.cg)
def resolve_threadIdx(self, mod):
return dim3
def resolve_blockIdx(self, mod):
return dim3
def resolve_blockDim(self, mod):
return dim3
def resolve_gridDim(self, mod):
return dim3
def resolve_laneid(self, mod):
return types.int32
def resolve_shared(self, mod):
return types.Module(cuda.shared)
def resolve_popc(self, mod):
return types.Function(Cuda_popc)
def resolve_brev(self, mod):
return types.Function(Cuda_brev)
def resolve_clz(self, mod):
return types.Function(Cuda_clz)
def resolve_ffs(self, mod):
return types.Function(Cuda_ffs)
def resolve_fma(self, mod):
return types.Function(Cuda_fma)
def resolve_cbrt(self, mod):
return types.Function(Cuda_cbrt)
def resolve_threadfence(self, mod):
return types.Function(Cuda_threadfence_device)
def resolve_threadfence_block(self, mod):
return types.Function(Cuda_threadfence_block)
def resolve_threadfence_system(self, mod):
return types.Function(Cuda_threadfence_system)
def resolve_syncwarp(self, mod):
return types.Function(Cuda_syncwarp)
def resolve_shfl_sync_intrinsic(self, mod):
return types.Function(Cuda_shfl_sync_intrinsic)
def resolve_vote_sync_intrinsic(self, mod):
return types.Function(Cuda_vote_sync_intrinsic)
def resolve_match_any_sync(self, mod):
return types.Function(Cuda_match_any_sync)
def resolve_match_all_sync(self, mod):
return types.Function(Cuda_match_all_sync)
def resolve_activemask(self, mod):
return types.Function(Cuda_activemask)
def resolve_lanemask_lt(self, mod):
return types.Function(Cuda_lanemask_lt)
def resolve_selp(self, mod):
return types.Function(Cuda_selp)
def resolve_nanosleep(self, mod):
return types.Function(Cuda_nanosleep)
def resolve_atomic(self, mod):
return types.Module(cuda.atomic)
def resolve_fp16(self, mod):
return types.Module(cuda.fp16)
def resolve_const(self, mod):
return types.Module(cuda.const)
def resolve_local(self, mod):
return types.Module(cuda.local)
register_global(cuda, types.Module(cuda))
# NumPy
for func in trigonometric_functions:
register_numpy_ufunc(func, register_global)
for func in comparison_functions:
register_numpy_ufunc(func, register_global)
for func in bit_twiddling_functions:
register_numpy_ufunc(func, register_global)
for func in math_operations:
if func in ('log', 'log2', 'log10'):
register_numpy_ufunc(func, register_global)

View File

@@ -0,0 +1,9 @@
"""CUDA Driver
- Driver API binding
- NVVM API binding
- Device array implementation
"""
from numba.core import config
assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator'

View File

@@ -0,0 +1,904 @@
"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object. If it exists and evaluates to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
"""
import math
import functools
import operator
import copy
from ctypes import c_void_p
import numpy as np
import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices, dummyarray
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.np.numpy_support import numpy_version
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn
try:
lru_cache = getattr(functools, 'lru_cache')(None)
except AttributeError:
# Python 3.1 or lower
def lru_cache(func):
return func
def is_cuda_ndarray(obj):
"Check if an object is a CUDA ndarray"
return getattr(obj, '__cuda_ndarray__', False)
def verify_cuda_ndarray_interface(obj):
"Verify the CUDA ndarray interface for an obj"
require_cuda_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_cuda_ndarray(obj):
"Raises ValueError is is_cuda_ndarray(obj) evaluates False"
if not is_cuda_ndarray(obj):
raise ValueError('require an cuda ndarray object')
class DeviceNDArrayBase(_devicearray.DeviceArray):
"""A on GPU NDArray representation
"""
__cuda_memory__ = True
__cuda_ndarray__ = True # There must be gpu_data attribute
def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as np.dtype coercible object.
stream
cuda stream.
gpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
dtype = np.dtype(dtype)
self.ndim = len(shape)
if len(strides) != self.ndim:
raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = dtype
self.size = int(functools.reduce(operator.mul, self.shape, 1))
# prepare gpu memory
if self.size > 0:
if gpu_data is None:
self.alloc_size = _driver.memory_size_from_info(
self.shape, self.strides, self.dtype.itemsize)
gpu_data = devices.get_context().memalloc(self.alloc_size)
else:
self.alloc_size = _driver.device_memory_size(gpu_data)
else:
# Make NULL pointer for empty allocation
if _driver.USE_NV_BINDING:
null = _driver.binding.CUdeviceptr(0)
else:
null = c_void_p(0)
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
pointer=null, size=0)
self.alloc_size = 0
self.gpu_data = gpu_data
self.stream = stream
@property
def __cuda_array_interface__(self):
if _driver.USE_NV_BINDING:
if self.device_ctypes_pointer is not None:
ptr = int(self.device_ctypes_pointer)
else:
ptr = 0
else:
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0
return {
'shape': tuple(self.shape),
'strides': None if is_contiguous(self) else tuple(self.strides),
'data': (ptr, False),
'typestr': self.dtype.str,
'stream': int(self.stream) if self.stream != 0 else None,
'version': 3,
}
def bind(self, stream=0):
"""Bind a CUDA stream to this object so that all subsequent operation
on this array defaults to the given stream.
"""
clone = copy.copy(self)
clone.stream = stream
return clone
@property
def T(self):
return self.transpose()
def transpose(self, axes=None):
if axes and tuple(axes) == tuple(range(self.ndim)):
return self
elif self.ndim != 2:
msg = "transposing a non-2D DeviceNDArray isn't supported"
raise NotImplementedError(msg)
elif axes is not None and set(axes) != set(range(self.ndim)):
raise ValueError("invalid axes list %r" % (axes,))
else:
from numba.cuda.kernels.transpose import transpose
return transpose(self)
def _default_stream(self, stream):
return self.stream if not stream else stream
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
# Typing considerations:
#
# 1. The preference is to use 'C' or 'F' layout since this enables
# hardcoding stride values into compiled kernels, which is more
# efficient than storing a passed-in value in a register.
#
# 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
# the more likely / common case.
#
# 3. If an array is broadcast then it must be typed as 'A' - using 'C'
# or 'F' does not apply for broadcast arrays, because the strides, some
# of which will be 0, will not match those hardcoded in for 'C' or 'F'
# layouts.
broadcast = 0 in self.strides
if self.flags['C_CONTIGUOUS'] and not broadcast:
layout = 'C'
elif self.flags['F_CONTIGUOUS'] and not broadcast:
layout = 'F'
else:
layout = 'A'
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, layout)
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.gpu_data is None:
if _driver.USE_NV_BINDING:
return _driver.binding.CUdeviceptr(0)
else:
return c_void_p(0)
else:
return self.gpu_data.device_ctypes_pointer
@devices.require_context
def copy_to_device(self, ary, stream=0):
"""Copy `ary` to `self`.
If `ary` is CUDA memory, perform a device-to-device transfer.
Otherwise, perform a host-to-device transfer.
"""
if ary.size == 0:
# Nothing to do
return
sentry_contiguous(self)
stream = self._default_stream(stream)
self_core, ary_core = array_core(self), array_core(ary)
if _driver.is_device_memory(ary):
sentry_contiguous(ary)
check_array_compatibility(self_core, ary_core)
_driver.device_to_device(self, ary, self.alloc_size, stream=stream)
else:
# Ensure same contiguity. Only makes a host-side copy if necessary
# (i.e., in order to materialize a writable strided view)
ary_core = np.array(
ary_core,
order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
subok=True,
copy=(not ary_core.flags['WRITEABLE'])
if numpy_version < (2, 0) else None)
check_array_compatibility(self_core, ary_core)
_driver.host_to_device(self, ary_core, self.alloc_size,
stream=stream)
@devices.require_context
def copy_to_host(self, ary=None, stream=0):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
If a CUDA ``stream`` is given, then the transfer will be made
asynchronously as part of the given stream. Otherwise, the transfer is
synchronous: the function returns after the copy is finished.
Always returns the host array.
Example::
import numpy as np
from numba import cuda
arr = np.arange(1000)
d_arr = cuda.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if any(s < 0 for s in self.strides):
msg = 'D->H copy not implemented for negative strides: {}'
raise NotImplementedError(msg.format(self.strides))
assert self.alloc_size >= 0, "Negative memory size"
stream = self._default_stream(stream)
if ary is None:
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else:
check_array_compatibility(self, ary)
hostary = ary
if self.alloc_size != 0:
_driver.device_to_host(hostary, self, self.alloc_size,
stream=stream)
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
return hostary
def split(self, section, stream=0):
"""Split the array into equal partition of the `section` size.
If the array cannot be equally divided, the last section will be
smaller.
"""
stream = self._default_stream(stream)
if self.ndim != 1:
raise ValueError("only support 1d array")
if self.strides[0] != self.dtype.itemsize:
raise ValueError("only support unit stride")
nsect = int(math.ceil(float(self.size) / section))
strides = self.strides
itemsize = self.dtype.itemsize
for i in range(nsect):
begin = i * section
end = min(begin + section, self.size)
shape = (end - begin,)
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
gpu_data=gpu_data)
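# Illustrative sketch (not part of the original source): split returns views
# over the same allocation, e.g.
#
#     d_arr = cuda.to_device(np.arange(10))
#     parts = list(d_arr.split(4))   # three views of lengths 4, 4 and 2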
def as_cuda_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.gpu_data
def get_ipc_handle(self):
"""
Returns an *IpcArrayHandle* object that is safe to serialize and transfer
to another process to share the local allocation.
Note: this feature is only available on Linux.
"""
ipch = devices.get_context().get_ipc_handle(self.gpu_data)
desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
def squeeze(self, axis=None, stream=0):
"""
Remove axes of size one from the array shape.
Parameters
----------
axis : None or int or tuple of ints, optional
Subset of dimensions to remove. A `ValueError` is raised if an axis
with size greater than one is selected. If `None`, all axes with
size one are removed.
stream : cuda stream or 0, optional
Default stream for the returned view of the array.
Returns
-------
DeviceNDArray
Squeezed view into the array.
"""
new_dummy, _ = self._dummy.squeeze(axis=axis)
return DeviceNDArray(
shape=new_dummy.shape,
strides=new_dummy.strides,
dtype=self.dtype,
stream=self._default_stream(stream),
gpu_data=self.gpu_data,
)
def view(self, dtype):
"""Returns a new object by reinterpretting the dtype without making a
copy of the data.
"""
dtype = np.dtype(dtype)
shape = list(self.shape)
strides = list(self.strides)
if self.dtype.itemsize != dtype.itemsize:
if not self.is_c_contiguous():
raise ValueError(
"To change to a dtype of a different size,"
" the array must be C-contiguous"
)
shape[-1], rem = divmod(
shape[-1] * self.dtype.itemsize,
dtype.itemsize
)
if rem != 0:
raise ValueError(
"When changing to a larger dtype,"
" its size must be a divisor of the total size in bytes"
" of the last axis of the array."
)
strides[-1] = dtype.itemsize
return DeviceNDArray(
shape=shape,
strides=strides,
dtype=dtype,
stream=self.stream,
gpu_data=self.gpu_data,
)
@property
def nbytes(self):
# Note: not using `alloc_size`. `alloc_size` reports memory
# consumption of the allocation, not the size of the array
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
return self.dtype.itemsize * self.size
class DeviceRecord(DeviceNDArrayBase):
'''
An on-GPU record type
'''
def __init__(self, dtype, stream=0, gpu_data=None):
shape = ()
strides = ()
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
gpu_data)
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
return numpy_support.from_dtype(self.dtype)
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
typ, offset = self.dtype.fields[item]
newdata = self.gpu_data.view(offset)
if typ.shape == ():
if typ.names is not None:
return DeviceRecord(dtype=typ, stream=stream,
gpu_data=newdata)
else:
hostary = np.empty(1, dtype=typ)
_driver.device_to_host(dst=hostary, src=newdata,
size=typ.itemsize,
stream=stream)
return hostary[0]
else:
shape, strides, dtype = \
prepare_shape_strides_dtype(typ.shape,
None,
typ.subdtype[0], 'C')
return DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=newdata,
stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the record didn't have a default stream, and the user didn't
# provide a stream, then we will use the default stream for the
# assignment kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
typ, offset = self.dtype.fields[key]
newdata = self.gpu_data.view(offset)
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
# (2) prepare RHS
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
# (3) do the copy
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
if synchronous:
stream.synchronize()
@lru_cache
def _assign_kernel(ndim):
"""
A separate function so we don't need to compile code on every assignment.
:param ndim: We need static array sizes for cuda.local.array, so the
number of dimensions is baked into the kernel
"""
from numba import cuda # circular!
if ndim == 0:
# the (2, ndim) allocation below is not yet supported, so avoid it
@cuda.jit
def kernel(lhs, rhs):
lhs[()] = rhs[()]
return kernel
@cuda.jit
def kernel(lhs, rhs):
location = cuda.grid(1)
n_elements = 1
for i in range(lhs.ndim):
n_elements *= lhs.shape[i]
if location >= n_elements:
# bake n_elements into the kernel, better than passing it in
# as another argument.
return
# [0, :] is the to-index (into `lhs`)
# [1, :] is the from-index (into `rhs`)
idx = cuda.local.array(
shape=(2, ndim),
dtype=types.int64)
for i in range(ndim - 1, -1, -1):
idx[0, i] = location % lhs.shape[i]
idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
location //= lhs.shape[i]
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
return kernel
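# Worked example (added for illustration): for lhs.shape == (2, 3), thread
# location 4 decomposes right-to-left into idx[0] == (1, 1); if rhs was
# broadcast from shape (1, 3), the (rhs.shape[i] > 1) factor zeroes the
# first rhs index, giving idx[1] == (0, 1).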
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-GPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def __array__(self, dtype=None):
"""
:return: a `numpy.ndarray`; this copies the data to the host.
"""
if dtype:
return self.copy_to_host().__array__(dtype)
else:
return self.copy_to_host().__array__()
def __len__(self):
return self.shape[0]
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C', stream=0):
'''
Flattens a contiguous array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
exception.
'''
stream = self._default_stream(stream)
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data,
stream=stream)
else:
raise NotImplementedError("operation requires copying")
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
arr = self._dummy.__getitem__(item)
extents = list(arr.iter_contiguous_extent())
cls = type(self)
if len(extents) == 1:
newdata = self.gpu_data.view(*extents[0])
if not arr.is_array:
# Check for structured array type (record)
if self.dtype.names is not None:
return DeviceRecord(dtype=self.dtype, stream=stream,
gpu_data=newdata)
else:
# Element indexing
hostary = np.empty(1, dtype=self.dtype)
_driver.device_to_host(dst=hostary, src=newdata,
size=self._dummy.itemsize,
stream=stream)
return hostary[0]
else:
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
else:
newdata = self.gpu_data.view(*arr.extent)
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the array didn't have a default stream, and the user didn't provide
# a stream, then we will use the default stream for the assignment
# kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
arr = self._dummy.__getitem__(key)
newdata = self.gpu_data.view(*arr.extent)
if isinstance(arr, dummyarray.Element):
# convert to a 0d array
shape = ()
strides = ()
else:
shape = arr.shape
strides = arr.strides
lhs = type(self)(
shape=shape,
strides=strides,
dtype=self.dtype,
gpu_data=newdata,
stream=stream)
# (2) prepare RHS
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
if rhs.ndim > lhs.ndim:
raise ValueError("Can't assign %s-D array to %s-D self" % (
rhs.ndim,
lhs.ndim))
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
# negative indices would not work if rhs.ndim == 0
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
rhs = rhs.reshape(*rhs_shape)
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
if r != 1 and l != r:
raise ValueError("Can't copy sequence with size %d to array "
"axis %d with dimension %d" % ( r, i, l))
# (3) do the copy
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
if synchronous:
stream.synchronize()
class IpcArrayHandle(object):
"""
An IPC array handle that can be serialized and transferred to another
process on the same machine to share a GPU allocation.
On the destination process, use the *.open()* method to create a new
*DeviceNDArray* object that shares the allocation from the original process.
To release the resources, call the *.close()* method. After that, the
destination can no longer use the shared array object. (Note: the
underlying weakref to the resource is now dead.)
This object implements the context-manager interface that calls the
*.open()* and *.close()* method automatically::
with the_ipc_array_handle as ipc_array:
# use ipc_array here as a normal gpu array object
some_code(ipc_array)
# ipc_array is dead at this point
"""
def __init__(self, ipc_handle, array_desc):
self._array_desc = array_desc
self._ipc_handle = ipc_handle
def open(self):
"""
Returns a new *DeviceNDArray* that shares the allocation from the
original process. Must not be used on the original process.
"""
dptr = self._ipc_handle.open(devices.get_context())
return DeviceNDArray(gpu_data=dptr, **self._array_desc)
def close(self):
"""
Closes the IPC handle to the array.
"""
self._ipc_handle.close()
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA mapped memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA managed memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
def from_array_like(ary, stream=0, gpu_data=None):
"Create a DeviceNDArray object that is like ary."
return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
gpu_data=gpu_data)
def from_record_like(rec, stream=0, gpu_data=None):
"Create a DeviceRecord object that is like rec."
return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
def array_core(ary):
"""
Extract the repeated core of a broadcast array.
Broadcast arrays are by definition non-contiguous due to repeated
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
contiguity and copy the underlying data from such arrays, we must create
a view without the repeated dimensions.
"""
if not ary.strides or not ary.size:
return ary
core_index = []
for stride in ary.strides:
core_index.append(0 if stride == 0 else slice(None))
return ary[tuple(core_index)]
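# Worked example (added for illustration): an array broadcast to shape
# (4, 3) with strides (0, 8) has core index (0, slice(None)), so array_core
# returns the single repeated row of shape (3,).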
def is_contiguous(ary):
"""
Returns True iff `ary` is C-style contiguous while ignoring
broadcasted and 1-sized dimensions.
As opposed to array_core(), it does not call require_context(),
which can be quite expensive.
"""
size = ary.dtype.itemsize
for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
if shape > 1 and stride != 0:
if size != stride:
return False
size *= shape
return True
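# Worked example (added for illustration): for shape (3, 1, 4), itemsize 8
# and strides (32, 999, 8), the size-1 middle dimension is ignored, the last
# stride equals the itemsize and the first equals 8 * 4 == 32, so
# is_contiguous returns True.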
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def sentry_contiguous(ary):
core = array_core(ary)
if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, stream=0, copy=True, user_explicit=False):
"""
Create a DeviceRecord or DeviceArray like obj and optionally copy data from
host to device. If obj already represents device memory, it is returned and
no copy is made.
"""
if _driver.is_device_memory(obj):
return obj, False
elif hasattr(obj, '__cuda_array_interface__'):
return numba.cuda.as_cuda_array(obj), False
else:
if isinstance(obj, np.void):
devobj = from_record_like(obj, stream=stream)
else:
# This allows you to pass non-array objects like constants and
# objects implementing the array interface
# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
# into this function (with no overhead -- no copies -- for `obj`s
# that are already `ndarray`s).
obj = np.array(
obj,
copy=False if numpy_version < (2, 0) else None,
subok=True)
sentry_contiguous(obj)
devobj = from_array_like(obj, stream=stream)
if copy:
if config.CUDA_WARN_ON_IMPLICIT_COPY:
if (
not user_explicit and
(not isinstance(obj, DeviceNDArray)
and isinstance(obj, np.ndarray))
):
msg = ("Host array used in CUDA kernel will incur "
"copy overhead to/from device.")
warn(NumbaPerformanceWarning(msg))
devobj.copy_to_device(obj, stream=stream)
return devobj, True
def check_array_compatibility(ary1, ary2):
ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
if ary1.dtype != ary2.dtype:
raise TypeError('incompatible dtype: %s vs. %s' %
(ary1.dtype, ary2.dtype))
if ary1sq.shape != ary2sq.shape:
raise ValueError('incompatible shape: %s vs. %s' %
(ary1.shape, ary2.shape))
# We check strides only if the size is nonzero, because strides are
# irrelevant (and can differ) for zero-length copies.
if ary1.size and ary1sq.strides != ary2sq.strides:
raise ValueError('incompatible strides: %s vs. %s' %
(ary1.strides, ary2.strides))

View File

@@ -0,0 +1,248 @@
"""
Expose each GPU device directly.
This module implements an API that is like the "CUDA runtime" context manager
for managing the CUDA context stack and cleanup. It relies on thread-local globals
to separate the context stack management of each thread. Contexts are also
shareable among threads. Only the main thread can destroy Contexts.
Note:
- This module must be imported by the main thread.
"""
import functools
import threading
from contextlib import contextmanager
from .driver import driver, USE_NV_BINDING
class _DeviceList(object):
def __getattr__(self, attr):
# First time looking at "lst" attribute.
if attr == "lst":
# Device list is not initialized.
# Query all CUDA devices.
numdev = driver.get_device_count()
gpus = [_DeviceContextManager(driver.get_device(devid))
for devid in range(numdev)]
# Define "lst" to avoid re-initialization
self.lst = gpus
return gpus
# Other attributes
return super(_DeviceList, self).__getattr__(attr)
def __getitem__(self, devnum):
'''
Returns the context manager for device *devnum*.
'''
return self.lst[devnum]
def __str__(self):
return ', '.join([str(d) for d in self.lst])
def __iter__(self):
return iter(self.lst)
def __len__(self):
return len(self.lst)
@property
def current(self):
"""Returns the active device or None if there's no active device
"""
with driver.get_active_context() as ac:
devnum = ac.devnum
if devnum is not None:
return self[devnum]
class _DeviceContextManager(object):
"""
Provides a context manager for executing in the context of the chosen
device. The normal use of instances of this type is from
``numba.cuda.gpus``. For example, to execute on device 2::
with numba.cuda.gpus[2]:
d_a = numba.cuda.to_device(a)
to copy the array *a* onto device 2, referred to by *d_a*.
"""
def __init__(self, device):
self._device = device
def __getattr__(self, item):
return getattr(self._device, item)
def __enter__(self):
_runtime.get_or_create_context(self._device.id)
def __exit__(self, exc_type, exc_val, exc_tb):
# this will verify that we are popping the right device context.
self._device.get_primary_context().pop()
def __str__(self):
return "<Managed Device {self.id}>".format(self=self)
class _Runtime(object):
"""Emulate the CUDA runtime context management.
It owns all Devices and Contexts.
Keeps at most one Context per Device
"""
def __init__(self):
self.gpus = _DeviceList()
# For caching the attached CUDA Context
self._tls = threading.local()
# Remember the main thread
# Only the main thread can *actually* destroy
self._mainthread = threading.current_thread()
# Avoid mutation of runtime state in multithreaded programs
self._lock = threading.RLock()
@contextmanager
def ensure_context(self):
"""Ensure a CUDA context is available inside the context.
On entrance, queries the CUDA driver for an active CUDA context and
attaches it in TLS for subsequent calls so they do not need to query
the CUDA driver again. On exit, detach the CUDA context from the TLS.
This will allow us to pickup thirdparty activated CUDA context in
any top-level Numba CUDA API.
"""
with driver.get_active_context():
oldctx = self._get_attached_context()
newctx = self.get_or_create_context(None)
self._set_attached_context(newctx)
try:
yield
finally:
self._set_attached_context(oldctx)
def get_or_create_context(self, devnum):
"""Returns the primary context and push+create it if needed
for *devnum*. If *devnum* is None, use the active CUDA context (must
be primary) or create a new one with ``devnum=0``.
"""
if devnum is None:
attached_ctx = self._get_attached_context()
if attached_ctx is None:
return self._get_or_create_context_uncached(devnum)
else:
return attached_ctx
else:
if USE_NV_BINDING:
devnum = int(devnum)
return self._activate_context_for(devnum)
def _get_or_create_context_uncached(self, devnum):
"""See also ``get_or_create_context(devnum)``.
This version does not read the cache.
"""
with self._lock:
# Try to get the active context in the CUDA stack or
# activate GPU-0 with the primary context
with driver.get_active_context() as ac:
if not ac:
return self._activate_context_for(0)
else:
# Get primary context for the active device
ctx = self.gpus[ac.devnum].get_primary_context()
# Is active context the primary context?
if USE_NV_BINDING:
ctx_handle = int(ctx.handle)
ac_ctx_handle = int(ac.context_handle)
else:
ctx_handle = ctx.handle.value
ac_ctx_handle = ac.context_handle.value
if ctx_handle != ac_ctx_handle:
msg = ('Numba cannot operate on non-primary'
' CUDA context {:x}')
raise RuntimeError(msg.format(ac_ctx_handle))
# Ensure the context is ready
ctx.prepare_for_use()
return ctx
def _activate_context_for(self, devnum):
with self._lock:
gpu = self.gpus[devnum]
newctx = gpu.get_primary_context()
# Detect unexpected context switch
cached_ctx = self._get_attached_context()
if cached_ctx is not None and cached_ctx is not newctx:
raise RuntimeError('Cannot switch CUDA-context.')
newctx.push()
return newctx
def _get_attached_context(self):
return getattr(self._tls, 'attached_context', None)
def _set_attached_context(self, ctx):
self._tls.attached_context = ctx
def reset(self):
"""Clear all contexts in the thread. Destroy the context if and only
if we are in the main thread.
"""
# Pop all active contexts.
while driver.pop_active_context() is not None:
pass
# If it is the main thread
if threading.current_thread() == self._mainthread:
self._destroy_all_contexts()
def _destroy_all_contexts(self):
# Reset all devices
for gpu in self.gpus:
gpu.reset()
_runtime = _Runtime()
# ================================ PUBLIC API ================================
gpus = _runtime.gpus
def get_context(devnum=None):
"""Get the current device or use a device by device number, and
return the CUDA context.
"""
return _runtime.get_or_create_context(devnum)
def require_context(fn):
"""
A decorator that ensures a CUDA context is available when *fn* is executed.
Note: The function *fn* cannot switch CUDA-context.
"""
@functools.wraps(fn)
def _require_cuda_context(*args, **kws):
with _runtime.ensure_context():
return fn(*args, **kws)
return _require_cuda_context
def reset():
"""Reset the CUDA subsystem for the current thread.
In the main thread:
This removes all CUDA contexts. Only use this at shutdown or for
cleaning up between tests.
In non-main threads:
This clears the CUDA context stack only.
"""
_runtime.reset()
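# Illustrative sketch (not part of the original source): require_context
# typically guards top-level API entry points, e.g.
#
#     @require_context
#     def query_memory():
#         # a context is guaranteed to be active here
#         return get_context().get_memory_info()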

File diff suppressed because it is too large

View File

@@ -0,0 +1,394 @@
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)
from numba.cuda.cudadrv import _extras
cu_device = c_int
cu_device_attribute = c_int # enum
cu_context = c_void_p # an opaque handle
cu_module = c_void_p # an opaque handle
cu_jit_option = c_int # enum
cu_jit_input_type = c_int # enum
cu_function = c_void_p # an opaque handle
cu_device_ptr = c_size_t # defined as unsigned long long
cu_stream = c_void_p # an opaque handle
cu_event = c_void_p
cu_link_state = c_void_p
cu_function_attribute = c_int
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide
cu_uuid = (c_byte * 16) # Device UUID
cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)
cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)
# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
CU_STREAM_DEFAULT = 0
CU_STREAM_LEGACY = 1
CU_STREAM_PER_THREAD = 2
API_PROTOTYPES = {
# CUresult cuInit(unsigned int Flags);
'cuInit' : (c_int, c_uint),
# CUresult cuDriverGetVersion (int* driverVersion )
'cuDriverGetVersion': (c_int, POINTER(c_int)),
# CUresult cuDeviceGetCount(int *count);
'cuDeviceGetCount': (c_int, POINTER(c_int)),
# CUresult cuDeviceGet(CUdevice *device, int ordinal);
'cuDeviceGet': (c_int, POINTER(cu_device), c_int),
# CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),
# CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
# CUdevice dev);
'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
cu_device),
# CUresult cuDeviceComputeCapability(int *major, int *minor,
# CUdevice dev);
'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
cu_device),
# CUresult cuDevicePrimaryCtxGetState(
# CUdevice dev,
# unsigned int* flags,
# int* active)
'cuDevicePrimaryCtxGetState': (c_int,
cu_device, POINTER(c_uint), POINTER(c_int)),
# CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
'cuDevicePrimaryCtxRelease': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
'cuDevicePrimaryCtxReset': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),
# CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),
# CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
# CUdevice dev);
'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),
# CUresult cuCtxGetDevice ( CUdevice * device )
'cuCtxGetDevice': (c_int, POINTER(cu_device)),
# CUresult cuCtxGetCurrent (CUcontext *pctx);
'cuCtxGetCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxPushCurrent (CUcontext pctx);
'cuCtxPushCurrent': (c_int, cu_context),
# CUresult cuCtxPopCurrent (CUcontext *pctx);
'cuCtxPopCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxDestroy(CUcontext pctx);
'cuCtxDestroy': (c_int, cu_context),
# CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
# unsigned int numOptions,
# CUjit_option *options,
# void **optionValues);
'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult cuModuleUnload(CUmodule hmod);
'cuModuleUnload': (c_int, cu_module),
# CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
# const char *name);
'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),
# CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
# hmod, const char* name )
'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
cu_module, c_char_p),
# CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
# CUfunc_cache config);
'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),
# CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),
# CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
# unsigned int flags);
'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),
# CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
# size_t N, CUstream hStream);
'cuMemsetD8Async': (c_int,
cu_device_ptr, c_uint8, c_size_t, cu_stream),
# CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount);
'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),
# CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount, CUstream hStream);
'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount);
'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount);
'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemFree(CUdeviceptr dptr);
'cuMemFree': (c_int, cu_device_ptr),
# CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),
# CUresult cuStreamDestroy(CUstream hStream);
'cuStreamDestroy': (c_int, cu_stream),
# CUresult cuStreamSynchronize(CUstream hStream);
'cuStreamSynchronize': (c_int, cu_stream),
# CUresult cuStreamAddCallback(
# CUstream hStream,
# CUstreamCallback callback,
# void* userData,
# unsigned int flags)
'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
py_object, c_uint),
# CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams,
# void ** extra)
'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p), POINTER(c_void_p)),
# CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams)
'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p)),
# CUresult cuMemHostAlloc ( void ** pp,
# size_t bytesize,
# unsigned int Flags
# )
'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemFreeHost ( void * p )
'cuMemFreeHost': (c_int, c_void_p),
# CUresult cuMemHostRegister(void * p,
# size_t bytesize,
# unsigned int Flags)
'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemHostUnregister(void * p)
'cuMemHostUnregister': (c_int, c_void_p),
# CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
# void * p,
# unsigned int Flags)
'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
c_void_p, c_uint),
# CUresult cuMemGetInfo(size_t * free, size_t * total)
'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)),
# CUresult cuEventCreate ( CUevent * phEvent,
# unsigned int Flags )
'cuEventCreate': (c_int, POINTER(cu_event), c_uint),
# CUresult cuEventDestroy ( CUevent hEvent )
'cuEventDestroy': (c_int, cu_event),
# CUresult cuEventElapsedTime ( float * pMilliseconds,
# CUevent hStart,
# CUevent hEnd )
'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),
# CUresult cuEventQuery ( CUevent hEvent )
'cuEventQuery': (c_int, cu_event),
# CUresult cuEventRecord ( CUevent hEvent,
# CUstream hStream )
'cuEventRecord': (c_int, cu_event, cu_stream),
# CUresult cuEventSynchronize ( CUevent hEvent )
'cuEventSynchronize': (c_int, cu_event),
# CUresult cuStreamWaitEvent ( CUstream hStream,
# CUevent hEvent,
# unsigned int Flags )
'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),
# CUresult cuPointerGetAttribute (
# void *data,
# CUpointer_attribute attribute,
# CUdeviceptr ptr)
'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),
# CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
# size_t * psize,
# CUdeviceptr dptr
# )
'cuMemGetAddressRange': (c_int,
POINTER(cu_device_ptr),
POINTER(c_size_t),
cu_device_ptr),
# CUresult cuMemHostGetFlags ( unsigned int * pFlags,
# void * p )
'cuMemHostGetFlags': (c_int,
POINTER(c_uint),
c_void_p),
# CUresult cuCtxSynchronize ( void )
'cuCtxSynchronize' : (c_int,),
# CUresult
# cuLinkCreate(unsigned int numOptions, CUjit_option *options,
# void **optionValues, CUlinkState *stateOut);
'cuLinkCreate': (c_int,
c_uint, POINTER(cu_jit_option),
POINTER(c_void_p), POINTER(cu_link_state)),
# CUresult
# cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
# size_t size, const char *name, unsigned
# int numOptions, CUjit_option *options,
# void **optionValues);
'cuLinkAddData': (c_int,
cu_link_state, cu_jit_input_type, c_void_p,
c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
POINTER(c_void_p)),
# CUresult
# cuLinkAddFile(CUlinkState state, CUjitInputType type,
# const char *path, unsigned int numOptions,
# CUjit_option *options, void **optionValues);
'cuLinkAddFile': (c_int,
cu_link_state, cu_jit_input_type, c_char_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult CUDAAPI
# cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
'cuLinkComplete': (c_int,
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),
# CUresult CUDAAPI
# cuLinkDestroy(CUlinkState state)
'cuLinkDestroy': (c_int, cu_link_state),
# cuProfilerStart ( void )
'cuProfilerStart': (c_int,),
# cuProfilerStop ( void )
'cuProfilerStop': (c_int,),
# CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
# CUfunction hfunc )
'cuFuncGetAttribute': (c_int,
POINTER(c_int), cu_function_attribute, cu_function),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize);
'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
cu_function, c_size_t,
c_uint),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize,
# unsigned int flags);
'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
POINTER(c_int),
cu_function,
c_size_t, c_uint),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit);
'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
cu_function, cu_occupancy_b2d_size,
c_size_t, c_int),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit,
# unsigned int flags);
'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
POINTER(c_int), cu_function,
cu_occupancy_b2d_size,
c_size_t, c_int, c_uint),
# CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
'cuIpcGetMemHandle': (c_int,
POINTER(cu_ipc_mem_handle), cu_device_ptr),
# CUresult cuIpcOpenMemHandle(
# CUdeviceptr* pdptr,
# CUipcMemHandle handle,
# unsigned int Flags)
'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
c_uint),
# CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
'cuIpcCloseMemHandle': (c_int, cu_device_ptr),
# CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags)
'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),
# CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
# CUdevice dev, CUdevice peerDev )
'cuDeviceCanAccessPeer': (c_int,
POINTER(c_int), cu_device, cu_device),
# CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
}
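# Note (assumption, for illustration): each entry maps a driver symbol to a
# ctypes prototype laid out as (restype, *argtypes); a binding layer can
# apply them along these lines:
#
#     fn = getattr(libcuda, 'cuDriverGetVersion')
#     restype, *argtypes = API_PROTOTYPES['cuDriverGetVersion']
#     fn.restype, fn.argtypes = restype, list(argtypes)
#     version = c_int()
#     err = fn(byref(version))   # CUresult; 0 (CUDA_SUCCESS) on success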

View File

@@ -0,0 +1,452 @@
from collections import namedtuple
import itertools
import functools
import operator
import ctypes
import numpy as np
from numba import _helperlib
Extent = namedtuple("Extent", ["begin", "end"])
attempt_nocopy_reshape = ctypes.CFUNCTYPE(
ctypes.c_int,
ctypes.c_long, # nd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # dims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # strides
ctypes.c_long, # newnd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newdims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides
ctypes.c_long, # itemsize
ctypes.c_int, # is_f_order
)(_helperlib.c_helpers['attempt_nocopy_reshape'])
class Dim(object):
"""A single dimension of the array
Attributes
----------
start:
start offset
stop:
stop offset
size:
number of items
stride:
item stride
"""
__slots__ = 'start', 'stop', 'size', 'stride', 'single'
def __init__(self, start, stop, size, stride, single):
self.start = start
self.stop = stop
self.size = size
self.stride = stride
self.single = single
assert not single or size == 1
def __getitem__(self, item):
if isinstance(item, slice):
start, stop, step = item.indices(self.size)
stride = step * self.stride
start = self.start + start * abs(self.stride)
stop = self.start + stop * abs(self.stride)
if stride == 0:
size = 1
else:
size = _compute_size(start, stop, stride)
ret = Dim(
start=start,
stop=stop,
size=size,
stride=stride,
single=False
)
return ret
else:
sliced = self[item:item + 1] if item != -1 else self[-1:]
if sliced.size != 1:
raise IndexError
return Dim(
start=sliced.start,
stop=sliced.stop,
size=sliced.size,
stride=sliced.stride,
single=True,
)
def get_offset(self, idx):
return self.start + idx * self.stride
def __repr__(self):
strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)"
return strfmt % (self.start, self.stop, self.size, self.stride)
def normalize(self, base):
return Dim(start=self.start - base, stop=self.stop - base,
size=self.size, stride=self.stride, single=self.single)
def copy(self, start=None, stop=None, size=None, stride=None, single=None):
if start is None:
start = self.start
if stop is None:
stop = self.stop
if size is None:
size = self.size
if stride is None:
stride = self.stride
if single is None:
single = self.single
return Dim(start, stop, size, stride, single)
def is_contiguous(self, itemsize):
return self.stride == itemsize
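# Illustrative example of Dim slicing: stepping a 10-item Dim of stride 4
# by 2 doubles the stride:
#
#   >>> d = Dim(start=0, stop=40, size=10, stride=4, single=False)
#   >>> d[2:8:2]
#   Dim(start=8, stop=32, size=3, stride=8)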
def compute_index(indices, dims):
return sum(d.get_offset(i) for i, d in zip(indices, dims))
class Element(object):
is_array = False
def __init__(self, extent):
self.extent = extent
def iter_contiguous_extent(self):
yield self.extent
class Array(object):
"""A dummy numpy array-like object. Consider it an array without the
actual data, but offset from the base data pointer.
Attributes
----------
dims: tuple of Dim
describing each dimension of the array
ndim: int
        number of dimensions
shape: tuple of int
size of each dimension
strides: tuple of int
stride of each dimension
itemsize: int
itemsize
extent: (start, end)
start and end offset containing the memory region
"""
is_array = True
@classmethod
def from_desc(cls, offset, shape, strides, itemsize):
dims = []
for ashape, astride in zip(shape, strides):
dim = Dim(offset, offset + ashape * astride, ashape, astride,
single=False)
dims.append(dim)
offset = 0 # offset only applies to first dimension
return cls(dims, itemsize)
def __init__(self, dims, itemsize):
self.dims = tuple(dims)
self.ndim = len(self.dims)
self.shape = tuple(dim.size for dim in self.dims)
self.strides = tuple(dim.stride for dim in self.dims)
self.itemsize = itemsize
self.size = functools.reduce(operator.mul, self.shape, 1)
self.extent = self._compute_extent()
self.flags = self._compute_layout()
def _compute_layout(self):
# The logic here is based on that in _UpdateContiguousFlags from
# numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit
# 13661ac70).
# https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191
# Records have no dims, and we can treat them as contiguous
if not self.dims:
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# If this is a broadcast array then it is not contiguous
if any([dim.stride == 0 for dim in self.dims]):
return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False}
flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# Check C contiguity
sd = self.itemsize
for dim in reversed(self.dims):
if dim.size == 0:
# Contiguous by definition
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
if dim.size != 1:
if dim.stride != sd:
flags['C_CONTIGUOUS'] = False
sd *= dim.size
# Check F contiguity
sd = self.itemsize
for dim in self.dims:
if dim.size != 1:
if dim.stride != sd:
flags['F_CONTIGUOUS'] = False
return flags
sd *= dim.size
return flags
def _compute_extent(self):
firstidx = [0] * self.ndim
lastidx = [s - 1 for s in self.shape]
start = compute_index(firstidx, self.dims)
stop = compute_index(lastidx, self.dims) + self.itemsize
stop = max(stop, start) # ensure positive extent
return Extent(start, stop)
def __repr__(self):
return '<Array dims=%s itemsize=%s>' % (self.dims, self.itemsize)
def __getitem__(self, item):
if not isinstance(item, tuple):
item = [item]
else:
item = list(item)
nitem = len(item)
ndim = len(self.dims)
if nitem > ndim:
raise IndexError("%d extra indices given" % (nitem - ndim,))
# Add empty slices for missing indices
while len(item) < ndim:
item.append(slice(None, None))
dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)]
newshape = [d.size for d in dims if not d.single]
arr = Array(dims, self.itemsize)
if newshape:
return arr.reshape(*newshape)[0]
else:
return Element(arr.extent)
@property
def is_c_contig(self):
return self.flags['C_CONTIGUOUS']
@property
def is_f_contig(self):
return self.flags['F_CONTIGUOUS']
def iter_contiguous_extent(self):
""" Generates extents
"""
if self.is_c_contig or self.is_f_contig:
yield self.extent
else:
if self.dims[0].stride < self.dims[-1].stride:
innerdim = self.dims[0]
outerdims = self.dims[1:]
outershape = self.shape[1:]
else:
innerdim = self.dims[-1]
outerdims = self.dims[:-1]
outershape = self.shape[:-1]
if innerdim.is_contiguous(self.itemsize):
oslen = [range(s) for s in outershape]
for indices in itertools.product(*oslen):
base = compute_index(indices, outerdims)
yield base + innerdim.start, base + innerdim.stop
else:
oslen = [range(s) for s in self.shape]
for indices in itertools.product(*oslen):
offset = compute_index(indices, self.dims)
yield offset, offset + self.itemsize
def reshape(self, *newdims, **kws):
oldnd = self.ndim
newnd = len(newdims)
if newdims == self.shape:
return self, None
order = kws.pop('order', 'C')
if kws:
raise TypeError('unknown keyword arguments %s' % kws.keys())
if order not in 'CFA':
raise ValueError('order not C|F|A')
# check for exactly one instance of -1 in newdims
# https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501
unknownidx = -1
knownsize = 1
for i, dim in enumerate(newdims):
if dim < 0:
if unknownidx == -1:
unknownidx = i
else:
raise ValueError("can only specify one unknown dimension")
else:
knownsize *= dim
# compute the missing dimension
if unknownidx >= 0:
if knownsize == 0 or self.size % knownsize != 0:
raise ValueError("cannot infer valid shape "
"for unknown dimension")
else:
newdims = newdims[0:unknownidx] \
+ (self.size // knownsize,) \
+ newdims[unknownidx + 1:]
newsize = functools.reduce(operator.mul, newdims, 1)
if order == 'A':
order = 'F' if self.is_f_contig else 'C'
if newsize != self.size:
raise ValueError("reshape changes the size of the array")
if self.is_c_contig or self.is_f_contig:
if order == 'C':
newstrides = list(iter_strides_c_contig(self, newdims))
elif order == 'F':
newstrides = list(iter_strides_f_contig(self, newdims))
else:
raise AssertionError("unreachable")
else:
newstrides = np.empty(newnd, np.ctypeslib.c_intp)
# need to keep these around in variables, not temporaries, so they
# don't get GC'ed before we call into the C code
olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp)
oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp)
newdims = np.array(newdims, dtype=np.ctypeslib.c_intp)
if not attempt_nocopy_reshape(
oldnd,
olddims,
oldstrides,
newnd,
newdims,
newstrides,
self.itemsize,
order == 'F',
):
raise NotImplementedError('reshape would require copy')
ret = self.from_desc(self.extent.begin, shape=newdims,
strides=newstrides, itemsize=self.itemsize)
return ret, list(self.iter_contiguous_extent())
def squeeze(self, axis=None):
newshape, newstrides = [], []
if axis is None:
for length, stride in zip(self.shape, self.strides):
if length != 1:
newshape.append(length)
newstrides.append(stride)
else:
if not isinstance(axis, tuple):
axis = (axis,)
for ax in axis:
if self.shape[ax] != 1:
raise ValueError(
"cannot select an axis to squeeze out which has size "
"not equal to one"
)
for i, (length, stride) in enumerate(zip(self.shape, self.strides)):
if i not in axis:
newshape.append(length)
newstrides.append(stride)
newarr = self.from_desc(
self.extent.begin,
shape=newshape,
strides=newstrides,
itemsize=self.itemsize,
)
return newarr, list(self.iter_contiguous_extent())
def ravel(self, order='C'):
if order not in 'CFA':
raise ValueError('order not C|F|A')
if (order in 'CA' and self.is_c_contig
or order in 'FA' and self.is_f_contig):
newshape = (self.size,)
newstrides = (self.itemsize,)
arr = self.from_desc(self.extent.begin, newshape, newstrides,
self.itemsize)
return arr, list(self.iter_contiguous_extent())
else:
raise NotImplementedError("ravel on non-contiguous array")
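# Illustrative example: a 3x4 C-contiguous layout with itemsize 8 occupies
# bytes [0, 96) and is C- but not F-contiguous:
#
#   >>> a = Array.from_desc(0, shape=(3, 4), strides=(32, 8), itemsize=8)
#   >>> a.is_c_contig, a.is_f_contig
#   (True, False)
#   >>> a.extent
#   Extent(begin=0, end=96)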
def iter_strides_f_contig(arr, shape=None):
    """yields the f-contiguous strides
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    yield itemsize
    accum = 1  # running product of dimension sizes; avoids shadowing sum()
    for s in shape[:-1]:
        accum *= s
        yield accum * itemsize
def iter_strides_c_contig(arr, shape=None):
    """yields the c-contiguous strides
    """
    shape = arr.shape if shape is None else shape
    itemsize = arr.itemsize
    def gen():
        yield itemsize
        accum = 1  # running product of dimension sizes; avoids shadowing sum()
        for s in reversed(shape[1:]):
            accum *= s
            yield accum * itemsize
    for i in reversed(list(gen())):
        yield i
def is_element_indexing(item, ndim):
if isinstance(item, slice):
return False
elif isinstance(item, tuple):
if len(item) == ndim:
if not any(isinstance(it, slice) for it in item):
return True
else:
return True
return False
def _compute_size(start, stop, step):
"""Algorithm adapted from cpython rangeobject.c
"""
if step > 0:
lo = start
hi = stop
else:
lo = stop
hi = start
step = -step
if lo >= hi:
return 0
return (hi - lo - 1) // step + 1
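# Illustrative example: range(0, 10, 3) covers 0, 3, 6 and 9, so
# _compute_size(0, 10, 3) == 4; a negative step mirrors the bounds, so
# _compute_size(10, 0, -3) == 4 as well.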

View File

@@ -0,0 +1,607 @@
"""
Enum values for CUDA driver. Information about the values
can be found on the official NVIDIA documentation website.
ref: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
anchor: #group__CUDA__TYPES
"""
# Error codes
CUDA_SUCCESS = 0
CUDA_ERROR_INVALID_VALUE = 1
CUDA_ERROR_OUT_OF_MEMORY = 2
CUDA_ERROR_NOT_INITIALIZED = 3
CUDA_ERROR_DEINITIALIZED = 4
CUDA_ERROR_PROFILER_DISABLED = 5
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
CUDA_ERROR_STUB_LIBRARY = 34
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
CUDA_ERROR_NO_DEVICE = 100
CUDA_ERROR_INVALID_DEVICE = 101
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
CUDA_ERROR_INVALID_IMAGE = 200
CUDA_ERROR_INVALID_CONTEXT = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
CUDA_ERROR_MAP_FAILED = 205
CUDA_ERROR_UNMAP_FAILED = 206
CUDA_ERROR_ARRAY_IS_MAPPED = 207
CUDA_ERROR_ALREADY_MAPPED = 208
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
CUDA_ERROR_ALREADY_ACQUIRED = 210
CUDA_ERROR_NOT_MAPPED = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
CUDA_ERROR_ECC_UNCORRECTABLE = 214
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
CUDA_ERROR_INVALID_PTX = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225
CUDA_ERROR_INVALID_SOURCE = 300
CUDA_ERROR_FILE_NOT_FOUND = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
CUDA_ERROR_OPERATING_SYSTEM = 304
CUDA_ERROR_INVALID_HANDLE = 400
CUDA_ERROR_ILLEGAL_STATE = 401
CUDA_ERROR_NOT_FOUND = 500
CUDA_ERROR_NOT_READY = 600
CUDA_ERROR_ILLEGAL_ADDRESS = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
CUDA_ERROR_LAUNCH_TIMEOUT = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
CUDA_ERROR_ASSERT = 710
CUDA_ERROR_TOO_MANY_PEERS = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
CUDA_ERROR_MISALIGNED_ADDRESS = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
CUDA_ERROR_INVALID_PC = 718
CUDA_ERROR_LAUNCH_FAILED = 719
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
CUDA_ERROR_NOT_PERMITTED = 800
CUDA_ERROR_NOT_SUPPORTED = 801
CUDA_ERROR_SYSTEM_NOT_READY = 802
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
CUDA_ERROR_MPS_RPC_FAILURE = 806
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
CUDA_ERROR_CDP_NOT_SUPPORTED = 811
CUDA_ERROR_CDP_VERSION_MISMATCH = 812
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
CUDA_ERROR_CAPTURED_EVENT = 907
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
CUDA_ERROR_TIMEOUT = 909
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
CUDA_ERROR_EXTERNAL_DEVICE = 911
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
CUDA_ERROR_UNKNOWN = 999
# Function cache configurations
# no preference for shared memory or L1 (default)
CU_FUNC_CACHE_PREFER_NONE = 0x00
# prefer larger shared memory and smaller L1 cache
CU_FUNC_CACHE_PREFER_SHARED = 0x01
# prefer larger L1 cache and smaller shared memory
CU_FUNC_CACHE_PREFER_L1 = 0x02
# prefer equal sized L1 cache and shared memory
CU_FUNC_CACHE_PREFER_EQUAL = 0x03
# Context creation flags
# Automatic scheduling
CU_CTX_SCHED_AUTO = 0x00
# Set spin as default scheduling
CU_CTX_SCHED_SPIN = 0x01
# Set yield as default scheduling
CU_CTX_SCHED_YIELD = 0x02
# Set blocking synchronization as default scheduling
CU_CTX_SCHED_BLOCKING_SYNC = 0x04
CU_CTX_SCHED_MASK = 0x07
# Support mapped pinned allocations
# This flag was deprecated as of CUDA 11.0 and no longer has any effect.
# All contexts as of CUDA 3.2 behave as though the flag is enabled.
CU_CTX_MAP_HOST = 0x08
# Keep local memory allocation after launch
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10
# Trigger coredumps from exceptions in this context
CU_CTX_COREDUMP_ENABLE = 0x20
# Enable user pipe to trigger coredumps in this context
CU_CTX_USER_COREDUMP_ENABLE = 0x40
# Force synchronous blocking on cudaMemcpy/cudaMemset
CU_CTX_SYNC_MEMOPS = 0x80
CU_CTX_FLAGS_MASK = 0xff
# DEFINES
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_DEVICEMAP = 0x02
# If set, host memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 streaming load instruction
# (MOVNTDQA).
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_WRITECOMBINED = 0x04
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_DEVICEMAP = 0x02
# If set, the passed memory pointer is treated as pointing to some
# memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
# On Windows the flag is a no-op. On Linux that memory is marked
# as non cache-coherent for the GPU and is expected
# to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED
# if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older
# Linux kernel versions. On all other platforms, it is not supported
# and CUDA_ERROR_NOT_SUPPORTED is returned.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_IOMEMORY = 0x04
# If set, the passed memory pointer is treated as pointing to memory
# that is considered read-only by the device. On platforms without
# CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
# this flag is required in order to register memory mapped
# to the CPU as read-only. Support for the use of this flag can be
# queried from the device attribute
# CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.
# Using this flag with a current context associated with a device
# that does not have this attribute set will cause cuMemHostRegister
# to error with CUDA_ERROR_NOT_SUPPORTED.
CU_MEMHOSTREGISTER_READ_ONLY = 0x08
# CUDA Mem Attach Flags
# If set, managed memory is accessible from all streams on all devices.
CU_MEM_ATTACH_GLOBAL = 0x01
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory is
# only accessible on the host (unless explicitly attached to a stream
# with cudaStreamAttachMemAsync, in which case it can be used in kernels
# launched on that stream).
CU_MEM_ATTACH_HOST = 0x02
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory accesses
# on the associated device must only be from a single stream.
CU_MEM_ATTACH_SINGLE = 0x04
# Event creation flags
# Default event flag
CU_EVENT_DEFAULT = 0x0
# Event uses blocking synchronization
CU_EVENT_BLOCKING_SYNC = 0x1
# Event will not record timing data
CU_EVENT_DISABLE_TIMING = 0x2
# Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
CU_EVENT_INTERPROCESS = 0x4
# Pointer information
# The CUcontext on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_CONTEXT = 1
# The CUmemorytype describing the physical location of a pointer
CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2
# The address at which a pointer's memory may be accessed on the device
CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3
# The address at which a pointer's memory may be accessed on the host
CU_POINTER_ATTRIBUTE_HOST_POINTER = 4
# A pair of tokens for use with the nv-p2p.h Linux kernel interface
CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5
# Synchronize every synchronous memory operation initiated on this region
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6
# A process-wide unique ID for an allocated memory region
CU_POINTER_ATTRIBUTE_BUFFER_ID = 7
# Indicates if the pointer points to managed memory
CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
# A device ordinal of a device on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
# 1 if this pointer maps to an allocation
# that is suitable for cudaIpcGetMemHandle, 0 otherwise
CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10
# Starting address for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11
# Size of the address range for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12
# 1 if this pointer is in a valid address range
# that is mapped to a backing allocation, 0 otherwise
CU_POINTER_ATTRIBUTE_MAPPED = 13
# Bitmask of allowed CUmemAllocationHandleType for this allocation
CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14
# 1 if the memory this pointer is referencing
# can be used with the GPUDirect RDMA API
CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15
# Returns the access flags the device associated
# with the current context has on the corresponding
# memory referenced by the pointer given
CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16
# Returns the mempool handle for the allocation
# if it was allocated from a mempool. Otherwise returns NULL
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17
# Size of the actual underlying mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18
# The start address of the mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19
# A process-wide unique id corresponding to the
# physical allocation the pointer belongs to
CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20
# Memory types
# Host memory
CU_MEMORYTYPE_HOST = 0x01
# Device memory
CU_MEMORYTYPE_DEVICE = 0x02
# Array memory
CU_MEMORYTYPE_ARRAY = 0x03
# Unified device or host memory
CU_MEMORYTYPE_UNIFIED = 0x04
# Device code formats
# Compiled device-class-specific device code
# Applicable options: none
CU_JIT_INPUT_CUBIN = 0
# PTX source code
# Applicable options: PTX compiler options
CU_JIT_INPUT_PTX = 1
# Bundle of multiple cubins and/or PTX of some device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_FATBINARY = 2
# Host object with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_OBJECT = 3
# Archive of host objects with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_LIBRARY = 4
CU_JIT_NUM_INPUT_TYPES = 6
# Online compiler and linker options
# Max number of registers that a thread may use.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_MAX_REGISTERS = 0
# IN: Specifies minimum number of threads per block to target compilation
# for
# OUT: Returns the number of threads the compiler actually targeted.
# This restricts the resource utilization of the compiler (e.g. max
# registers) such that a block with the given number of threads should be
# able to launch based on register limitations. Note, this option does not
# currently take into account any other resource limitations, such as
# shared memory utilization.
# Cannot be combined with ::CU_JIT_TARGET.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_THREADS_PER_BLOCK = 1
# Overwrites the option value with the total wall clock time, in
# milliseconds, spent in the compiler and linker
# Option type: float
# Applies to: compiler and linker
CU_JIT_WALL_TIME = 2
# Pointer to a buffer in which to print any log messages
# that are informational in nature (the buffer size is specified via
# option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER = 3
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4
# Pointer to a buffer in which to print any log messages that
# reflect errors (the buffer size is specified via option
# ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER = 5
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6
# Level of optimizations to apply to generated code (0 - 4), with 4
# being the default and highest level of optimizations.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_OPTIMIZATION_LEVEL = 7
# No option value required. Determines the target based on the current
# attached context (default)
# Option type: No option value needed
# Applies to: compiler and linker
CU_JIT_TARGET_FROM_CUCONTEXT = 8
# Target is chosen based on supplied ::CUjit_target. Cannot be
# combined with ::CU_JIT_THREADS_PER_BLOCK.
# Option type: unsigned int for enumerated type ::CUjit_target
# Applies to: compiler and linker
CU_JIT_TARGET = 9
# Specifies choice of fallback strategy if matching cubin is not found.
# Choice is based on supplied ::CUjit_fallback.
# Option type: unsigned int for enumerated type ::CUjit_fallback
# Applies to: compiler only
CU_JIT_FALLBACK_STRATEGY = 10
# Specifies whether to create debug information in output (-g)
# (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_GENERATE_DEBUG_INFO = 11
# Generate verbose log messages (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_LOG_VERBOSE = 12
# Generate line number information (-lineinfo) (0: false, default)
# Option type: int
# Applies to: compiler only
CU_JIT_GENERATE_LINE_INFO = 13
# Specifies whether to enable caching explicitly (-dlcm)
# Choice is based on supplied ::CUjit_cacheMode_enum.
# Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum
# Applies to: compiler only
CU_JIT_CACHE_MODE = 14
# CUfunction_attribute
# The maximum number of threads per block, beyond which a launch of the
# function would fail. This number depends on both the function and the
# device on which the function is currently loaded.
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0
# The size in bytes of statically-allocated shared memory required by
# this function. This does not include dynamically-allocated shared
# memory requested by the user at runtime.
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1
# The size in bytes of user-allocated constant memory required by this
# function.
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2
# The size in bytes of local memory used by each thread of this function.
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3
# The number of registers used by each thread of this function.
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
# The PTX virtual architecture version for which the function was
# compiled. This value is the major PTX version * 10 + the minor PTX
# version, so a PTX version 1.3 function would return the value 13.
# Note that this may return the undefined value of 0 for cubins
# compiled prior to CUDA 3.0.
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5
# The binary architecture version for which the function was compiled.
# This value is the major binary version * 10 + the minor binary version,
# so a binary version 1.3 function would return the value 13. Note that
# this will return a value of 10 for legacy cubins that do not have a
# properly-encoded binary architecture version.
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6
# The attribute to indicate whether the function has been compiled
# with user specified option "-Xptxas --dlcm=ca" set
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7
# The maximum size in bytes of dynamically-allocated shared memory
# that can be used by this function. If the user-specified
# dynamic shared memory size is larger than this value,
# the launch will fail. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8
# On devices where the L1 cache and shared memory use the same
# hardware resources, this sets the shared memory carveout preference,
# in percent of the total shared memory. Refer to
# CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
# This is only a hint, and the driver can choose a different ratio
# if required to execute the function.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9
# If this attribute is set, the kernel must launch with a valid cluster
# size specified. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10
# The required cluster width in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11
# The required cluster height in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12
# The required cluster depth in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13
# Whether the function can be launched with non-portable cluster size.
# 1 is allowed, 0 is disallowed. A non-portable cluster size may only
# function on the specific SKUs the program is tested on.
# The launch might fail if the program is run on a different hardware platform.
# For more details, see:
# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES
CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14
# The block scheduling policy of a function.
# The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15
# Device attributes
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_WIDTH = 21
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_WIDTH = 22
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_HEIGHT = 23
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH = 24
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT = 25
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH = 26
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_WIDTH = 27
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_HEIGHT = 28
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_LAYERS = 29
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTI_PROCESSOR = 39
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_WIDTH = 42
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_LAYERS = 43
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_WIDTH = 45
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_HEIGHT = 46
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH_ALT = 47
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT_ALT = 48
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH_ALT = 49
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_WIDTH = 52
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_WIDTH = 53
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_LAYERS = 54
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_WIDTH = 55
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_WIDTH = 56
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_HEIGHT = 57
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_WIDTH = 58
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_HEIGHT = 59
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_DEPTH = 60
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_WIDTH = 61
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_LAYERS = 62
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_WIDTH = 63
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_HEIGHT = 64
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_LAYERS = 65
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_WIDTH = 66
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_WIDTH = 67
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_LAYERS = 68
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LINEAR_WIDTH = 69
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_WIDTH = 70
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_HEIGHT = 71
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_PITCH = 72
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_WIDTH = 73
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_HEIGHT = 74
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_MIPMAPPED_WIDTH = 77
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83
CU_DEVICE_ATTRIBUTE_IS_MULTI_GPU_BOARD = 84
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97

View File

@@ -0,0 +1,36 @@
class CudaDriverError(Exception):
pass
class CudaRuntimeError(Exception):
pass
class CudaSupportError(ImportError):
pass
class NvvmError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvvmSupportError(ImportError):
pass
class NvvmWarning(Warning):
pass
class NvrtcError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvrtcCompilationError(NvrtcError):
pass
class NvrtcSupportError(ImportError):
pass

View File

@@ -0,0 +1,176 @@
"""CUDA Toolkit libraries lookup utilities.
CUDA Toolkit libraries can be available via either:
- the `cuda-nvcc` and `cuda-nvrtc` conda packages for CUDA 12,
- the `cudatoolkit` conda package for CUDA 11,
- a user supplied location from CUDA_HOME,
- a system-wide location,
- package-specific locations (e.g. the Debian NVIDIA packages),
- or can be discovered by the system loader.
"""
import os
import sys
import ctypes
from numba.misc.findlib import find_lib
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
from numba.cuda.cudadrv.error import CudaSupportError
if sys.platform == 'win32':
_dllnamepattern = '%s.dll'
_staticnamepattern = '%s.lib'
elif sys.platform == 'darwin':
_dllnamepattern = 'lib%s.dylib'
_staticnamepattern = 'lib%s.a'
else:
_dllnamepattern = 'lib%s.so'
_staticnamepattern = 'lib%s.a'
def get_libdevice():
d = get_cuda_paths()
paths = d['libdevice'].info
return paths
def open_libdevice():
with open(get_libdevice(), 'rb') as bcfile:
return bcfile.read()
def get_cudalib(lib, static=False):
"""
Find the path of a CUDA library based on a search of known locations. If
the search fails, return a generic filename for the library (e.g.
'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system
loader's search mechanism.
"""
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm'
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
libdir = get_cuda_paths()[dir_type].info
candidates = find_lib(lib, libdir, static=static)
namepattern = _staticnamepattern if static else _dllnamepattern
return max(candidates) if candidates else namepattern % lib
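# Illustrative usage (the path shown is an assumption, not a guarantee):
#
#   >>> get_cudalib('nvrtc')                        # doctest: +SKIP
#   '/usr/local/cuda/lib64/libnvrtc.so'
#
# If the search fails, a bare name such as 'libnvrtc.so' is returned so that
# open_cudalib() can still defer to the system loader.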
def open_cudalib(lib):
path = get_cudalib(lib)
return ctypes.CDLL(path)
def check_static_lib(path):
if not os.path.isfile(path):
raise FileNotFoundError(f'{path} not found')
def _get_source_variable(lib, static=False):
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].by
elif lib == 'libdevice':
return get_cuda_paths()['libdevice'].by
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
return get_cuda_paths()[dir_type].by
def test():
"""Test library lookup. Path info is printed to stdout.
"""
failed = False
# Check for the driver
try:
dlloader, candidates = locate_driver_and_loader()
print('Finding driver from candidates:')
for location in candidates:
print(f'\t{location}')
print(f'Using loader {dlloader}')
print('\tTrying to load driver', end='...')
dll, path = load_driver(dlloader, candidates)
print('\tok')
print(f'\t\tLoaded from {path}')
except CudaSupportError as e:
print(f'\tERROR: failed to open driver: {e}')
failed = True
# Find the absolute location of the driver on Linux. Various driver-related
# issues have been reported by WSL2 users, and it is almost always due to a
# Linux (i.e. non-WSL2) driver being installed in a WSL2 system.
# Providing the absolute location of the driver indicates its version
# number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to
# look up whether the driver was intended for "native" Linux.
if sys.platform == 'linux' and not failed:
pid = os.getpid()
mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps')
try:
with open(mapsfile) as f:
maps = f.read()
        # It's difficult to predict everything that might go wrong when
        # reading the maps file, so we catch OSError in the hope of covering
        # the various error conditions (file not found, not readable, etc.).
except OSError:
# It's helpful to report that this went wrong to the user, but we
# don't set failed to True because this doesn't have any connection
# to actual CUDA functionality.
print(f'\tERROR: Could not open {mapsfile} to determine absolute '
'path to libcuda.so')
else:
# In this case we could read the maps, so we can report the
# relevant ones to the user
locations = set(s for s in maps.split() if 'libcuda.so' in s)
print('\tMapped libcuda.so paths:')
for location in locations:
print(f'\t\t{location}')
# Checks for dynamic libraries
libs = 'nvvm nvrtc cudart'.split()
for lib in libs:
path = get_cudalib(lib)
print('Finding {} from {}'.format(lib, _get_source_variable(lib)))
print('\tLocated at', path)
try:
print('\tTrying to open library', end='...')
open_cudalib(lib)
print('\tok')
except OSError as e:
print('\tERROR: failed to open %s:\n%s' % (lib, e))
failed = True
# Check for cudadevrt (the only static library)
lib = 'cudadevrt'
path = get_cudalib(lib, static=True)
print('Finding {} from {}'.format(lib, _get_source_variable(lib,
static=True)))
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
print('\tERROR: failed to find %s:\n%s' % (lib, e))
failed = True
# Check for libdevice
where = _get_source_variable('libdevice')
print(f'Finding libdevice from {where}')
path = get_libdevice()
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
        print('\tERROR: failed to find libdevice:\n%s' % (e,))
failed = True
return not failed

View File

@@ -0,0 +1,20 @@
from numba.cuda.cudadrv import devices, driver
from numba.core.registry import cpu_target
def _calc_array_sizeof(ndim):
"""
Use the ABI size in the CPU target
"""
ctx = cpu_target.target_context
return ctx.calc_array_sizeof(ndim)
def ndarray_device_allocate_data(ary):
"""
Allocate gpu data buffer
"""
datasize = driver.host_memory_size(ary)
# allocate
gpu_data = devices.get_context().memalloc(datasize)
return gpu_data

View File

@@ -0,0 +1,260 @@
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
from enum import IntEnum
from numba.core import config
from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
NvrtcSupportError)
import functools
import os
import threading
import warnings
# Opaque handle for compilation unit
nvrtc_program = c_void_p
# Result code
nvrtc_result = c_int
class NvrtcResult(IntEnum):
NVRTC_SUCCESS = 0
NVRTC_ERROR_OUT_OF_MEMORY = 1
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
NVRTC_ERROR_INVALID_INPUT = 3
NVRTC_ERROR_INVALID_PROGRAM = 4
NVRTC_ERROR_INVALID_OPTION = 5
NVRTC_ERROR_COMPILATION = 6
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
NVRTC_ERROR_INTERNAL_ERROR = 11
_nvrtc_lock = threading.Lock()
class NvrtcProgram:
"""
A class for managing the lifetime of nvrtcProgram instances. Instances of
the class own an nvrtcProgram; when an instance is deleted, the underlying
nvrtcProgram is destroyed using the appropriate NVRTC API.
"""
def __init__(self, nvrtc, handle):
self._nvrtc = nvrtc
self._handle = handle
@property
def handle(self):
return self._handle
def __del__(self):
if self._handle:
self._nvrtc.destroy_program(self)
class NVRTC:
"""
Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
calls.
The sole instance of this class is a process-wide singleton, similar to the
NVVM interface. Initialization is protected by a lock and uses the standard
(for Numba) open_cudalib function to load the NVRTC library.
"""
_PROTOTYPES = {
# nvrtcResult nvrtcVersion(int *major, int *minor)
'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
# nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
# const char *src,
# const char *name,
# int numHeaders,
# const char * const *headers,
# const char * const *includeNames)
'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p,
c_int, POINTER(c_char_p), POINTER(c_char_p)),
# nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)),
# nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
# int numOptions,
# const char * const *options)
'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int,
POINTER(c_char_p)),
# nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
# size_t *cubinSizeRet);
'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
# size_t *logSizeRet);
'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program,
POINTER(c_size_t)),
# nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p),
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvrtc_lock:
if cls.__INSTANCE is None:
from numba.cuda.cudadrv.libs import open_cudalib
cls.__INSTANCE = inst = object.__new__(cls)
try:
lib = open_cudalib('nvrtc')
except OSError as e:
cls.__INSTANCE = None
raise NvrtcSupportError("NVRTC cannot be loaded") from e
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(lib, name)
func.restype = proto[0]
func.argtypes = proto[1:]
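                    # Bind func and name as default arguments so that each
                    # generated wrapper calls the function resolved on this
                    # loop iteration, not the one bound last by the closure.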
@functools.wraps(func)
def checked_call(*args, func=func, name=name):
error = func(*args)
if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
raise NvrtcCompilationError()
elif error != NvrtcResult.NVRTC_SUCCESS:
try:
error_name = NvrtcResult(error).name
except ValueError:
error_name = ('Unknown nvrtc_result '
f'(error code: {error})')
msg = f'Failed to call {name}: {error_name}'
raise NvrtcError(msg)
setattr(inst, name, checked_call)
return cls.__INSTANCE
def get_version(self):
"""
Get the NVRTC version as a tuple (major, minor).
"""
major = c_int()
minor = c_int()
self.nvrtcVersion(byref(major), byref(minor))
return major.value, minor.value
def create_program(self, src, name):
"""
Create an NVRTC program with managed lifetime.
"""
if isinstance(src, str):
src = src.encode()
if isinstance(name, str):
name = name.encode()
handle = nvrtc_program()
# The final three arguments are for passing the contents of headers -
# this is not supported, so there are 0 headers and the header names
# and contents are null.
self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
return NvrtcProgram(self, handle)
def compile_program(self, program, options):
"""
Compile an NVRTC program. Compilation may fail due to a user error in
the source; this function returns ``True`` if there is a compilation
error and ``False`` on success.
"""
# We hold a list of encoded options to ensure they can't be collected
# prior to the call to nvrtcCompileProgram
encoded_options = [opt.encode() for opt in options]
option_pointers = [c_char_p(opt) for opt in encoded_options]
c_options_type = (c_char_p * len(options))
c_options = c_options_type(*option_pointers)
try:
self.nvrtcCompileProgram(program.handle, len(options), c_options)
return False
except NvrtcCompilationError:
return True
def destroy_program(self, program):
"""
Destroy an NVRTC program.
"""
self.nvrtcDestroyProgram(byref(program.handle))
def get_compile_log(self, program):
"""
Get the compile log as a Python string.
"""
log_size = c_size_t()
self.nvrtcGetProgramLogSize(program.handle, byref(log_size))
log = (c_char * log_size.value)()
self.nvrtcGetProgramLog(program.handle, log)
return log.value.decode()
def get_ptx(self, program):
"""
Get the compiled PTX as a Python string.
"""
ptx_size = c_size_t()
self.nvrtcGetPTXSize(program.handle, byref(ptx_size))
ptx = (c_char * ptx_size.value)()
self.nvrtcGetPTX(program.handle, ptx)
return ptx.value.decode()
def compile(src, name, cc):
"""
Compile a CUDA C/C++ source to PTX for a given compute capability.
:param src: The source code to compile
:type src: str
:param name: The filename of the source (for information only)
:type name: str
:param cc: A tuple ``(major, minor)`` of the compute capability
:type cc: tuple
:return: The compiled PTX and compilation log
:rtype: tuple
"""
nvrtc = NVRTC()
program = nvrtc.create_program(src, name)
# Compilation options:
# - Compile for the current device's compute capability.
# - The CUDA include path is added.
# - Relocatable Device Code (rdc) is needed to prevent device functions
# being optimized away.
major, minor = cc
arch = f'--gpu-architecture=compute_{major}{minor}'
include = f'-I{config.CUDA_INCLUDE_PATH}'
cudadrv_path = os.path.dirname(os.path.abspath(__file__))
numba_cuda_path = os.path.dirname(cudadrv_path)
numba_include = f'-I{numba_cuda_path}'
options = [arch, include, numba_include, '-rdc', 'true']
# Compile the program
compile_error = nvrtc.compile_program(program, options)
# Get log from compilation
log = nvrtc.get_compile_log(program)
# If the compile failed, provide the log in an exception
if compile_error:
msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}')
raise NvrtcError(msg)
# Otherwise, if there's any content in the log, present it as a warning
if log:
msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
warnings.warn(msg)
ptx = nvrtc.get_ptx(program)
return ptx, log
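# Illustrative usage (the source and compute capability are made-up values):
#
#   >>> src = 'extern "C" __device__ int f(int *r, int x) { *r = x; return 0; }'
#   >>> ptx, log = compile(src, 'f.cu', (7, 5))     # doctest: +SKIP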

View File

@@ -0,0 +1,707 @@
"""
This is a direct translation of nvvm.h
"""
import logging
import re
import sys
import warnings
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
c_char)
import threading
from llvmlite import ir
from .error import NvvmError, NvvmSupportError, NvvmWarning
from .libs import get_libdevice, open_libdevice, open_cudalib
from numba.core import cgutils, config
logger = logging.getLogger(__name__)
ADDRSPACE_GENERIC = 0
ADDRSPACE_GLOBAL = 1
ADDRSPACE_SHARED = 3
ADDRSPACE_CONSTANT = 4
ADDRSPACE_LOCAL = 5
# Opaque handle for compilation unit
nvvm_program = c_void_p
# Result code
nvvm_result = c_int
RESULT_CODE_NAMES = '''
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
'''.split()
for i, k in enumerate(RESULT_CODE_NAMES):
setattr(sys.modules[__name__], k, i)
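# The loop above defines module-level constants from the names listed, e.g.
# NVVM_SUCCESS = 0 and NVVM_ERROR_COMPILATION = 9.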
# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
def is_available():
"""
    Return whether libNVVM is available.
"""
try:
NVVM()
except NvvmSupportError:
return False
else:
return True
_nvvm_lock = threading.Lock()
class NVVM(object):
'''Process-wide singleton.
'''
_PROTOTYPES = {
# nvvmResult nvvmVersion(int *major, int *minor)
'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmCreateProgram(nvvmProgram *cu)
'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
# size_t size, const char *name)
'nvvmAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
# const char* buffer,
# size_t size,
# const char *name)
'nvvmLazyAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
# const char **options)
'nvvmCompileProgram': (
nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
# nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetCompiledResultSize': (
nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
# const char** options)
'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
POINTER(c_char_p))
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvvm_lock:
if cls.__INSTANCE is None:
cls.__INSTANCE = inst = object.__new__(cls)
try:
inst.driver = open_cudalib('nvvm')
except OSError as e:
cls.__INSTANCE = None
errmsg = ("libNVVM cannot be found. Do `conda install "
"cudatoolkit`:\n%s")
raise NvvmSupportError(errmsg % e)
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(inst.driver, name)
func.restype = proto[0]
func.argtypes = proto[1:]
setattr(inst, name, func)
return cls.__INSTANCE
def __init__(self):
ir_versions = self.get_ir_version()
self._majorIR = ir_versions[0]
self._minorIR = ir_versions[1]
self._majorDbg = ir_versions[2]
self._minorDbg = ir_versions[3]
self._supported_ccs = get_supported_ccs()
@property
def data_layout(self):
if (self._majorIR, self._minorIR) < (1, 8):
return _datalayout_original
else:
return _datalayout_i128
@property
def supported_ccs(self):
return self._supported_ccs
def get_version(self):
major = c_int()
minor = c_int()
err = self.nvvmVersion(byref(major), byref(minor))
self.check_error(err, 'Failed to get version.')
return major.value, minor.value
def get_ir_version(self):
majorIR = c_int()
minorIR = c_int()
majorDbg = c_int()
minorDbg = c_int()
err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
byref(majorDbg), byref(minorDbg))
self.check_error(err, 'Failed to get IR version.')
return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value
def check_error(self, error, msg, exit=False):
if error:
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
if exit:
print(exc)
sys.exit(1)
else:
raise exc
class CompilationUnit(object):
def __init__(self):
self.driver = NVVM()
self._handle = nvvm_program()
err = self.driver.nvvmCreateProgram(byref(self._handle))
self.driver.check_error(err, 'Failed to create CU')
def __del__(self):
driver = NVVM()
err = driver.nvvmDestroyProgram(byref(self._handle))
driver.check_error(err, 'Failed to destroy CU', exit=True)
def add_module(self, buffer):
"""
        Add a module-level NVVM IR to a compilation unit.
        The buffer should contain an NVVM module IR either in the bitcode
        representation (LLVM 3.0) or in the text representation.
"""
err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def lazy_add_module(self, buffer):
"""
Lazily add an NVVM IR module to a compilation unit.
The buffer should contain NVVM module IR either in the bitcode
representation or in the text representation.
"""
err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def compile(self, **options):
"""Perform Compilation.
Compilation options are accepted as keyword arguments, with the
following considerations:
- Underscores (`_`) in option names are converted to dashes (`-`), to
match NVVM's option name format.
- Options that take a value will be emitted in the form
"-<name>=<value>".
- Booleans passed as option values will be converted to integers.
- Options which take no value (such as `-gen-lto`) should have a value
of `None` passed in and will be emitted in the form "-<name>".
For documentation on NVVM compilation options, see the CUDA Toolkit
Documentation:
https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
"""
def stringify_option(k, v):
k = k.replace('_', '-')
if v is None:
return f'-{k}'
if isinstance(v, bool):
v = int(v)
return f'-{k}={v}'
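        # For example, compile(opt=3, gen_lto=None, fma=True) produces the
        # option strings ['-opt=3', '-gen-lto', '-fma=1'].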
options = [stringify_option(k, v) for k, v in options.items()]
c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
for x in options])
# verify
err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to verify\n')
# compile
err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to compile\n')
# get result
reslen = c_size_t()
err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
self._try_error(err, 'Failed to get size of compiled result.')
output_buffer = (c_char * reslen.value)()
err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
self._try_error(err, 'Failed to get compiled result.')
# get log
self.log = self.get_log()
if self.log:
warnings.warn(self.log, category=NvvmWarning)
return output_buffer[:]
def _try_error(self, err, msg):
self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))
def get_log(self):
reslen = c_size_t()
err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
self.driver.check_error(err, 'Failed to get compilation log size.')
if reslen.value > 1:
logbuf = (c_char * reslen.value)()
err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
self.driver.check_error(err, 'Failed to get compilation log.')
            return logbuf.value.decode('utf8')
return ''
COMPUTE_CAPABILITIES = (
(3, 5), (3, 7),
(5, 0), (5, 2), (5, 3),
(6, 0), (6, 1), (6, 2),
(7, 0), (7, 2), (7, 5),
(8, 0), (8, 6), (8, 7), (8, 9),
(9, 0)
)
# Maps CTK version -> (min supported cc, max supported cc) inclusive
CTK_SUPPORTED = {
(11, 2): ((3, 5), (8, 6)),
(11, 3): ((3, 5), (8, 6)),
(11, 4): ((3, 5), (8, 7)),
(11, 5): ((3, 5), (8, 7)),
(11, 6): ((3, 5), (8, 7)),
(11, 7): ((3, 5), (8, 7)),
(11, 8): ((3, 5), (9, 0)),
(12, 0): ((5, 0), (9, 0)),
(12, 1): ((5, 0), (9, 0)),
(12, 2): ((5, 0), (9, 0)),
(12, 3): ((5, 0), (9, 0)),
(12, 4): ((5, 0), (9, 0)),
}
def ccs_supported_by_ctk(ctk_version):
try:
# For supported versions, we look up the range of supported CCs
min_cc, max_cc = CTK_SUPPORTED[ctk_version]
return tuple([cc for cc in COMPUTE_CAPABILITIES
if min_cc <= cc <= max_cc])
except KeyError:
# For unsupported CUDA toolkit versions, all we can do is assume all
# non-deprecated versions we are aware of are supported.
return tuple([cc for cc in COMPUTE_CAPABILITIES
if cc >= config.CUDA_DEFAULT_PTX_CC])
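# Illustrative examples: ccs_supported_by_ctk((11, 8)) yields every CC from
# (3, 5) through (9, 0), while ccs_supported_by_ctk((12, 0)) starts at (5, 0).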
def get_supported_ccs():
try:
from numba.cuda.cudadrv.runtime import runtime
cudart_version = runtime.get_version()
except: # noqa: E722
# We can't support anything if there's an error getting the runtime
# version (e.g. if it's not present or there's another issue)
_supported_cc = ()
return _supported_cc
# Ensure the minimum CTK version requirement is met
min_cudart = min(CTK_SUPPORTED)
if cudart_version < min_cudart:
_supported_cc = ()
ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
"required version.")
warnings.warn(unsupported_ver)
return _supported_cc
_supported_cc = ccs_supported_by_ctk(cudart_version)
return _supported_cc
def find_closest_arch(mycc):
"""
Given a compute capability, return the closest compute capability supported
by the CUDA toolkit.
:param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
:return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
"""
supported_ccs = NVVM().supported_ccs
if not supported_ccs:
msg = "No supported GPU compute capabilities found. " \
"Please check your cudatoolkit version matches your CUDA version."
raise NvvmSupportError(msg)
for i, cc in enumerate(supported_ccs):
if cc == mycc:
# Matches
return cc
elif cc > mycc:
# Exceeded
if i == 0:
# CC lower than supported
msg = "GPU compute capability %d.%d is not supported" \
"(requires >=%d.%d)" % (mycc + cc)
raise NvvmSupportError(msg)
else:
# return the previous CC
return supported_ccs[i - 1]
# CC higher than supported
return supported_ccs[-1] # Choose the highest
def get_arch_option(major, minor):
"""Matches with the closest architecture option
"""
if config.FORCE_CUDA_CC:
arch = config.FORCE_CUDA_CC
else:
arch = find_closest_arch((major, minor))
return 'compute_%d%d' % arch
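# Illustrative behaviour (assuming NVVM reports a CC list that includes
# (7, 0) and (7, 2), and that FORCE_CUDA_CC is unset): an exact match maps
# directly, while an in-between CC falls back to the closest lower one:
#
#     >>> get_arch_option(7, 0)
#     'compute_70'
#     >>> get_arch_option(7, 1)   # no exact match; previous CC is chosen
#     'compute_70'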
MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
Please ensure you have CUDA Toolkit 11.2 or higher.
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
$ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
For CUDA 11, ``cudatoolkit`` is required:
$ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
'''
class LibDevice(object):
_cache_ = None
def __init__(self):
if self._cache_ is None:
if get_libdevice() is None:
raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
            # Cache on the class so that subsequent instances reuse the
            # loaded bitcode
            type(self)._cache_ = open_libdevice()
self.bc = self._cache_
def get(self):
return self.bc
cas_nvvm = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501
# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%iptr = bitcast {T}* %ptr to {Ti}*
%old2 = load volatile {Ti}, {Ti}* %iptr
br label %attempt
attempt:
%old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
{CAS}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
%result = bitcast {Ti} %old to {T}
ret {T} %result
}}
""" # noqa: E501
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%ptrval = load volatile {T}, {T}* %ptr
; Return early when:
; - For nanmin / nanmax when val is a NaN
; - For min / max when val or ptr is a NaN
%early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
br i1 %early_return, label %done, label %lt_check
lt_check:
%dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
; Continue attempts if dold less or greater than val (depending on whether min or max)
; or if dold is NaN (for nanmin / nanmax)
%cmp = fcmp {OP} {T} %dold, %val
br i1 %cmp, label %attempt, label %done
attempt:
; Attempt to swap in the value
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%new = bitcast {T} %val to {Ti}
{CAS}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
done:
ret {T} %ptrval
}}
""" # noqa: E501
def ir_cas(Ti):
return cas_nvvm.format(Ti=Ti)
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_binary_template.format(**params)
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL,
FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_minmax_template.format(**params)
def ir_numba_atomic_inc(T, Tu):
return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
def ir_numba_atomic_dec(T, Tu):
return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
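# For example, expanding the binary template for a 32-bit float add yields a
# self-contained LLVM function (abridged below) that loops on cmpxchg until
# the compare-and-swap succeeds:
#
#     >>> print(ir_numba_atomic_binary(T='float', Ti='i32',
#     ...                              OP='fadd', FUNC='add'))
#     define internal float @___numba_atomic_float_add(float* %ptr, float %val) alwaysinline {
#     entry:
#     %iptr = bitcast float* %ptr to i32*
#     ...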
def llvm_replace(llvmir):
replacements = [
('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
ir_numba_atomic_inc(T='i64', Tu='u64')),
('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
ir_numba_atomic_dec(T='i64', Tu='u64')),
('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('immarg', '')
]
for decl, fn in replacements:
llvmir = llvmir.replace(decl, fn)
llvmir = llvm140_to_70_ir(llvmir)
return llvmir
def compile_ir(llvmir, **opts):
if isinstance(llvmir, str):
llvmir = [llvmir]
if opts.pop('fastmath', False):
opts.update({
'ftz': True,
'fma': True,
'prec_div': False,
'prec_sqrt': False,
})
cu = CompilationUnit()
libdevice = LibDevice()
for mod in llvmir:
mod = llvm_replace(mod)
cu.add_module(mod.encode('utf8'))
cu.lazy_add_module(libdevice.get())
return cu.compile(**opts)
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
def llvm140_to_70_ir(ir):
"""
Convert LLVM 14.0 IR for LLVM 7.0.
"""
buf = []
for line in ir.splitlines():
if line.startswith('attributes #'):
# Remove function attributes unsupported by LLVM 7.0
m = re_attributes_def.match(line)
attrs = m.group(1).split()
attrs = ' '.join(a for a in attrs if a != 'willreturn')
line = line.replace(m.group(1), attrs)
buf.append(line)
return '\n'.join(buf)
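# For instance, the unsupported `willreturn` attribute is stripped while the
# rest of the attribute group is preserved:
#
#     >>> llvm140_to_70_ir('attributes #0 = { alwaysinline willreturn }')
#     'attributes #0 = { alwaysinline }'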
def set_cuda_kernel(function):
"""
Mark a function as a CUDA kernel. Kernels have the following requirements:
- Metadata that marks them as a kernel.
- Addition to the @llvm.used list, so that they will not be discarded.
- The noinline attribute is not permitted, because this causes NVVM to emit
a warning, which counts as failing IR verification.
Presently it is assumed that there is one kernel per module, which holds
for Numba-jitted functions. If this changes in future or this function is
to be used externally, this function may need modification to add to the
@llvm.used list rather than creating it.
"""
module = function.module
# Add kernel metadata
mdstr = ir.MetaDataString(module, "kernel")
mdvalue = ir.Constant(ir.IntType(32), 1)
md = module.add_metadata((function, mdstr, mdvalue))
nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
nmd.add(md)
# Create the used list
ptrty = ir.IntType(8).as_pointer()
usedty = ir.ArrayType(ptrty, 1)
fnptr = function.bitcast(ptrty)
llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
llvm_used.linkage = 'appending'
llvm_used.section = 'llvm.metadata'
llvm_used.initializer = ir.Constant(usedty, [fnptr])
# Remove 'noinline' if it is present.
function.attributes.discard('noinline')
def add_ir_version(mod):
"""Add NVVM IR version to module"""
# We specify the IR version to match the current NVVM's IR version
i32 = ir.IntType(32)
ir_versions = [i32(v) for v in NVVM().get_ir_version()]
md_ver = mod.add_metadata(ir_versions)
mod.add_named_metadata('nvvmir.version', md_ver)

View File

@@ -0,0 +1,10 @@
"""
Declarations of the Runtime API functions.
"""
from ctypes import c_int, POINTER
API_PROTOTYPES = {
# cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion )
'cudaRuntimeGetVersion': (c_int, POINTER(c_int)),
}

View File

@@ -0,0 +1,142 @@
"""
CUDA Runtime wrapper.
This provides a very minimal set of bindings, since the Runtime API is not
really used in Numba except for querying the Runtime version.
"""
import ctypes
import functools
import sys
from numba.core import config
from numba.cuda.cudadrv.driver import ERROR_MAP, make_logger
from numba.cuda.cudadrv.error import CudaSupportError, CudaRuntimeError
from numba.cuda.cudadrv.libs import open_cudalib
from numba.cuda.cudadrv.rtapi import API_PROTOTYPES
from numba.cuda.cudadrv import enums
class CudaRuntimeAPIError(CudaRuntimeError):
"""
Raised when there is an error accessing a C API from the CUDA Runtime.
"""
def __init__(self, code, msg):
self.code = code
self.msg = msg
super().__init__(code, msg)
def __str__(self):
return "[%s] %s" % (self.code, self.msg)
class Runtime:
"""
Runtime object that lazily binds runtime API functions.
"""
def __init__(self):
self.is_initialized = False
def _initialize(self):
# lazily initialize logger
global _logger
_logger = make_logger()
if config.DISABLE_CUDA:
msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
"in the environment, or because CUDA is unsupported on "
"32-bit systems.")
raise CudaSupportError(msg)
self.lib = open_cudalib('cudart')
self.is_initialized = True
def __getattr__(self, fname):
# First request of a runtime API function
try:
proto = API_PROTOTYPES[fname]
except KeyError:
raise AttributeError(fname)
restype = proto[0]
argtypes = proto[1:]
if not self.is_initialized:
self._initialize()
# Find function in runtime library
libfn = self._find_api(fname)
libfn.restype = restype
libfn.argtypes = argtypes
safe_call = self._wrap_api_call(fname, libfn)
setattr(self, fname, safe_call)
return safe_call
def _wrap_api_call(self, fname, libfn):
@functools.wraps(libfn)
def safe_cuda_api_call(*args):
_logger.debug('call runtime api: %s', libfn.__name__)
retcode = libfn(*args)
self._check_error(fname, retcode)
return safe_cuda_api_call
def _check_error(self, fname, retcode):
if retcode != enums.CUDA_SUCCESS:
errname = ERROR_MAP.get(retcode, "cudaErrorUnknown")
msg = "Call to %s results in %s" % (fname, errname)
_logger.error(msg)
raise CudaRuntimeAPIError(retcode, msg)
def _find_api(self, fname):
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Delay missing function error to use
def absent_function(*args, **kws):
msg = "runtime missing function: %s."
raise CudaRuntimeError(msg % fname)
setattr(self, fname, absent_function)
return absent_function
def get_version(self):
"""
Returns the CUDA Runtime version as a tuple (major, minor).
"""
rtver = ctypes.c_int()
self.cudaRuntimeGetVersion(ctypes.byref(rtver))
# The version is encoded as (1000 * major) + (10 * minor)
major = rtver.value // 1000
minor = (rtver.value - (major * 1000)) // 10
return (major, minor)
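# Worked example of the decoding: cudart reports version 11.2 as the integer
# 11020, so major == 11020 // 1000 == 11 and
# minor == (11020 - 11000) // 10 == 2, giving (11, 2).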
def is_supported_version(self):
"""
Returns True if the CUDA Runtime is a supported version.
"""
return self.get_version() in self.supported_versions
@property
def supported_versions(self):
"""A tuple of all supported CUDA toolkit versions. Versions are given in
the form ``(major_version, minor_version)``."""
if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64:
# Only 64-bit Linux and Windows are supported
return ()
return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
(11, 7))
runtime = Runtime()
def get_version():
"""
Return the runtime version as a tuple of (major, minor)
"""
return runtime.get_version()

File diff suppressed because it is too large

View File

@@ -0,0 +1,140 @@
import math
from numba.core import types
from numba.core.typing.templates import ConcreteTemplate, signature, Registry
registry = Registry()
infer_global = registry.register_global
@infer_global(math.acos)
@infer_global(math.acosh)
@infer_global(math.asin)
@infer_global(math.asinh)
@infer_global(math.atan)
@infer_global(math.atanh)
@infer_global(math.cosh)
@infer_global(math.degrees)
@infer_global(math.erf)
@infer_global(math.erfc)
@infer_global(math.expm1)
@infer_global(math.gamma)
@infer_global(math.lgamma)
@infer_global(math.log1p)
@infer_global(math.radians)
@infer_global(math.sinh)
@infer_global(math.tanh)
@infer_global(math.tan)
class Math_unary(ConcreteTemplate):
cases = [
signature(types.float64, types.int64),
signature(types.float64, types.uint64),
signature(types.float32, types.float32),
signature(types.float64, types.float64),
]
@infer_global(math.sin)
@infer_global(math.cos)
@infer_global(math.ceil)
@infer_global(math.floor)
@infer_global(math.sqrt)
@infer_global(math.log)
@infer_global(math.log2)
@infer_global(math.log10)
@infer_global(math.exp)
@infer_global(math.fabs)
@infer_global(math.trunc)
class Math_unary_with_fp16(ConcreteTemplate):
cases = [
signature(types.float64, types.int64),
signature(types.float64, types.uint64),
signature(types.float32, types.float32),
signature(types.float64, types.float64),
signature(types.float16, types.float16),
]
@infer_global(math.atan2)
class Math_atan2(ConcreteTemplate):
key = math.atan2
cases = [
signature(types.float64, types.int64, types.int64),
signature(types.float64, types.uint64, types.uint64),
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.hypot)
class Math_hypot(ConcreteTemplate):
key = math.hypot
cases = [
signature(types.float64, types.int64, types.int64),
signature(types.float64, types.uint64, types.uint64),
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.copysign)
@infer_global(math.fmod)
class Math_binary(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.remainder)
class Math_remainder(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
@infer_global(math.pow)
class Math_pow(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
signature(types.float32, types.float32, types.int32),
signature(types.float64, types.float64, types.int32),
]
@infer_global(math.frexp)
class Math_frexp(ConcreteTemplate):
cases = [
signature(types.Tuple([types.float32, types.int32]), types.float32),
signature(types.Tuple([types.float64, types.int32]), types.float64),
]
@infer_global(math.ldexp)
class Math_ldexp(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.int32),
signature(types.float64, types.float64, types.int32),
]
@infer_global(math.isinf)
@infer_global(math.isnan)
@infer_global(math.isfinite)
class Math_isnan(ConcreteTemplate):
cases = [
signature(types.boolean, types.int64),
signature(types.boolean, types.uint64),
signature(types.boolean, types.float32),
signature(types.boolean, types.float64),
]
@infer_global(math.modf)
class Math_modf(ConcreteTemplate):
cases = [
signature(types.UniTuple(types.float64, 2), types.float64),
signature(types.UniTuple(types.float32, 2), types.float32)
]

View File

@@ -0,0 +1,191 @@
from warnings import warn
from numba.core import types, config, sigutils
from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning
from numba.cuda.compiler import declare_device_function
from numba.cuda.dispatcher import CUDADispatcher
from numba.cuda.simulator.kernel import FakeCUDAKernel
_msg_deprecated_signature_arg = ("Deprecated keyword argument `{0}`. "
"Signatures should be passed as the first "
"positional argument.")
def jit(func_or_sig=None, device=False, inline=False, link=None, debug=None,
opt=True, lineinfo=False, cache=False, **kws):
"""
JIT compile a Python function for CUDA GPUs.
:param func_or_sig: A function to JIT compile, or *signatures* of a
function to compile. If a function is supplied, then a
:class:`Dispatcher <numba.cuda.dispatcher.CUDADispatcher>` is returned.
Otherwise, ``func_or_sig`` may be a signature or a list of signatures,
and a function is returned. The returned function accepts another
function, which it will compile and then return a :class:`Dispatcher
<numba.cuda.dispatcher.CUDADispatcher>`. See :ref:`jit-decorator` for
more information about passing signatures.
.. note:: A kernel cannot have any return value.
:param device: Indicates whether this is a device function.
:type device: bool
:param link: A list of files containing PTX or CUDA C/C++ source to link
with the function
:type link: list
:param debug: If True, check for exceptions thrown when executing the
kernel. Since this degrades performance, this should only be used for
debugging purposes. If set to True, then ``opt`` should be set to False.
Defaults to False. (The default value can be overridden by setting
environment variable ``NUMBA_CUDA_DEBUGINFO=1``.)
:param fastmath: When True, enables fastmath optimizations as outlined in
the :ref:`CUDA Fast Math documentation <cuda-fast-math>`.
:param max_registers: Request that the kernel is limited to using at most
this number of registers per thread. The limit may not be respected if
the ABI requires a greater number of registers than that requested.
Useful for increasing occupancy.
:param opt: Whether to compile from LLVM IR to PTX with optimization
enabled. When ``True``, ``-opt=3`` is passed to NVVM. When
``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``.
:type opt: bool
:param lineinfo: If True, generate a line mapping between source code and
assembly code. This enables inspection of the source code in NVIDIA
profiling tools and correlation with program counter sampling.
:type lineinfo: bool
:param cache: If True, enables the file-based cache for this function.
:type cache: bool
"""
if link is None:
link = []
if link and config.ENABLE_CUDASIM:
raise NotImplementedError('Cannot link PTX in the simulator')
if kws.get('boundscheck'):
raise NotImplementedError("bounds checking is not supported for CUDA")
if kws.get('argtypes') is not None:
msg = _msg_deprecated_signature_arg.format('argtypes')
raise DeprecationError(msg)
if kws.get('restype') is not None:
msg = _msg_deprecated_signature_arg.format('restype')
raise DeprecationError(msg)
if kws.get('bind') is not None:
msg = _msg_deprecated_signature_arg.format('bind')
raise DeprecationError(msg)
debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
fastmath = kws.get('fastmath', False)
extensions = kws.get('extensions', [])
if debug and opt:
msg = ("debug=True with opt=True (the default) "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False.")
warn(NumbaInvalidConfigWarning(msg))
if debug and lineinfo:
msg = ("debug and lineinfo are mutually exclusive. Use debug to get "
"full debug info (this disables some optimizations), or "
"lineinfo for line info only with code generation unaffected.")
warn(NumbaInvalidConfigWarning(msg))
    if device and link:
raise ValueError("link keyword invalid for device function")
if sigutils.is_signature(func_or_sig):
signatures = [func_or_sig]
specialized = True
elif isinstance(func_or_sig, list):
signatures = func_or_sig
specialized = False
else:
signatures = None
if signatures is not None:
if config.ENABLE_CUDASIM:
def jitwrapper(func):
return FakeCUDAKernel(func, device=device, fastmath=fastmath)
return jitwrapper
def _jit(func):
targetoptions = kws.copy()
targetoptions['debug'] = debug
targetoptions['lineinfo'] = lineinfo
targetoptions['link'] = link
targetoptions['opt'] = opt
targetoptions['fastmath'] = fastmath
targetoptions['device'] = device
targetoptions['extensions'] = extensions
disp = CUDADispatcher(func, targetoptions=targetoptions)
if cache:
disp.enable_caching()
for sig in signatures:
argtypes, restype = sigutils.normalize_signature(sig)
if restype and not device and restype != types.void:
raise TypeError("CUDA kernel must have void return type.")
if device:
from numba.core import typeinfer
with typeinfer.register_dispatcher(disp):
disp.compile_device(argtypes, restype)
else:
disp.compile(argtypes)
disp._specialized = specialized
disp.disable_compile()
return disp
return _jit
else:
if func_or_sig is None:
if config.ENABLE_CUDASIM:
def autojitwrapper(func):
return FakeCUDAKernel(func, device=device,
fastmath=fastmath)
else:
def autojitwrapper(func):
return jit(func, device=device, debug=debug, opt=opt,
lineinfo=lineinfo, link=link, cache=cache, **kws)
return autojitwrapper
# func_or_sig is a function
else:
if config.ENABLE_CUDASIM:
return FakeCUDAKernel(func_or_sig, device=device,
fastmath=fastmath)
else:
targetoptions = kws.copy()
targetoptions['debug'] = debug
targetoptions['lineinfo'] = lineinfo
targetoptions['opt'] = opt
targetoptions['link'] = link
targetoptions['fastmath'] = fastmath
targetoptions['device'] = device
targetoptions['extensions'] = extensions
disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
if cache:
disp.enable_caching()
return disp
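# A minimal usage sketch of the decorator (assumes a CUDA-capable GPU and
# toolkit; `inc_by_one` is a hypothetical user kernel, not part of this
# module):
#
#     from numba import cuda
#     import numpy as np
#
#     @cuda.jit('void(float32[::1])')
#     def inc_by_one(x):
#         i = cuda.grid(1)
#         if i < x.size:
#             x[i] += 1
#
#     arr = np.zeros(128, dtype=np.float32)
#     inc_by_one[1, 128](arr)   # griddim=1, blockdim=128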
def declare_device(name, sig):
"""
Declare the signature of a foreign function. Returns a descriptor that can
be used to call the function from a Python kernel.
:param name: The name of the foreign function.
:type name: str
:param sig: The Numba signature of the function.
"""
argtypes, restype = sigutils.normalize_signature(sig)
if restype is None:
msg = 'Return type must be provided for device declarations'
raise TypeError(msg)
return declare_device_function(name, restype, argtypes)
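# Typical usage sketch: declare an externally-compiled device function and
# link its source when jitting a kernel. The names 'mul_f32' and 'mul.cu'
# are hypothetical:
#
#     mul = cuda.declare_device('mul_f32', 'float32(float32, float32)')
#
#     @cuda.jit(link=['mul.cu'])
#     def kernel(a, b, out):
#         i = cuda.grid(1)
#         if i < out.size:
#             out[i] = mul(a[i], b[i])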

View File

@@ -0,0 +1,33 @@
from numba.core.descriptors import TargetDescriptor
from numba.core.options import TargetOptions
from .target import CUDATargetContext, CUDATypingContext
class CUDATargetOptions(TargetOptions):
pass
class CUDATarget(TargetDescriptor):
def __init__(self, name):
self.options = CUDATargetOptions
# The typing and target contexts are initialized only when needed -
# this prevents an attempt to load CUDA libraries at import time on
# systems that might not have them present.
self._typingctx = None
self._targetctx = None
super().__init__(name)
@property
def typing_context(self):
if self._typingctx is None:
self._typingctx = CUDATypingContext()
return self._typingctx
@property
def target_context(self):
if self._targetctx is None:
self._targetctx = CUDATargetContext(self._typingctx)
return self._targetctx
cuda_target = CUDATarget('cuda')

View File

@@ -0,0 +1,89 @@
# Re export
import sys
from numba.cuda import cg
from .stubs import (threadIdx, blockIdx, blockDim, gridDim, laneid, warpsize,
syncwarp, shared, local, const, atomic,
shfl_sync_intrinsic, vote_sync_intrinsic, match_any_sync,
match_all_sync, threadfence_block, threadfence_system,
threadfence, selp, popc, brev, clz, ffs, fma, cbrt,
activemask, lanemask_lt, nanosleep, fp16,
_vector_type_stubs)
from .intrinsics import (grid, gridsize, syncthreads, syncthreads_and,
syncthreads_count, syncthreads_or)
from .cudadrv.error import CudaSupportError
from numba.cuda.cudadrv.driver import (BaseCUDAMemoryManager,
HostOnlyCUDAMemoryManager,
GetIpcHandleMixin, MemoryPointer,
MappedMemory, PinnedMemory, MemoryInfo,
IpcHandle, set_memory_manager)
from numba.cuda.cudadrv.runtime import runtime
from .cudadrv import nvvm
from numba.cuda import initialize
from .errors import KernelRuntimeError
from .decorators import jit, declare_device
from .api import *
from .api import _auto_device
from .args import In, Out, InOut
from .intrinsic_wrapper import (all_sync, any_sync, eq_sync, ballot_sync,
shfl_sync, shfl_up_sync, shfl_down_sync,
shfl_xor_sync)
from .kernels import reduction
reduce = Reduce = reduction.Reduce
# Expose vector type constructors and aliases as module level attributes.
for vector_type_stub in _vector_type_stubs:
setattr(sys.modules[__name__], vector_type_stub.__name__, vector_type_stub)
for alias in vector_type_stub.aliases:
setattr(sys.modules[__name__], alias, vector_type_stub)
del vector_type_stub, _vector_type_stubs
def is_available():
"""Returns a boolean to indicate the availability of a CUDA GPU.
This will initialize the driver if it hasn't been initialized.
"""
    # Whilst `driver.is_available` will initialize the driver itself, that
    # initialization may raise. Since `cuda.is_available` is often used as a
    # guard for whether to run a CUDA test, an exception here would break
    # test discovery/orchestration, so the try/except below handles this
    # case.
driver_is_available = False
try:
driver_is_available = driver.driver.is_available
except CudaSupportError:
pass
return driver_is_available and nvvm.is_available()
def is_supported_version():
"""Returns True if the CUDA Runtime is a supported version.
Unsupported versions (e.g. newer versions than those known to Numba)
may still work; this function provides a facility to check whether the
current Numba version is tested and known to work with the current
runtime version. If the current version is unsupported, the caller can
decide how to act. Options include:
- Continuing silently,
- Emitting a warning,
- Generating an error or otherwise preventing the use of CUDA.
"""
return runtime.is_supported_version()
def cuda_error():
"""Returns None if there was no error initializing the CUDA driver.
If there was an error initializing the driver, a string describing the
error is returned.
"""
return driver.driver.initialization_error
initialize.initialize_all()

View File

@@ -0,0 +1,919 @@
"""
Implements custom ufunc dispatch mechanism for non-CPU devices.
"""
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
import operator
import warnings
from functools import reduce
import numpy as np
from numba.np.ufunc.ufuncbuilder import _BaseUFuncBuilder, parse_identity
from numba.core import types, sigutils
from numba.core.typing import signature
from numba.np.ufunc.sigparse import parse_signature
def _broadcast_axis(a, b):
"""
Raises
------
ValueError if broadcast fails
"""
if a == b:
return a
elif a == 1:
return b
elif b == 1:
return a
else:
raise ValueError("failed to broadcast {0} and {1}".format(a, b))
def _pairwise_broadcast(shape1, shape2):
"""
Raises
------
ValueError if broadcast fails
"""
shape1, shape2 = map(tuple, [shape1, shape2])
while len(shape1) < len(shape2):
shape1 = (1,) + shape1
while len(shape1) > len(shape2):
shape2 = (1,) + shape2
return tuple(_broadcast_axis(a, b) for a, b in zip(shape1, shape2))
def _multi_broadcast(*shapelist):
"""
Raises
------
ValueError if broadcast fails
"""
assert shapelist
result = shapelist[0]
others = shapelist[1:]
try:
for i, each in enumerate(others, start=1):
result = _pairwise_broadcast(result, each)
except ValueError:
raise ValueError("failed to broadcast argument #{0}".format(i))
else:
return result
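# These helpers mirror NumPy broadcasting on shape tuples alone, e.g.:
#
#     >>> _pairwise_broadcast((3, 1), (4,))
#     (3, 4)
#     >>> _multi_broadcast((3, 1), (1, 4), (3, 4))
#     (3, 4)
#     >>> _pairwise_broadcast((3,), (4,))
#     Traceback (most recent call last):
#         ...
#     ValueError: failed to broadcast 3 and 4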
class UFuncMechanism(object):
"""
Prepare ufunc arguments for vectorize.
"""
DEFAULT_STREAM = None
SUPPORT_DEVICE_SLICING = False
def __init__(self, typemap, args):
"""Never used directly by user. Invoke by UFuncMechanism.call().
"""
self.typemap = typemap
self.args = args
nargs = len(self.args)
self.argtypes = [None] * nargs
self.scalarpos = []
self.signature = None
self.arrays = [None] * nargs
def _fill_arrays(self):
"""
Get all arguments in array form
"""
for i, arg in enumerate(self.args):
if self.is_device_array(arg):
self.arrays[i] = self.as_device_array(arg)
elif isinstance(arg, (int, float, complex, np.number)):
# Is scalar
self.scalarpos.append(i)
else:
self.arrays[i] = np.asarray(arg)
def _fill_argtypes(self):
"""
Get dtypes
"""
for i, ary in enumerate(self.arrays):
if ary is not None:
dtype = getattr(ary, 'dtype')
if dtype is None:
dtype = np.asarray(ary).dtype
self.argtypes[i] = dtype
def _resolve_signature(self):
"""Resolve signature.
May have ambiguous case.
"""
matches = []
# Resolve scalar args exact match first
if self.scalarpos:
# Try resolve scalar arguments
for formaltys in self.typemap:
match_map = []
for i, (formal, actual) in enumerate(zip(formaltys,
self.argtypes)):
if actual is None:
actual = np.asarray(self.args[i]).dtype
match_map.append(actual == formal)
if all(match_map):
matches.append(formaltys)
# No matching with exact match; try coercing the scalar arguments
if not matches:
matches = []
for formaltys in self.typemap:
all_matches = all(actual is None or formal == actual
for formal, actual in
zip(formaltys, self.argtypes))
if all_matches:
matches.append(formaltys)
if not matches:
raise TypeError("No matching version. GPU ufunc requires array "
"arguments to have the exact types. This behaves "
"like regular ufunc with casting='no'.")
if len(matches) > 1:
raise TypeError("Failed to resolve ufunc due to ambiguous "
"signature. Too many untyped scalars. "
"Use numpy dtype object to type tag.")
# Try scalar arguments
self.argtypes = matches[0]
def _get_actual_args(self):
"""Return the actual arguments
Casts scalar arguments to np.array.
"""
for i in self.scalarpos:
self.arrays[i] = np.array([self.args[i]], dtype=self.argtypes[i])
return self.arrays
def _broadcast(self, arys):
"""Perform numpy ufunc broadcasting
"""
shapelist = [a.shape for a in arys]
shape = _multi_broadcast(*shapelist)
for i, ary in enumerate(arys):
if ary.shape == shape:
pass
else:
if self.is_device_array(ary):
arys[i] = self.broadcast_device(ary, shape)
else:
ax_differs = [ax for ax in range(len(shape))
if ax >= ary.ndim
or ary.shape[ax] != shape[ax]]
missingdim = len(shape) - len(ary.shape)
strides = [0] * missingdim + list(ary.strides)
for ax in ax_differs:
strides[ax] = 0
strided = np.lib.stride_tricks.as_strided(ary,
shape=shape,
strides=strides)
arys[i] = self.force_array_layout(strided)
return arys
def get_arguments(self):
"""Prepare and return the arguments for the ufunc.
Does not call to_device().
"""
self._fill_arrays()
self._fill_argtypes()
self._resolve_signature()
arys = self._get_actual_args()
return self._broadcast(arys)
def get_function(self):
"""Returns (result_dtype, function)
"""
return self.typemap[self.argtypes]
def is_device_array(self, obj):
"""Is the `obj` a device array?
Override in subclass
"""
return False
def as_device_array(self, obj):
"""Convert the `obj` to a device array
Override in subclass
Default implementation is an identity function
"""
return obj
def broadcast_device(self, ary, shape):
"""Handles ondevice broadcasting
Override in subclass to add support.
"""
raise NotImplementedError("broadcasting on device is not supported")
def force_array_layout(self, ary):
"""Ensures array layout met device requirement.
Override in sublcass
"""
return ary
@classmethod
def call(cls, typemap, args, kws):
"""Perform the entire ufunc call mechanism.
"""
# Handle keywords
stream = kws.pop('stream', cls.DEFAULT_STREAM)
out = kws.pop('out', None)
if kws:
warnings.warn("unrecognized keywords: %s" % ', '.join(kws))
# Begin call resolution
cr = cls(typemap, args)
args = cr.get_arguments()
resty, func = cr.get_function()
outshape = args[0].shape
# Adjust output value
if out is not None and cr.is_device_array(out):
out = cr.as_device_array(out)
def attempt_ravel(a):
if cr.SUPPORT_DEVICE_SLICING:
raise NotImplementedError
try:
# Call the `.ravel()` method
return a.ravel()
except NotImplementedError:
# If it is not a device array
if not cr.is_device_array(a):
raise
# For device array, retry ravel on the host by first
# copying it back.
else:
hostary = cr.to_host(a, stream).ravel()
return cr.to_device(hostary, stream)
if args[0].ndim > 1:
args = [attempt_ravel(a) for a in args]
# Prepare argument on the device
devarys = []
any_device = False
for a in args:
if cr.is_device_array(a):
devarys.append(a)
any_device = True
else:
dev_a = cr.to_device(a, stream=stream)
devarys.append(dev_a)
# Launch
shape = args[0].shape
if out is None:
# No output is provided
devout = cr.allocate_device_array(shape, resty, stream=stream)
devarys.extend([devout])
cr.launch(func, shape[0], stream, devarys)
if any_device:
# If any of the arguments are on device,
# Keep output on the device
return devout.reshape(outshape)
else:
# Otherwise, transfer output back to host
return devout.copy_to_host().reshape(outshape)
elif cr.is_device_array(out):
# If output is provided and it is a device array,
# Return device array
if out.ndim > 1:
out = attempt_ravel(out)
devout = out
devarys.extend([devout])
cr.launch(func, shape[0], stream, devarys)
return devout.reshape(outshape)
else:
# If output is provided and it is a host array,
# Return host array
assert out.shape == shape
assert out.dtype == resty
devout = cr.allocate_device_array(shape, resty, stream=stream)
devarys.extend([devout])
cr.launch(func, shape[0], stream, devarys)
return devout.copy_to_host(out, stream=stream).reshape(outshape)
def to_device(self, hostary, stream):
"""Implement to device transfer
Override in subclass
"""
raise NotImplementedError
def to_host(self, devary, stream):
"""Implement to host transfer
Override in subclass
"""
raise NotImplementedError
def allocate_device_array(self, shape, dtype, stream):
"""Implements device allocation
Override in subclass
"""
raise NotImplementedError
def launch(self, func, count, stream, args):
"""Implements device function invocation
Override in subclass
"""
raise NotImplementedError
def to_dtype(ty):
if isinstance(ty, types.EnumMember):
ty = ty.dtype
return np.dtype(str(ty))
class DeviceVectorize(_BaseUFuncBuilder):
def __init__(self, func, identity=None, cache=False, targetoptions=None):
if targetoptions is None:
targetoptions = {}
if cache:
raise TypeError("caching is not supported")
for opt in targetoptions:
if opt == 'nopython':
warnings.warn("nopython kwarg for cuda target is redundant",
RuntimeWarning)
else:
fmt = "Unrecognized options. "
fmt += "cuda vectorize target does not support option: '%s'"
raise KeyError(fmt % opt)
self.py_func = func
self.identity = parse_identity(identity)
# { arg_dtype: (return_dtype), cudakernel }
self.kernelmap = OrderedDict()
@property
def pyfunc(self):
return self.py_func
def add(self, sig=None):
# compile core as device function
args, return_type = sigutils.normalize_signature(sig)
devfnsig = signature(return_type, *args)
funcname = self.pyfunc.__name__
kernelsource = self._get_kernel_source(self._kernel_template,
devfnsig, funcname)
corefn, return_type = self._compile_core(devfnsig)
glbl = self._get_globals(corefn)
sig = signature(types.void, *([a[:] for a in args] + [return_type[:]]))
exec(kernelsource, glbl)
stager = glbl['__vectorized_%s' % funcname]
kernel = self._compile_kernel(stager, sig)
argdtypes = tuple(to_dtype(t) for t in devfnsig.args)
resdtype = to_dtype(return_type)
self.kernelmap[tuple(argdtypes)] = resdtype, kernel
def build_ufunc(self):
raise NotImplementedError
def _get_kernel_source(self, template, sig, funcname):
args = ['a%d' % i for i in range(len(sig.args))]
fmts = dict(name=funcname,
args=', '.join(args),
argitems=', '.join('%s[__tid__]' % i for i in args))
return template.format(**fmts)
def _compile_core(self, sig):
raise NotImplementedError
def _get_globals(self, corefn):
raise NotImplementedError
def _compile_kernel(self, fnobj, sig):
raise NotImplementedError
class DeviceGUFuncVectorize(_BaseUFuncBuilder):
def __init__(
self,
func,
sig,
identity=None,
cache=False,
targetoptions=None,
writable_args=(),
):
if targetoptions is None:
targetoptions = {}
if cache:
raise TypeError("caching is not supported")
if writable_args:
raise TypeError("writable_args are not supported")
# Allow nopython flag to be set.
if not targetoptions.pop('nopython', True):
raise TypeError("nopython flag must be True")
# Are there any more target options?
if targetoptions:
opts = ', '.join([repr(k) for k in targetoptions.keys()])
fmt = "The following target options are not supported: {0}"
raise TypeError(fmt.format(opts))
self.py_func = func
self.identity = parse_identity(identity)
self.signature = sig
self.inputsig, self.outputsig = parse_signature(self.signature)
# Maps from a tuple of input_dtypes to (output_dtypes, kernel)
self.kernelmap = OrderedDict()
@property
def pyfunc(self):
return self.py_func
def add(self, sig=None):
indims = [len(x) for x in self.inputsig]
outdims = [len(x) for x in self.outputsig]
args, return_type = sigutils.normalize_signature(sig)
# It is only valid to specify types.none as a return type, or to not
# specify the return type (where the "Python None" is the return type)
valid_return_type = return_type in (types.none, None)
if not valid_return_type:
raise TypeError('guvectorized functions cannot return values: '
f'signature {sig} specifies {return_type} return '
'type')
funcname = self.py_func.__name__
src = expand_gufunc_template(self._kernel_template, indims,
outdims, funcname, args)
glbls = self._get_globals(sig)
exec(src, glbls)
fnobj = glbls['__gufunc_{name}'.format(name=funcname)]
outertys = list(_determine_gufunc_outer_types(args, indims + outdims))
kernel = self._compile_kernel(fnobj, sig=tuple(outertys))
nout = len(outdims)
dtypes = [np.dtype(str(t.dtype)) for t in outertys]
indtypes = tuple(dtypes[:-nout])
outdtypes = tuple(dtypes[-nout:])
self.kernelmap[indtypes] = outdtypes, kernel
def _compile_kernel(self, fnobj, sig):
raise NotImplementedError
def _get_globals(self, sig):
raise NotImplementedError
def _determine_gufunc_outer_types(argtys, dims):
for at, nd in zip(argtys, dims):
if isinstance(at, types.Array):
yield at.copy(ndim=nd + 1)
else:
if nd > 0:
raise ValueError("gufunc signature mismatch: ndim>0 for scalar")
yield types.Array(dtype=at, ndim=1, layout='A')
def expand_gufunc_template(template, indims, outdims, funcname, argtypes):
"""Expand gufunc source template
"""
argdims = indims + outdims
argnames = ["arg{0}".format(i) for i in range(len(argdims))]
checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a)
for a in argnames]))
inputs = [_gen_src_for_indexing(aref, adims, atype)
for aref, adims, atype in zip(argnames, indims, argtypes)]
outputs = [_gen_src_for_indexing(aref, adims, atype)
for aref, adims, atype in zip(argnames[len(indims):], outdims,
argtypes[len(indims):])]
argitems = inputs + outputs
src = template.format(name=funcname, args=', '.join(argnames),
checkedarg=checkedarg,
argitems=', '.join(argitems))
return src
def _gen_src_for_indexing(aref, adims, atype):
return "{aref}[{sliced}]".format(aref=aref,
sliced=_gen_src_index(adims, atype))
def _gen_src_index(adims, atype):
if adims > 0:
return ','.join(['__tid__'] + [':'] * adims)
elif isinstance(atype, types.Array) and atype.ndim - 1 == adims:
# Special case for 0-nd in shape-signature but
# 1d array in type signature.
# Slice it so that the result has the same dimension.
return '__tid__:(__tid__ + 1)'
else:
return '__tid__'
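# Examples of the generated index expressions (derivable from the code
# above): a 2-d core dimension indexes the loop axis and slices the core
# axes, while a true scalar indexes directly:
#
#     >>> _gen_src_index(2, None)
#     '__tid__,:,:'
#     >>> _gen_src_index(0, types.float32)
#     '__tid__'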
class GUFuncEngine(object):
'''Determine how to broadcast and execute a gufunc
    based on input shapes and signature.
'''
@classmethod
def from_signature(cls, signature):
return cls(*parse_signature(signature))
def __init__(self, inputsig, outputsig):
# signatures
self.sin = inputsig
self.sout = outputsig
# argument count
self.nin = len(self.sin)
self.nout = len(self.sout)
def schedule(self, ishapes):
if len(ishapes) != self.nin:
            raise TypeError('invalid number of input arguments')
# associate symbol values for input signature
symbolmap = {}
outer_shapes = []
inner_shapes = []
for argn, (shape, symbols) in enumerate(zip(ishapes, self.sin)):
argn += 1 # start from 1 for human
inner_ndim = len(symbols)
if len(shape) < inner_ndim:
fmt = "arg #%d: insufficient inner dimension"
raise ValueError(fmt % (argn,))
if inner_ndim:
inner_shape = shape[-inner_ndim:]
outer_shape = shape[:-inner_ndim]
else:
inner_shape = ()
outer_shape = shape
for axis, (dim, sym) in enumerate(zip(inner_shape, symbols)):
axis += len(outer_shape)
if sym in symbolmap:
if symbolmap[sym] != dim:
fmt = "arg #%d: shape[%d] mismatch argument"
raise ValueError(fmt % (argn, axis))
symbolmap[sym] = dim
outer_shapes.append(outer_shape)
inner_shapes.append(inner_shape)
# solve output shape
oshapes = []
for outsig in self.sout:
oshape = []
for sym in outsig:
oshape.append(symbolmap[sym])
oshapes.append(tuple(oshape))
# find the biggest outershape as looping dimension
sizes = [reduce(operator.mul, s, 1) for s in outer_shapes]
largest_i = np.argmax(sizes)
loopdims = outer_shapes[largest_i]
pinned = [False] * self.nin # same argument for each iteration
for i, d in enumerate(outer_shapes):
if d != loopdims:
if d == (1,) or d == ():
pinned[i] = True
else:
fmt = "arg #%d: outer dimension mismatch"
raise ValueError(fmt % (i + 1,))
return GUFuncSchedule(self, inner_shapes, oshapes, loopdims, pinned)
class GUFuncSchedule(object):
def __init__(self, parent, ishapes, oshapes, loopdims, pinned):
self.parent = parent
# core shapes
self.ishapes = ishapes
self.oshapes = oshapes
# looping dimension
self.loopdims = loopdims
self.loopn = reduce(operator.mul, loopdims, 1)
# flags
self.pinned = pinned
self.output_shapes = [loopdims + s for s in oshapes]
def __str__(self):
import pprint
attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned'
values = [(k, getattr(self, k)) for k in attrs]
return pprint.pformat(dict(values))
class GeneralizedUFunc(object):
def __init__(self, kernelmap, engine):
self.kernelmap = kernelmap
self.engine = engine
self.max_blocksize = 2 ** 30
def __call__(self, *args, **kws):
callsteps = self._call_steps(self.engine.nin, self.engine.nout,
args, kws)
indtypes, schedule, outdtypes, kernel = self._schedule(
callsteps.inputs, callsteps.outputs)
callsteps.adjust_input_types(indtypes)
outputs = callsteps.prepare_outputs(schedule, outdtypes)
inputs = callsteps.prepare_inputs()
parameters = self._broadcast(schedule, inputs, outputs)
callsteps.launch_kernel(kernel, schedule.loopn, parameters)
return callsteps.post_process_outputs(outputs)
def _schedule(self, inputs, outs):
input_shapes = [a.shape for a in inputs]
schedule = self.engine.schedule(input_shapes)
# find kernel
indtypes = tuple(i.dtype for i in inputs)
try:
outdtypes, kernel = self.kernelmap[indtypes]
except KeyError:
# No exact match, then use the first compatible.
# This does not match the numpy dispatching exactly.
# Later, we may just jit a new version for the missing signature.
indtypes = self._search_matching_signature(indtypes)
# Select kernel
outdtypes, kernel = self.kernelmap[indtypes]
# check output
for sched_shape, out in zip(schedule.output_shapes, outs):
if out is not None and sched_shape != out.shape:
raise ValueError('output shape mismatch')
return indtypes, schedule, outdtypes, kernel
def _search_matching_signature(self, idtypes):
"""
Given the input types in `idtypes`, return a compatible sequence of
types that is defined in `kernelmap`.
Note: Ordering is guaranteed by `kernelmap` being a OrderedDict
"""
for sig in self.kernelmap.keys():
if all(np.can_cast(actual, desired)
for actual, desired in zip(sig, idtypes)):
return sig
else:
raise TypeError("no matching signature")
def _broadcast(self, schedule, params, retvals):
assert schedule.loopn > 0, "zero looping dimension"
odim = 1 if not schedule.loopdims else schedule.loopn
newparams = []
for p, cs in zip(params, schedule.ishapes):
if not cs and p.size == 1:
# Broadcast scalar input
devary = self._broadcast_scalar_input(p, odim)
newparams.append(devary)
else:
# Broadcast vector input
newparams.append(self._broadcast_array(p, odim, cs))
newretvals = []
for retval, oshape in zip(retvals, schedule.oshapes):
newretvals.append(retval.reshape(odim, *oshape))
return tuple(newparams) + tuple(newretvals)
def _broadcast_array(self, ary, newdim, innerdim):
newshape = (newdim,) + innerdim
# No change in shape
if ary.shape == newshape:
return ary
# Creating new dimension
elif len(ary.shape) < len(newshape):
assert newshape[-len(ary.shape):] == ary.shape, \
"cannot add dim and reshape at the same time"
return self._broadcast_add_axis(ary, newshape)
# Collapsing dimension
else:
return ary.reshape(*newshape)
def _broadcast_add_axis(self, ary, newshape):
raise NotImplementedError("cannot add new axis")
def _broadcast_scalar_input(self, ary, shape):
raise NotImplementedError
class GUFuncCallSteps(metaclass=ABCMeta):
"""
Implements memory management and kernel launch operations for GUFunc calls.
One instance of this class is instantiated for each call, and the instance
is specific to the arguments given to the GUFunc call.
The base class implements the overall logic; subclasses provide
target-specific implementations of individual functions.
"""
# The base class uses these slots; subclasses may provide additional slots.
__slots__ = [
'outputs',
'inputs',
'_copy_result_to_host',
]
@abstractmethod
def launch_kernel(self, kernel, nelem, args):
"""Implement the kernel launch"""
@abstractmethod
def is_device_array(self, obj):
"""
Return True if `obj` is a device array for this target, False
otherwise.
"""
@abstractmethod
def as_device_array(self, obj):
"""
Return `obj` as a device array on this target.
May return `obj` directly if it is already on the target.
"""
@abstractmethod
def to_device(self, hostary):
"""
Copy `hostary` to the device and return the device array.
"""
@abstractmethod
def allocate_device_array(self, shape, dtype):
"""
Allocate a new uninitialized device array with the given shape and
dtype.
"""
def __init__(self, nin, nout, args, kwargs):
outputs = kwargs.get('out')
# Ensure the user has passed a correct number of arguments
if outputs is None and len(args) not in (nin, (nin + nout)):
def pos_argn(n):
return f'{n} positional argument{"s" * (n != 1)}'
msg = (f'This gufunc accepts {pos_argn(nin)} (when providing '
f'input only) or {pos_argn(nin + nout)} (when providing '
f'input and output). Got {pos_argn(len(args))}.')
raise TypeError(msg)
if outputs is not None and len(args) > nin:
raise ValueError("cannot specify argument 'out' as both positional "
"and keyword")
else:
# If the user did not pass outputs either in the out kwarg or as
# positional arguments, then we need to generate an initial list of
# "placeholder" outputs using None as a sentry value
outputs = [outputs] * nout
# Ensure all output device arrays are Numba device arrays - for
# example, any output passed in that supports the CUDA Array Interface
# is converted to a Numba CUDA device array; others are left untouched.
all_user_outputs_are_host = True
self.outputs = []
for output in outputs:
if self.is_device_array(output):
self.outputs.append(self.as_device_array(output))
all_user_outputs_are_host = False
else:
self.outputs.append(output)
all_host_arrays = not any([self.is_device_array(a) for a in args])
# - If any of the arguments are device arrays, we leave the output on
# the device.
self._copy_result_to_host = (all_host_arrays and
all_user_outputs_are_host)
# Normalize arguments - ensure they are either device- or host-side
# arrays (as opposed to lists, tuples, etc).
def normalize_arg(a):
if self.is_device_array(a):
convert = self.as_device_array
else:
convert = np.asarray
return convert(a)
normalized_args = [normalize_arg(a) for a in args]
self.inputs = normalized_args[:nin]
# Check if there are extra arguments for outputs.
unused_inputs = normalized_args[nin:]
if unused_inputs:
self.outputs = unused_inputs
def adjust_input_types(self, indtypes):
"""
Attempt to cast the inputs to the required types if necessary
and if they are not device arrays.
Side effect: Only affects the elements of `inputs` that require
a type cast.
"""
for i, (ity, val) in enumerate(zip(indtypes, self.inputs)):
if ity != val.dtype:
if not hasattr(val, 'astype'):
msg = ("compatible signature is possible by casting but "
"{0} does not support .astype()").format(type(val))
raise TypeError(msg)
# Cast types
self.inputs[i] = val.astype(ity)
def prepare_outputs(self, schedule, outdtypes):
"""
Returns a list of output parameters that all reside on the target
device.
Outputs that were passed-in to the GUFunc are used if they reside on the
device; other outputs are allocated as necessary.
"""
outputs = []
for shape, dtype, output in zip(schedule.output_shapes, outdtypes,
self.outputs):
if output is None or self._copy_result_to_host:
output = self.allocate_device_array(shape, dtype)
outputs.append(output)
return outputs
def prepare_inputs(self):
"""
Returns a list of input parameters that all reside on the target device.
"""
def ensure_device(parameter):
if self.is_device_array(parameter):
convert = self.as_device_array
else:
convert = self.to_device
return convert(parameter)
return [ensure_device(p) for p in self.inputs]
def post_process_outputs(self, outputs):
"""
Moves the given output(s) to the host if necessary.
Returns a single value (e.g. an array) if there was one output, or a
tuple of arrays if there were multiple. Although this feels a little
jarring, it is consistent with the behavior of GUFuncs in general.
"""
if self._copy_result_to_host:
outputs = [self.to_host(output, self_output)
for output, self_output in zip(outputs, self.outputs)]
elif self.outputs[0] is not None:
outputs = self.outputs
if len(outputs) == 1:
return outputs[0]
else:
return tuple(outputs)

File diff suppressed because it is too large

View File

@@ -0,0 +1,59 @@
import numbers
from numba.core.errors import LoweringError
class KernelRuntimeError(RuntimeError):
def __init__(self, msg, tid=None, ctaid=None):
self.tid = tid
self.ctaid = ctaid
self.msg = msg
t = ("An exception was raised in thread=%s block=%s\n"
"\t%s")
msg = t % (self.tid, self.ctaid, self.msg)
super(KernelRuntimeError, self).__init__(msg)
class CudaLoweringError(LoweringError):
pass
_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/"
"kernels.html#kernel-invocation")
missing_launch_config_msg = """
Kernel launch configuration was not specified. Use the syntax:
kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn)
See {} for help.
""".format(_launch_help_url)
def normalize_kernel_dimensions(griddim, blockdim):
"""
Normalize and validate the user-supplied kernel dimensions.
"""
def check_dim(dim, name):
if not isinstance(dim, (tuple, list)):
dim = [dim]
else:
dim = list(dim)
if len(dim) > 3:
raise ValueError('%s must be a sequence of 1, 2 or 3 integers, '
'got %r' % (name, dim))
for v in dim:
if not isinstance(v, numbers.Integral):
raise TypeError('%s must be a sequence of integers, got %r'
% (name, dim))
while len(dim) < 3:
dim.append(1)
return tuple(dim)
if None in (griddim, blockdim):
raise ValueError(missing_launch_config_msg)
griddim = check_dim(griddim, 'griddim')
blockdim = check_dim(blockdim, 'blockdim')
return griddim, blockdim
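# For example, scalars are promoted to sequences and padded to 3-tuples:
#
#     >>> normalize_kernel_dimensions(4, 128)
#     ((4, 1, 1), (128, 1, 1))
#     >>> normalize_kernel_dimensions((2, 3), (8, 8, 4))
#     ((2, 3, 1), (8, 8, 4))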

View File

@@ -0,0 +1,7 @@
"""
Added for symmetry with the core API
"""
from numba.core.extending import intrinsic as _intrinsic
intrinsic = _intrinsic(target='cuda')

View File

@@ -0,0 +1,13 @@
def initialize_all():
# Import models to register them with the data model manager
import numba.cuda.models # noqa: F401
from numba.cuda.decorators import jit
from numba.cuda.dispatcher import CUDADispatcher
from numba.core.target_extension import (target_registry,
dispatcher_registry,
jit_registry)
cuda_target = target_registry["cuda"]
jit_registry[cuda_target] = jit
dispatcher_registry[cuda_target] = CUDADispatcher

View File

@@ -0,0 +1,77 @@
from .decorators import jit
import numba
@jit(device=True)
def all_sync(mask, predicate):
"""
If for all threads in the masked warp the predicate is true, then
a non-zero value is returned, otherwise 0 is returned.
"""
return numba.cuda.vote_sync_intrinsic(mask, 0, predicate)[1]
@jit(device=True)
def any_sync(mask, predicate):
"""
If for any thread in the masked warp the predicate is true, then
a non-zero value is returned, otherwise 0 is returned.
"""
return numba.cuda.vote_sync_intrinsic(mask, 1, predicate)[1]
@jit(device=True)
def eq_sync(mask, predicate):
"""
If for all threads in the masked warp the boolean predicate is the same,
then a non-zero value is returned, otherwise 0 is returned.
"""
return numba.cuda.vote_sync_intrinsic(mask, 2, predicate)[1]
@jit(device=True)
def ballot_sync(mask, predicate):
"""
Returns a mask of all threads in the warp whose predicate is true,
and are within the given mask.
"""
return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
@jit(device=True)
def shfl_sync(mask, value, src_lane):
"""
Shuffles value across the masked warp and returns the value
from src_lane. If this is outside the warp, then the
given value is returned.
"""
return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
@jit(device=True)
def shfl_up_sync(mask, value, delta):
"""
Shuffles value across the masked warp and returns the value
from (laneid - delta). If this is outside the warp, then the
given value is returned.
"""
return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
@jit(device=True)
def shfl_down_sync(mask, value, delta):
"""
Shuffles value across the masked warp and returns the value
from (laneid + delta). If this is outside the warp, then the
given value is returned.
"""
return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
@jit(device=True)
def shfl_xor_sync(mask, value, lane_mask):
"""
Shuffles value across the masked warp and returns the value
from (laneid ^ lane_mask).
"""
return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
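# A butterfly warp-sum sketch built on shfl_xor_sync (assumes all 32 lanes
# of the warp are active; `warp_sum` is a hypothetical user kernel, not
# part of this module):
#
#     from numba import cuda
#
#     @cuda.jit
#     def warp_sum(vals, out):
#         v = vals[cuda.grid(1)]
#         offset = 16
#         while offset > 0:
#             v += cuda.shfl_xor_sync(0xffffffff, v, offset)
#             offset //= 2
#         if cuda.laneid == 0:
#             out[cuda.blockIdx.x] = v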

View File

@@ -0,0 +1,198 @@
from llvmlite import ir
from numba import cuda, types
from numba.core import cgutils
from numba.core.errors import RequireLiteralValue, NumbaValueError
from numba.core.typing import signature
from numba.core.extending import overload_attribute
from numba.cuda import nvvmutils
from numba.cuda.extending import intrinsic
#-------------------------------------------------------------------------------
# Grid functions
def _type_grid_function(ndim):
val = ndim.literal_value
if val == 1:
restype = types.int64
elif val in (2, 3):
restype = types.UniTuple(types.int64, val)
else:
        raise NumbaValueError('argument can only be 1, 2 or 3')
return signature(restype, types.int32)
@intrinsic
def grid(typingctx, ndim):
'''grid(ndim)
Return the absolute position of the current thread in the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel. If *ndim* is 1, a single integer is returned.
If *ndim* is 2 or 3, a tuple of the given number of integers is returned.
Computation of the first integer is as follows::
cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
'''
if not isinstance(ndim, types.IntegerLiteral):
raise RequireLiteralValue(ndim)
sig = _type_grid_function(ndim)
def codegen(context, builder, sig, args):
restype = sig.return_type
if restype == types.int64:
return nvvmutils.get_global_id(builder, dim=1)
elif isinstance(restype, types.UniTuple):
ids = nvvmutils.get_global_id(builder, dim=restype.count)
return cgutils.pack_array(builder, ids)
return sig, codegen
@intrinsic
def gridsize(typingctx, ndim):
'''gridsize(ndim)
Return the absolute size (or shape) in threads of the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel. If *ndim* is 1, a single integer is returned.
If *ndim* is 2 or 3, a tuple of the given number of integers is returned.
Computation of the first integer is as follows::
cuda.blockDim.x * cuda.gridDim.x
and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
'''
if not isinstance(ndim, types.IntegerLiteral):
raise RequireLiteralValue(ndim)
sig = _type_grid_function(ndim)
def _nthreads_for_dim(builder, dim):
i64 = ir.IntType(64)
ntid = nvvmutils.call_sreg(builder, f"ntid.{dim}")
nctaid = nvvmutils.call_sreg(builder, f"nctaid.{dim}")
return builder.mul(builder.sext(ntid, i64), builder.sext(nctaid, i64))
def codegen(context, builder, sig, args):
restype = sig.return_type
nx = _nthreads_for_dim(builder, 'x')
if restype == types.int64:
return nx
elif isinstance(restype, types.UniTuple):
ny = _nthreads_for_dim(builder, 'y')
if restype.count == 2:
return cgutils.pack_array(builder, (nx, ny))
elif restype.count == 3:
nz = _nthreads_for_dim(builder, 'z')
return cgutils.pack_array(builder, (nx, ny, nz))
return sig, codegen
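# Editor's usage sketch (assumed example, not part of the original file):
# gridsize() as the step of a grid-stride loop, so one fixed launch
# configuration covers inputs of any length.
@cuda.jit
def _example_grid_stride_double(x):
    start = cuda.grid(1)
    step = cuda.gridsize(1)
    for i in range(start, x.size, step):
        x[i] *= 2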
@intrinsic
def _warpsize(typingctx):
sig = signature(types.int32)
def codegen(context, builder, sig, args):
return nvvmutils.call_sreg(builder, 'warpsize')
return sig, codegen
@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
def cuda_warpsize(mod):
'''
The size of a warp. All architectures implemented to date have a warp size
of 32.
'''
def get(mod):
return _warpsize()
return get
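# Editor's usage sketch (assumed example): cuda.warpsize reads like any
# other module attribute inside a kernel; deriving the lane index from it
# matches the dedicated laneid register on current hardware.
@cuda.jit
def _example_lane_index(out):
    tid = cuda.grid(1)
    if tid < out.size:
        out[tid] = tid % cuda.warpsize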
#-------------------------------------------------------------------------------
# syncthreads
@intrinsic
def syncthreads(typingctx):
'''
Synchronize all threads in the same thread block. This function implements
the same pattern as barriers in traditional multi-threaded programming: it
waits until all threads in the block call it, at which point it
returns control to all its callers.
'''
sig = signature(types.none)
def codegen(context, builder, sig, args):
fname = 'llvm.nvvm.barrier0'
lmod = builder.module
fnty = ir.FunctionType(ir.VoidType(), ())
sync = cgutils.get_or_insert_function(lmod, fnty, fname)
builder.call(sync, ())
return context.get_dummy_value()
return sig, codegen
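# Editor's usage sketch (assumed example, not part of the original file):
# the shared-memory pattern that syncthreads() exists for -- each thread
# writes one slot, the barrier makes every write visible block-wide, then
# threads read neighbouring slots. Assumes a one-block launch of exactly
# 128 threads.
@cuda.jit
def _example_reverse_block(x):
    sm = cuda.shared.array(128, dtype=types.int32)
    tid = cuda.threadIdx.x
    sm[tid] = x[tid]
    cuda.syncthreads()
    x[tid] = sm[cuda.blockDim.x - 1 - tid]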
def _syncthreads_predicate(typingctx, predicate, fname):
if not isinstance(predicate, types.Integer):
return None
sig = signature(types.i4, types.i4)
def codegen(context, builder, sig, args):
fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32),))
sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
return builder.call(sync, args)
return sig, codegen
@intrinsic
def syncthreads_count(typingctx, predicate):
'''
syncthreads_count(predicate)
An extension to numba.cuda.syncthreads where the return value is a count
of the threads where predicate is true.
'''
fname = 'llvm.nvvm.barrier0.popc'
return _syncthreads_predicate(typingctx, predicate, fname)
@intrinsic
def syncthreads_and(typingctx, predicate):
'''
syncthreads_and(predicate)
An extension to numba.cuda.syncthreads where 1 is returned if predicate is
true for all threads or 0 otherwise.
'''
fname = 'llvm.nvvm.barrier0.and'
return _syncthreads_predicate(typingctx, predicate, fname)
@intrinsic
def syncthreads_or(typingctx, predicate):
'''
syncthreads_or(predicate)
An extension to numba.cuda.syncthreads where 1 is returned if predicate is
true for any thread or 0 otherwise.
'''
fname = 'llvm.nvvm.barrier0.or'
return _syncthreads_predicate(typingctx, predicate, fname)
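# Editor's usage sketch (assumed example, not part of the original file):
# the predicated barriers in use. syncthreads_count tallies how many
# threads in the block saw a true predicate; syncthreads_and and
# syncthreads_or reduce it to all / any. Assumes a launch of exactly one
# block of x.size threads, since every thread must reach the barrier.
@cuda.jit
def _example_count_positive(x, out):
    i = cuda.grid(1)
    pred = 1 if x[i] > 0 else 0
    n = cuda.syncthreads_count(pred)
    if i == 0:
        out[0] = n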

View File

@@ -0,0 +1,262 @@
"""
A library written in CUDA Python for generating reduction kernels
"""
from numba.np.numpy_support import from_dtype
_WARPSIZE = 32
_NUMWARPS = 4
def _gpu_reduce_factory(fn, nbtype):
from numba import cuda
reduce_op = cuda.jit(device=True)(fn)
inner_sm_size = _WARPSIZE + 1 # plus one to avoid shared memory bank conflicts
max_blocksize = _NUMWARPS * _WARPSIZE
@cuda.jit(device=True)
def inner_warp_reduction(sm_partials, init):
"""
Compute reduction within a single warp
"""
tid = cuda.threadIdx.x
warpid = tid // _WARPSIZE
laneid = tid % _WARPSIZE
sm_this = sm_partials[warpid, :]
sm_this[laneid] = init
cuda.syncwarp()
width = _WARPSIZE // 2
while width:
if laneid < width:
old = sm_this[laneid]
sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
cuda.syncwarp()
width //= 2
@cuda.jit(device=True)
def device_reduce_full_block(arr, partials, sm_partials):
"""
Partially reduce `arr` into `partials` using `sm_partials` as working
space. The algorithm goes like:
array chunks of 128:  | 0 | 128 | 256 | 384 | 512 |
block-0:              | x |     |     |  x  |     |
block-1:              |   |  x  |     |     |  x  |
block-2:              |   |     |  x  |     |     |
The array is divided into chunks of 128 (the size of a threadblock).
The threadblocks consume the chunks in round-robin scheduling.
First, a threadblock loads a chunk into temp memory. Then, all
subsequent chunks are combined into the temp memory.
Once all chunks are processed, an inner-block reduction is performed
on the temp memory, so that there is just one scalar result
per block. The result from each block is stored to `partials` at
its dedicated slot.
"""
tid = cuda.threadIdx.x
blkid = cuda.blockIdx.x
blksz = cuda.blockDim.x
gridsz = cuda.gridDim.x
# block strided loop to compute the reduction
start = tid + blksz * blkid
stop = arr.size
step = blksz * gridsz
# load first value
tmp = arr[start]
# loop over all values in block-stride
for i in range(start + step, stop, step):
tmp = reduce_op(tmp, arr[i])
cuda.syncthreads()
# inner-warp reduction
inner_warp_reduction(sm_partials, tmp)
cuda.syncthreads()
# at this point, only the first slot for each warp in sm_partials
# is valid.
# finish up block reduction
# warning: this is assuming 4 warps.
# assert numwarps == 4
if tid < 2:
sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
sm_partials[tid + 2, 0])
cuda.syncwarp()
if tid == 0:
partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
@cuda.jit(device=True)
def device_reduce_partial_block(arr, partials, sm_partials):
"""
This computes a reduction on `arr`.
This device function must be used by one threadblock only.
The blocksize must match `arr.size` and must not be greater than 128.
"""
tid = cuda.threadIdx.x
blkid = cuda.blockIdx.x
blksz = cuda.blockDim.x
warpid = tid // _WARPSIZE
laneid = tid % _WARPSIZE
size = arr.size
# load first value
value = arr[tid]
sm_partials[warpid, laneid] = value
cuda.syncthreads()
if (warpid + 1) * _WARPSIZE < size:
# fully populated warps
inner_warp_reduction(sm_partials, value)
else:
# partially populated warps
# NOTE: this uses a very inefficient sequential algorithm
if laneid == 0:
sm_this = sm_partials[warpid, :]
base = warpid * _WARPSIZE
for i in range(1, size - base):
sm_this[0] = reduce_op(sm_this[0], sm_this[i])
cuda.syncthreads()
# finish up
if tid == 0:
num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE
result = sm_partials[0, 0]
for i in range(1, num_active_warps):
result = reduce_op(result, sm_partials[i, 0])
partials[blkid] = result
def gpu_reduce_block_strided(arr, partials, init, use_init):
"""
Perform a reduction on *arr*, writing the partial reduction results
into *partials*. The length of *partials* is determined by the
number of threadblocks. The initial value is set with *init*.
Launch config:
The blocksize must be a multiple of the warpsize, and is limited to 4 warps.
"""
tid = cuda.threadIdx.x
sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
dtype=nbtype)
if cuda.blockDim.x == max_blocksize:
device_reduce_full_block(arr, partials, sm_partials)
else:
device_reduce_partial_block(arr, partials, sm_partials)
# deal with the initializer
if use_init and tid == 0 and cuda.blockIdx.x == 0:
partials[0] = reduce_op(partials[0], init)
return cuda.jit(gpu_reduce_block_strided)
class Reduce(object):
"""Create a reduction object that reduces values using a given binary
function. The binary function is compiled once and cached inside this
object. Keeping this object alive will prevent re-compilation.
"""
_cache = {}
def __init__(self, functor):
"""
:param functor: A function implementing a binary operation for
reduction. It will be compiled as a CUDA device
function using ``cuda.jit(device=True)``.
"""
self._functor = functor
def _compile(self, dtype):
key = self._functor, dtype
if key in self._cache:
kernel = self._cache[key]
else:
kernel = _gpu_reduce_factory(self._functor, from_dtype(dtype))
self._cache[key] = kernel
return kernel
def __call__(self, arr, size=None, res=None, init=0, stream=0):
"""Performs a full reduction.
:param arr: A host or device array.
:param size: Optional integer specifying the number of elements in
``arr`` to reduce. If this parameter is not specified, the
entire array is reduced.
:param res: Optional device array into which to write the reduction
result. The result is written into the first element of
this array. If this parameter is specified, then no
communication of the reduction output takes place from the
device to the host.
:param init: Optional initial value for the reduction, the type of which
must match ``arr.dtype``.
:param stream: Optional CUDA stream in which to perform the reduction.
If no stream is specified, the default stream of 0 is
used.
:return: If ``res`` is specified, ``None`` is returned. Otherwise, the
result of the reduction is returned.
"""
from numba import cuda
# ensure 1d array
if arr.ndim != 1:
raise TypeError("only support 1D array")
# adjust array size
if size is not None:
arr = arr[:size]
init = arr.dtype.type(init) # ensure the right type
# return `init` if `arr` is empty
if arr.size < 1:
return init
kernel = self._compile(arr.dtype)
# Perform the reduction on the GPU
blocksize = _NUMWARPS * _WARPSIZE
size_full = (arr.size // blocksize) * blocksize
size_partial = arr.size - size_full
full_blockct = min(size_full // blocksize, _WARPSIZE * 2)
# allocate size of partials array
partials_size = full_blockct
if size_partial:
partials_size += 1
partials = cuda.device_array(shape=partials_size, dtype=arr.dtype)
if size_full:
# kernel for the fully populated threadblocks
kernel[full_blockct, blocksize, stream](arr[:size_full],
partials[:full_blockct],
init,
True)
if size_partial:
# kernel for partially populated threadblocks
kernel[1, size_partial, stream](arr[size_full:],
partials[full_blockct:],
init,
not full_blockct)
if partials.size > 1:
# finish up
kernel[1, partials_size, stream](partials, partials, init, False)
# handle return value
if res is not None:
res[:1].copy_to_device(partials[:1], stream=stream)
return
else:
return partials[0]
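# Editor's usage sketch (not part of the original file): Reduce is exposed
# publicly as numba.cuda.Reduce. The functor is compiled once per dtype and
# cached on the instance, so reusing one Reduce object avoids recompilation.
if __name__ == '__main__':
    import numpy as np
    from numba import cuda
    sum_reduce = cuda.Reduce(lambda a, b: a + b)
    A = np.arange(1024, dtype=np.float64)
    print(sum_reduce(A), A.sum())  # should agree up to rounding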

View File

@@ -0,0 +1,65 @@
from numba import cuda
from numba.cuda.cudadrv.driver import driver
import math
from numba.np import numpy_support as nps
def transpose(a, b=None):
"""Compute the transpose of 'a' and store it into 'b', if given,
and return it. If 'b' is not given, allocate a new array
and return that.
This implements the algorithm documented in
http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/
:param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
the device its stream will be used to perform the transpose (and to copy
`b` to the device if necessary).
"""
# prefer `a`'s stream if it has one
stream = getattr(a, 'stream', 0)
if b is None:
cols, rows = a.shape
strides = a.dtype.itemsize * cols, a.dtype.itemsize
b = cuda.cudadrv.devicearray.DeviceNDArray(
(rows, cols),
strides,
dtype=a.dtype,
stream=stream)
dt = nps.from_dtype(a.dtype)
tpb = driver.get_device().MAX_THREADS_PER_BLOCK
# we need to factor the available threads into x and y axes
tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
tile_height = int(tpb / tile_width)
tile_shape = (tile_height, tile_width + 1)
@cuda.jit
def kernel(input, output):
tile = cuda.shared.array(shape=tile_shape, dtype=dt)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x * cuda.blockDim.x
by = cuda.blockIdx.y * cuda.blockDim.y
x = by + tx
y = bx + ty
if by + ty < input.shape[0] and bx + tx < input.shape[1]:
tile[ty, tx] = input[by + ty, bx + tx]
cuda.syncthreads()
if y < output.shape[0] and x < output.shape[1]:
output[y, x] = tile[tx, ty]
# one block per tile, plus one for remainders
blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
# one thread per tile element
threads = tile_height, tile_width
kernel[blocks, threads, stream](a, b)
return b
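# Editor's usage sketch (assumed example, not part of the original file):
# round-trip a small host matrix through the device transpose defined above.
if __name__ == '__main__':
    import numpy as np
    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    d_at = transpose(cuda.to_device(a))  # allocates the (4, 3) output
    assert (d_at.copy_to_host() == a.T).all()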

File diff suppressed because it is too large

View File

@@ -0,0 +1,17 @@
from numba.cuda import libdevice, libdevicefuncs
from numba.core.typing.templates import ConcreteTemplate, Registry
registry = Registry()
register_global = registry.register_global
def libdevice_declare(func, retty, args):
class Libdevice_function(ConcreteTemplate):
cases = [libdevicefuncs.create_signature(retty, args)]
pyfunc = getattr(libdevice, func[5:])
register_global(pyfunc)(Libdevice_function)
for func, (retty, args) in libdevicefuncs.functions.items():
libdevice_declare(func, retty, args)
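# Editor's usage sketch (assumed example, not part of the original file):
# once registered, each libdevice function is callable from kernels with the
# leading '__nv_' stripped, e.g. __nv_fast_sinf -> libdevice.fast_sinf.
from numba import cuda

@cuda.jit
def _example_fast_sin(x, out):
    i = cuda.grid(1)
    if i < x.size:
        out[i] = libdevice.fast_sinf(x[i])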

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff