2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,9 @@
"""CUDA Driver
- Driver API binding
- NVVM API binding
- Device array implementation
"""
from numba.core import config
assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator'

View File

@@ -0,0 +1,904 @@
"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object. If it exists and evaluates to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
"""
import math
import functools
import operator
import copy
from ctypes import c_void_p
import numpy as np
import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices, dummyarray
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.np.numpy_support import numpy_version
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn
try:
lru_cache = getattr(functools, 'lru_cache')(None)
except AttributeError:
# Python 3.1 or lower
def lru_cache(func):
return func
def is_cuda_ndarray(obj):
"Check if an object is a CUDA ndarray"
return getattr(obj, '__cuda_ndarray__', False)
def verify_cuda_ndarray_interface(obj):
"Verify the CUDA ndarray interface for an obj"
require_cuda_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_cuda_ndarray(obj):
"Raises ValueError is is_cuda_ndarray(obj) evaluates False"
if not is_cuda_ndarray(obj):
raise ValueError('require a CUDA ndarray object')
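# Example: a minimal sketch of an object that satisfies the duck-typing
# checks above. The class and its attribute values are hypothetical; any
# object exposing these attributes passes verify_cuda_ndarray_interface().
def _example_duck_typed_cuda_ndarray():
    class FakeCudaArray:
        __cuda_memory__ = True
        __cuda_ndarray__ = True
        shape = (2, 3)
        strides = (24, 8)   # row-major strides for 8-byte items
        dtype = np.dtype(np.float64)
        size = 6
    obj = FakeCudaArray()
    assert is_cuda_ndarray(obj)
    verify_cuda_ndarray_interface(obj)  # raises if any attribute is wrong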
class DeviceNDArrayBase(_devicearray.DeviceArray):
"""A on GPU NDArray representation
"""
__cuda_memory__ = True
__cuda_ndarray__ = True # There must be a gpu_data attribute
def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as np.dtype coercible object.
stream
cuda stream.
gpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
dtype = np.dtype(dtype)
self.ndim = len(shape)
if len(strides) != self.ndim:
raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = dtype
self.size = int(functools.reduce(operator.mul, self.shape, 1))
# prepare gpu memory
if self.size > 0:
if gpu_data is None:
self.alloc_size = _driver.memory_size_from_info(
self.shape, self.strides, self.dtype.itemsize)
gpu_data = devices.get_context().memalloc(self.alloc_size)
else:
self.alloc_size = _driver.device_memory_size(gpu_data)
else:
# Make NULL pointer for empty allocation
if _driver.USE_NV_BINDING:
null = _driver.binding.CUdeviceptr(0)
else:
null = c_void_p(0)
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
pointer=null, size=0)
self.alloc_size = 0
self.gpu_data = gpu_data
self.stream = stream
@property
def __cuda_array_interface__(self):
if _driver.USE_NV_BINDING:
if self.device_ctypes_pointer is not None:
ptr = int(self.device_ctypes_pointer)
else:
ptr = 0
else:
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0
return {
'shape': tuple(self.shape),
'strides': None if is_contiguous(self) else tuple(self.strides),
'data': (ptr, False),
'typestr': self.dtype.str,
'stream': int(self.stream) if self.stream != 0 else None,
'version': 3,
}
def bind(self, stream=0):
"""Bind a CUDA stream to this object so that all subsequent operation
on this array defaults to the given stream.
"""
clone = copy.copy(self)
clone.stream = stream
return clone
@property
def T(self):
return self.transpose()
def transpose(self, axes=None):
if axes and tuple(axes) == tuple(range(self.ndim)):
return self
elif self.ndim != 2:
msg = "transposing a non-2D DeviceNDArray isn't supported"
raise NotImplementedError(msg)
elif axes is not None and set(axes) != set(range(self.ndim)):
raise ValueError("invalid axes list %r" % (axes,))
else:
from numba.cuda.kernels.transpose import transpose
return transpose(self)
def _default_stream(self, stream):
return self.stream if not stream else stream
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
# Typing considerations:
#
# 1. The preference is to use 'C' or 'F' layout since this enables
# hardcoding stride values into compiled kernels, which is more
# efficient than storing a passed-in value in a register.
#
# 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
# the more likely / common case.
#
# 3. If an array is broadcast then it must be typed as 'A' - using 'C'
# or 'F' does not apply for broadcast arrays, because the strides, some
# of which will be 0, will not match those hardcoded in for 'C' or 'F'
# layouts.
broadcast = 0 in self.strides
if self.flags['C_CONTIGUOUS'] and not broadcast:
layout = 'C'
elif self.flags['F_CONTIGUOUS'] and not broadcast:
layout = 'F'
else:
layout = 'A'
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, layout)
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.gpu_data is None:
if _driver.USE_NV_BINDING:
return _driver.binding.CUdeviceptr(0)
else:
return c_void_p(0)
else:
return self.gpu_data.device_ctypes_pointer
@devices.require_context
def copy_to_device(self, ary, stream=0):
"""Copy `ary` to `self`.
If `ary` is CUDA device memory, perform a device-to-device transfer.
Otherwise, perform a host-to-device transfer.
"""
if ary.size == 0:
# Nothing to do
return
sentry_contiguous(self)
stream = self._default_stream(stream)
self_core, ary_core = array_core(self), array_core(ary)
if _driver.is_device_memory(ary):
sentry_contiguous(ary)
check_array_compatibility(self_core, ary_core)
_driver.device_to_device(self, ary, self.alloc_size, stream=stream)
else:
# Ensure same contiguity. Only makes a host-side copy if necessary
# (i.e., in order to materialize a writable strided view)
ary_core = np.array(
ary_core,
order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
subok=True,
copy=(not ary_core.flags['WRITEABLE'])
if numpy_version < (2, 0) else None)
check_array_compatibility(self_core, ary_core)
_driver.host_to_device(self, ary_core, self.alloc_size,
stream=stream)
@devices.require_context
def copy_to_host(self, ary=None, stream=0):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
If a CUDA ``stream`` is given, then the transfer will be made
asynchronously as part of the given stream. Otherwise, the transfer is
synchronous: the function returns after the copy is finished.
Always returns the host array.
Example::
import numpy as np
from numba import cuda
arr = np.arange(1000)
d_arr = cuda.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if any(s < 0 for s in self.strides):
msg = 'D->H copy not implemented for negative strides: {}'
raise NotImplementedError(msg.format(self.strides))
assert self.alloc_size >= 0, "Negative memory size"
stream = self._default_stream(stream)
if ary is None:
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else:
check_array_compatibility(self, ary)
hostary = ary
if self.alloc_size != 0:
_driver.device_to_host(hostary, self, self.alloc_size,
stream=stream)
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
return hostary
def split(self, section, stream=0):
"""Split the array into equal partition of the `section` size.
If the array cannot be equally divided, the last section will be
smaller.
"""
stream = self._default_stream(stream)
if self.ndim != 1:
raise ValueError("only support 1d array")
if self.strides[0] != self.dtype.itemsize:
raise ValueError("only support unit stride")
nsect = int(math.ceil(float(self.size) / section))
strides = self.strides
itemsize = self.dtype.itemsize
for i in range(nsect):
begin = i * section
end = min(begin + section, self.size)
shape = (end - begin,)
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
gpu_data=gpu_data)
def as_cuda_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.gpu_data
def get_ipc_handle(self):
"""
Returns an *IpcArrayHandle* object that is safe to serialize and transfer
to another process to share the local allocation.
Note: this feature is only available on Linux.
"""
ipch = devices.get_context().get_ipc_handle(self.gpu_data)
desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
def squeeze(self, axis=None, stream=0):
"""
Remove axes of size one from the array shape.
Parameters
----------
axis : None or int or tuple of ints, optional
Subset of dimensions to remove. A `ValueError` is raised if an axis
with size greater than one is selected. If `None`, all axes with
size one are removed.
stream : cuda stream or 0, optional
Default stream for the returned view of the array.
Returns
-------
DeviceNDArray
Squeezed view into the array.
"""
new_dummy, _ = self._dummy.squeeze(axis=axis)
return DeviceNDArray(
shape=new_dummy.shape,
strides=new_dummy.strides,
dtype=self.dtype,
stream=self._default_stream(stream),
gpu_data=self.gpu_data,
)
def view(self, dtype):
"""Returns a new object by reinterpretting the dtype without making a
copy of the data.
"""
dtype = np.dtype(dtype)
shape = list(self.shape)
strides = list(self.strides)
if self.dtype.itemsize != dtype.itemsize:
if not self.is_c_contiguous():
raise ValueError(
"To change to a dtype of a different size,"
" the array must be C-contiguous"
)
shape[-1], rem = divmod(
shape[-1] * self.dtype.itemsize,
dtype.itemsize
)
if rem != 0:
raise ValueError(
"When changing to a larger dtype,"
" its size must be a divisor of the total size in bytes"
" of the last axis of the array."
)
strides[-1] = dtype.itemsize
return DeviceNDArray(
shape=shape,
strides=strides,
dtype=dtype,
stream=self.stream,
gpu_data=self.gpu_data,
)
@property
def nbytes(self):
# Note: not using `alloc_size`. `alloc_size` reports memory
# consumption of the allocation, not the size of the array
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
return self.dtype.itemsize * self.size
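# Usage sketch for DeviceNDArrayBase.split(): partition a 1-D device array
# into views over the same allocation, without copying. Requires a CUDA
# device at call time; sizes and contents below are illustrative only.
def _example_split_device_array():
    from numba import cuda
    d_arr = cuda.to_device(np.arange(10))
    sections = list(d_arr.split(4))
    # ceil(10 / 4) == 3 sections; the last one is smaller.
    assert [s.size for s in sections] == [4, 4, 2]
    assert (sections[-1].copy_to_host() == np.array([8, 9])).all()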
class DeviceRecord(DeviceNDArrayBase):
'''
An on-GPU record type
'''
def __init__(self, dtype, stream=0, gpu_data=None):
shape = ()
strides = ()
super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
gpu_data)
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
return numpy_support.from_dtype(self.dtype)
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
typ, offset = self.dtype.fields[item]
newdata = self.gpu_data.view(offset)
if typ.shape == ():
if typ.names is not None:
return DeviceRecord(dtype=typ, stream=stream,
gpu_data=newdata)
else:
hostary = np.empty(1, dtype=typ)
_driver.device_to_host(dst=hostary, src=newdata,
size=typ.itemsize,
stream=stream)
return hostary[0]
else:
shape, strides, dtype = \
prepare_shape_strides_dtype(typ.shape,
None,
typ.subdtype[0], 'C')
return DeviceNDArray(shape=shape, strides=strides,
dtype=dtype, gpu_data=newdata,
stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the record didn't have a default stream, and the user didn't
# provide a stream, then we will use the default stream for the
# assignment kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
typ, offset = self.dtype.fields[key]
newdata = self.gpu_data.view(offset)
lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
# (2) prepare RHS
rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
# (3) do the copy
_driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
if synchronous:
stream.synchronize()
@lru_cache
def _assign_kernel(ndim):
"""
A separate function so we don't need to compile code on every assignment (!).
:param ndim: We need static array sizes for cuda.local.array, so the
number of dimensions is baked into the kernel.
"""
from numba import cuda # circular!
if ndim == 0:
# the (2, ndim) allocation below is not yet supported, so avoid it
@cuda.jit
def kernel(lhs, rhs):
lhs[()] = rhs[()]
return kernel
@cuda.jit
def kernel(lhs, rhs):
location = cuda.grid(1)
n_elements = 1
for i in range(lhs.ndim):
n_elements *= lhs.shape[i]
if location >= n_elements:
# bake n_elements into the kernel, better than passing it in
# as another argument.
return
# [0, :] is the to-index (into `lhs`)
# [1, :] is the from-index (into `rhs`)
idx = cuda.local.array(
shape=(2, ndim),
dtype=types.int64)
for i in range(ndim - 1, -1, -1):
idx[0, i] = location % lhs.shape[i]
idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
location //= lhs.shape[i]
lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
return kernel
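# Host-side sketch of the index arithmetic used by the assignment kernel
# above: unravel the flat `location` into an N-D index for `lhs`, zeroing
# each coordinate for broadcast (size-1) axes of `rhs`.
def _example_assign_indexing(location, lhs_shape, rhs_shape):
    ndim = len(lhs_shape)
    lhs_idx = [0] * ndim
    rhs_idx = [0] * ndim
    for i in range(ndim - 1, -1, -1):
        lhs_idx[i] = location % lhs_shape[i]
        # (rhs_shape[i] > 1) is 0 or 1, collapsing broadcast axes to 0
        rhs_idx[i] = (location % lhs_shape[i]) * (rhs_shape[i] > 1)
        location //= lhs_shape[i]
    return tuple(lhs_idx), tuple(rhs_idx)
# e.g. _example_assign_indexing(5, (2, 3), (1, 3)) == ((1, 2), (0, 2))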
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-GPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
@property
def flags(self):
"""
For `numpy.ndarray` compatibility. Ideally this would return a
`np.core.multiarray.flagsobj`, but that needs to be constructed
with an existing `numpy.ndarray` (as the C- and F- contiguous flags
aren't writeable).
"""
return dict(self._dummy.flags) # defensive copy
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def __array__(self, dtype=None):
"""
:return: a `numpy.ndarray`, obtained by copying to the host.
"""
if dtype:
return self.copy_to_host().__array__(dtype)
else:
return self.copy_to_host().__array__()
def __len__(self):
return self.shape[0]
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C', stream=0):
'''
Flattens a contiguous array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
exception.
'''
stream = self._default_stream(stream)
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, gpu_data=self.gpu_data,
stream=stream)
else:
raise NotImplementedError("operation requires copying")
@devices.require_context
def __getitem__(self, item):
return self._do_getitem(item)
@devices.require_context
def getitem(self, item, stream=0):
"""Do `__getitem__(item)` with CUDA stream
"""
return self._do_getitem(item, stream)
def _do_getitem(self, item, stream=0):
stream = self._default_stream(stream)
arr = self._dummy.__getitem__(item)
extents = list(arr.iter_contiguous_extent())
cls = type(self)
if len(extents) == 1:
newdata = self.gpu_data.view(*extents[0])
if not arr.is_array:
# Check for structured array type (record)
if self.dtype.names is not None:
return DeviceRecord(dtype=self.dtype, stream=stream,
gpu_data=newdata)
else:
# Element indexing
hostary = np.empty(1, dtype=self.dtype)
_driver.device_to_host(dst=hostary, src=newdata,
size=self._dummy.itemsize,
stream=stream)
return hostary[0]
else:
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
else:
newdata = self.gpu_data.view(*arr.extent)
return cls(shape=arr.shape, strides=arr.strides,
dtype=self.dtype, gpu_data=newdata, stream=stream)
@devices.require_context
def __setitem__(self, key, value):
return self._do_setitem(key, value)
@devices.require_context
def setitem(self, key, value, stream=0):
"""Do `__setitem__(key, value)` with CUDA stream
"""
return self._do_setitem(key, value, stream=stream)
def _do_setitem(self, key, value, stream=0):
stream = self._default_stream(stream)
# If the array didn't have a default stream, and the user didn't provide
# a stream, then we will use the default stream for the assignment
# kernel and synchronize on it.
synchronous = not stream
if synchronous:
ctx = devices.get_context()
stream = ctx.get_default_stream()
# (1) prepare LHS
arr = self._dummy.__getitem__(key)
newdata = self.gpu_data.view(*arr.extent)
if isinstance(arr, dummyarray.Element):
# convert to a 0d array
shape = ()
strides = ()
else:
shape = arr.shape
strides = arr.strides
lhs = type(self)(
shape=shape,
strides=strides,
dtype=self.dtype,
gpu_data=newdata,
stream=stream)
# (2) prepare RHS
rhs, _ = auto_device(value, stream=stream, user_explicit=True)
if rhs.ndim > lhs.ndim:
raise ValueError("Can't assign %s-D array to %s-D self" % (
rhs.ndim,
lhs.ndim))
rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
# negative indices would not work if rhs.ndim == 0
rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
rhs = rhs.reshape(*rhs_shape)
for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
if r != 1 and l != r:
raise ValueError("Can't copy sequence with size %d to array "
"axis %d with dimension %d" % ( r, i, l))
# (3) do the copy
n_elements = functools.reduce(operator.mul, lhs.shape, 1)
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
if synchronous:
stream.synchronize()
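# Usage sketch for DeviceNDArray.__setitem__(): scalar and broadcast
# assignment run the generated assignment kernel on the device. Shapes
# and values are illustrative; requires a CUDA device at call time.
def _example_device_setitem():
    from numba import cuda
    d_arr = cuda.to_device(np.zeros((3, 4)))
    d_arr[1, 2] = 7.0                             # scalar broadcast
    d_arr[:, 0] = np.arange(3, dtype=np.float64)  # 1-D host array to column
    host = d_arr.copy_to_host()
    assert host[1, 2] == 7.0
    assert (host[:, 0] == np.arange(3)).all()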
class IpcArrayHandle(object):
"""
An IPC array handle that can be serialized and transferred to another
process on the same machine to share a GPU allocation.
On the destination process, use the *.open()* method to create a new
*DeviceNDArray* object that shares the allocation from the original process.
To release the resources, call the *.close()* method. After that, the
destination can no longer use the shared array object. (Note: the
underlying weakref to the resource is now dead.)
This object implements the context-manager interface that calls the
*.open()* and *.close()* method automatically::
with the_ipc_array_handle as ipc_array:
# use ipc_array here as a normal gpu array object
some_code(ipc_array)
# ipc_array is dead at this point
"""
def __init__(self, ipc_handle, array_desc):
self._array_desc = array_desc
self._ipc_handle = ipc_handle
def open(self):
"""
Returns a new *DeviceNDArray* that shares the allocation from the
original process. Must not be used on the original process.
"""
dptr = self._ipc_handle.open(devices.get_context())
return DeviceNDArray(gpu_data=dptr, **self._array_desc)
def close(self):
"""
Closes the IPC handle to the array.
"""
self._ipc_handle.close()
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
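# Usage sketch for IpcArrayHandle on the consuming side: the handle itself
# is created in the producing process (via get_ipc_handle()) and sent over,
# e.g. through multiprocessing. Linux only; the function name is illustrative.
def _example_consume_ipc_array(ipc_array_handle):
    with ipc_array_handle as ipc_array:
        # ipc_array is a DeviceNDArray sharing the producer's allocation
        result = ipc_array.copy_to_host()
    # the shared mapping is closed on exit from the with-block
    return result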
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA mapped memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
"""
A host array that uses CUDA managed memory.
"""
def device_setup(self, gpu_data, stream=0):
self.gpu_data = gpu_data
self.stream = stream
def from_array_like(ary, stream=0, gpu_data=None):
"Create a DeviceNDArray object that is like ary."
return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
gpu_data=gpu_data)
def from_record_like(rec, stream=0, gpu_data=None):
"Create a DeviceRecord object that is like rec."
return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
def array_core(ary):
"""
Extract the repeated core of a broadcast array.
Broadcast arrays are by definition non-contiguous due to repeated
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
contiguity and copy the underlying data from such arrays, we must create
a view without the repeated dimensions.
"""
if not ary.strides or not ary.size:
return ary
core_index = []
for stride in ary.strides:
core_index.append(0 if stride == 0 else slice(None))
return ary[tuple(core_index)]
def is_contiguous(ary):
"""
Returns True iff `ary` is C-style contiguous while ignoring
broadcasted and 1-sized dimensions.
As opposed to array_core(), it does not call require_context(),
which can be quite expensive.
"""
size = ary.dtype.itemsize
for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
if shape > 1 and stride != 0:
if size != stride:
return False
size *= shape
return True
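# Host-side sketch of what is_contiguous() accepts: stride-0 (broadcast)
# and size-1 dimensions are skipped when checking C-style contiguity.
def _example_is_contiguous():
    a = np.arange(6).reshape(2, 3)     # C-contiguous
    b = np.broadcast_to(a, (4, 2, 3))  # adds a leading stride-0 dimension
    c = a[:, ::2]                      # genuinely strided
    assert is_contiguous(a)
    assert is_contiguous(b)            # broadcast dimension is ignored
    assert not is_contiguous(c)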
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def sentry_contiguous(ary):
core = array_core(ary)
if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, stream=0, copy=True, user_explicit=False):
"""
Create a DeviceRecord or DeviceNDArray like obj and optionally copy data from
host to device. If obj already represents device memory, it is returned and
no copy is made.
"""
if _driver.is_device_memory(obj):
return obj, False
elif hasattr(obj, '__cuda_array_interface__'):
return numba.cuda.as_cuda_array(obj), False
else:
if isinstance(obj, np.void):
devobj = from_record_like(obj, stream=stream)
else:
# This allows you to pass non-array objects like constants and
# objects implementing the array interface
# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
# into this function (with no copy overhead for `obj`s that are already
# `ndarray`s).
obj = np.array(
obj,
copy=False if numpy_version < (2, 0) else None,
subok=True)
sentry_contiguous(obj)
devobj = from_array_like(obj, stream=stream)
if copy:
if config.CUDA_WARN_ON_IMPLICIT_COPY:
if (
not user_explicit and
(not isinstance(obj, DeviceNDArray)
and isinstance(obj, np.ndarray))
):
msg = ("Host array used in CUDA kernel will incur "
"copy overhead to/from device.")
warn(NumbaPerformanceWarning(msg))
devobj.copy_to_device(obj, stream=stream)
return devobj, True
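# Usage sketch for auto_device(): host arrays are copied into a fresh
# device array, while objects already representing device memory are
# passed through unchanged. Requires a CUDA device at call time.
def _example_auto_device():
    host = np.arange(4)
    devobj, converted = auto_device(host)
    assert converted and devobj.shape == (4,)
    same, converted = auto_device(devobj)   # already on the device
    assert same is devobj and not converted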
def check_array_compatibility(ary1, ary2):
ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
if ary1.dtype != ary2.dtype:
raise TypeError('incompatible dtype: %s vs. %s' %
(ary1.dtype, ary2.dtype))
if ary1sq.shape != ary2sq.shape:
raise ValueError('incompatible shape: %s vs. %s' %
(ary1.shape, ary2.shape))
# We check strides only if the size is nonzero, because strides are
# irrelevant (and can differ) for zero-length copies.
if ary1.size and ary1sq.strides != ary2sq.strides:
raise ValueError('incompatible strides: %s vs. %s' %
(ary1.strides, ary2.strides))

View File

@@ -0,0 +1,248 @@
"""
Expose each GPU device directly.
This module implements an API, similar to the "CUDA runtime", for managing
the CUDA context stack and cleanup. It relies on thread-local globals to
keep each thread's context stack management separate. Contexts are also
shareable among threads. Only the main thread can destroy Contexts.
Note:
- This module must be imported by the main thread.
"""
import functools
import threading
from contextlib import contextmanager
from .driver import driver, USE_NV_BINDING
class _DeviceList(object):
def __getattr__(self, attr):
# First time looking at "lst" attribute.
if attr == "lst":
# Device list is not initialized.
# Query all CUDA devices.
numdev = driver.get_device_count()
gpus = [_DeviceContextManager(driver.get_device(devid))
for devid in range(numdev)]
# Define "lst" to avoid re-initialization
self.lst = gpus
return gpus
# Other attributes
return super(_DeviceList, self).__getattr__(attr)
def __getitem__(self, devnum):
'''
Returns the context manager for device *devnum*.
'''
return self.lst[devnum]
def __str__(self):
return ', '.join([str(d) for d in self.lst])
def __iter__(self):
return iter(self.lst)
def __len__(self):
return len(self.lst)
@property
def current(self):
"""Returns the active device or None if there's no active device
"""
with driver.get_active_context() as ac:
devnum = ac.devnum
if devnum is not None:
return self[devnum]
class _DeviceContextManager(object):
"""
Provides a context manager for executing in the context of the chosen
device. The normal use of instances of this type is from
``numba.cuda.gpus``. For example, to execute on device 2::
with numba.cuda.gpus[2]:
d_a = numba.cuda.to_device(a)
to copy the array *a* onto device 2, referred to by *d_a*.
"""
def __init__(self, device):
self._device = device
def __getattr__(self, item):
return getattr(self._device, item)
def __enter__(self):
_runtime.get_or_create_context(self._device.id)
def __exit__(self, exc_type, exc_val, exc_tb):
# this will verify that we are popping the right device context.
self._device.get_primary_context().pop()
def __str__(self):
return "<Managed Device {self.id}>".format(self=self)
class _Runtime(object):
"""Emulate the CUDA runtime context management.
It owns all Devices and Contexts.
Keeps at most one Context per Device.
"""
def __init__(self):
self.gpus = _DeviceList()
# For caching the attached CUDA Context
self._tls = threading.local()
# Remember the main thread
# Only the main thread can *actually* destroy
self._mainthread = threading.current_thread()
# Avoid mutation of runtime state in multithreaded programs
self._lock = threading.RLock()
@contextmanager
def ensure_context(self):
"""Ensure a CUDA context is available inside the context.
On entrance, queries the CUDA driver for an active CUDA context and
attaches it in TLS for subsequent calls so they do not need to query
the CUDA driver again. On exit, detach the CUDA context from the TLS.
This will allow us to pickup thirdparty activated CUDA context in
any top-level Numba CUDA API.
"""
with driver.get_active_context():
oldctx = self._get_attached_context()
newctx = self.get_or_create_context(None)
self._set_attached_context(newctx)
try:
yield
finally:
self._set_attached_context(oldctx)
def get_or_create_context(self, devnum):
"""Returns the primary context and push+create it if needed
for *devnum*. If *devnum* is None, use the active CUDA context (must
be primary) or create a new one with ``devnum=0``.
"""
if devnum is None:
attached_ctx = self._get_attached_context()
if attached_ctx is None:
return self._get_or_create_context_uncached(devnum)
else:
return attached_ctx
else:
if USE_NV_BINDING:
devnum = int(devnum)
return self._activate_context_for(devnum)
def _get_or_create_context_uncached(self, devnum):
"""See also ``get_or_create_context(devnum)``.
This version does not read the cache.
"""
with self._lock:
# Try to get the active context in the CUDA stack or
# activate GPU-0 with the primary context
with driver.get_active_context() as ac:
if not ac:
return self._activate_context_for(0)
else:
# Get primary context for the active device
ctx = self.gpus[ac.devnum].get_primary_context()
# Is active context the primary context?
if USE_NV_BINDING:
ctx_handle = int(ctx.handle)
ac_ctx_handle = int(ac.context_handle)
else:
ctx_handle = ctx.handle.value
ac_ctx_handle = ac.context_handle.value
if ctx_handle != ac_ctx_handle:
msg = ('Numba cannot operate on non-primary'
' CUDA context {:x}')
raise RuntimeError(msg.format(ac_ctx_handle))
# Ensure the context is ready
ctx.prepare_for_use()
return ctx
def _activate_context_for(self, devnum):
with self._lock:
gpu = self.gpus[devnum]
newctx = gpu.get_primary_context()
# Detect unexpected context switch
cached_ctx = self._get_attached_context()
if cached_ctx is not None and cached_ctx is not newctx:
raise RuntimeError('Cannot switch CUDA-context.')
newctx.push()
return newctx
def _get_attached_context(self):
return getattr(self._tls, 'attached_context', None)
def _set_attached_context(self, ctx):
self._tls.attached_context = ctx
def reset(self):
"""Clear all contexts in the thread. Destroy the context if and only
if we are in the main thread.
"""
# Pop all active contexts.
while driver.pop_active_context() is not None:
pass
# If it is the main thread
if threading.current_thread() == self._mainthread:
self._destroy_all_contexts()
def _destroy_all_contexts(self):
# Reset all devices
for gpu in self.gpus:
gpu.reset()
_runtime = _Runtime()
# ================================ PUBLIC API ================================
gpus = _runtime.gpus
def get_context(devnum=None):
"""Get the current device or use a device by device number, and
return the CUDA context.
"""
return _runtime.get_or_create_context(devnum)
def require_context(fn):
"""
A decorator that ensures a CUDA context is available when *fn* is executed.
Note: The function *fn* cannot switch the CUDA context.
"""
@functools.wraps(fn)
def _require_cuda_context(*args, **kws):
with _runtime.ensure_context():
return fn(*args, **kws)
return _require_cuda_context
def reset():
"""Reset the CUDA subsystem for the current thread.
In the main thread:
This removes all CUDA contexts. Only use this at shutdown or for
cleaning up between tests.
In non-main threads:
This clears the CUDA context stack only.
"""
_runtime.reset()
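# Usage sketch for the public API above: wrapping a function with
# require_context ensures a CUDA context exists when it runs, whether
# created by Numba or adopted from third-party code. The function name is
# illustrative; get_memory_info() is assumed from the driver Context API.
@require_context
def _example_query_free_memory():
    free, total = get_context().get_memory_info()
    return free, total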

File diff suppressed because it is too large

View File

@@ -0,0 +1,394 @@
from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint,
c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER)
from numba.cuda.cudadrv import _extras
cu_device = c_int
cu_device_attribute = c_int # enum
cu_context = c_void_p # an opaque handle
cu_module = c_void_p # an opaque handle
cu_jit_option = c_int # enum
cu_jit_input_type = c_int # enum
cu_function = c_void_p # an opaque handle
cu_device_ptr = c_size_t # defined as unsigned long long
cu_stream = c_void_p # an opaque handle
cu_event = c_void_p
cu_link_state = c_void_p
cu_function_attribute = c_int
cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide
cu_uuid = (c_byte * 16) # Device UUID
cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object)
cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int)
# See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
CU_STREAM_DEFAULT = 0
CU_STREAM_LEGACY = 1
CU_STREAM_PER_THREAD = 2
API_PROTOTYPES = {
# CUresult cuInit(unsigned int Flags);
'cuInit' : (c_int, c_uint),
# CUresult cuDriverGetVersion (int* driverVersion )
'cuDriverGetVersion': (c_int, POINTER(c_int)),
# CUresult cuDeviceGetCount(int *count);
'cuDeviceGetCount': (c_int, POINTER(c_int)),
# CUresult cuDeviceGet(CUdevice *device, int ordinal);
'cuDeviceGet': (c_int, POINTER(cu_device), c_int),
# CUresult cuDeviceGetName ( char* name, int len, CUdevice dev )
'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device),
# CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
# CUdevice dev);
'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute,
cu_device),
# CUresult cuDeviceComputeCapability(int *major, int *minor,
# CUdevice dev);
'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int),
cu_device),
# CUresult cuDevicePrimaryCtxGetState(
# CUdevice dev,
# unsigned int* flags,
# int* active)
'cuDevicePrimaryCtxGetState': (c_int,
cu_device, POINTER(c_uint), POINTER(c_int)),
# CUresult cuDevicePrimaryCtxRelease ( CUdevice dev )
'cuDevicePrimaryCtxRelease': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxReset ( CUdevice dev )
'cuDevicePrimaryCtxReset': (c_int, cu_device),
# CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev )
'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device),
# CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags )
'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint),
# CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags,
# CUdevice dev);
'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device),
# CUresult cuCtxGetDevice ( CUdevice * device )
'cuCtxGetDevice': (c_int, POINTER(cu_device)),
# CUresult cuCtxGetCurrent (CUcontext *pctx);
'cuCtxGetCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxPushCurrent (CUcontext pctx);
'cuCtxPushCurrent': (c_int, cu_context),
# CUresult cuCtxPopCurrent (CUcontext *pctx);
'cuCtxPopCurrent': (c_int, POINTER(cu_context)),
# CUresult cuCtxDestroy(CUcontext pctx);
'cuCtxDestroy': (c_int, cu_context),
# CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
# unsigned int numOptions,
# CUjit_option *options,
# void **optionValues);
'cuModuleLoadDataEx': (c_int, cu_module, c_void_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult cuModuleUnload(CUmodule hmod);
'cuModuleUnload': (c_int, cu_module),
# CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
# const char *name);
'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p),
# CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule
# hmod, const char* name )
'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t),
cu_module, c_char_p),
# CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc,
# CUfunc_cache config);
'cuFuncSetCacheConfig': (c_int, cu_function, c_uint),
# CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t),
# CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
# unsigned int flags);
'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t),
# CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
# size_t N, CUstream hStream);
'cuMemsetD8Async': (c_int,
cu_device_ptr, c_uint8, c_size_t, cu_stream),
# CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount);
'cuMemcpyHtoD': (c_int, cu_device_ptr, c_void_p, c_size_t),
# CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
# size_t ByteCount, CUstream hStream);
'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount);
'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount);
'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t),
# CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
# size_t ByteCount, CUstream hStream);
'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t,
cu_stream),
# CUresult cuMemFree(CUdeviceptr dptr);
'cuMemFree': (c_int, cu_device_ptr),
# CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint),
# CUresult cuStreamDestroy(CUstream hStream);
'cuStreamDestroy': (c_int, cu_stream),
# CUresult cuStreamSynchronize(CUstream hStream);
'cuStreamSynchronize': (c_int, cu_stream),
# CUresult cuStreamAddCallback(
# CUstream hStream,
# CUstreamCallback callback,
# void* userData,
# unsigned int flags)
'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj,
py_object, c_uint),
# CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams,
# void ** extra)
'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p), POINTER(c_void_p)),
# CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX,
# unsigned int gridDimY,
# unsigned int gridDimZ,
# unsigned int blockDimX,
# unsigned int blockDimY,
# unsigned int blockDimZ,
# unsigned int sharedMemBytes,
# CUstream hStream, void **kernelParams)
'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint,
c_uint, c_uint, c_uint, c_uint, cu_stream,
POINTER(c_void_p)),
# CUresult cuMemHostAlloc ( void ** pp,
# size_t bytesize,
# unsigned int Flags
# )
'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemFreeHost ( void * p )
'cuMemFreeHost': (c_int, c_void_p),
# CUresult cuMemHostRegister(void * p,
# size_t bytesize,
# unsigned int Flags)
'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint),
# CUresult cuMemHostUnregister(void * p)
'cuMemHostUnregister': (c_int, c_void_p),
# CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr,
# void * p,
# unsigned int Flags)
'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr),
c_void_p, c_uint),
# CUresult cuMemGetInfo(size_t * free, size_t * total)
'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)),
# CUresult cuEventCreate ( CUevent * phEvent,
# unsigned int Flags )
'cuEventCreate': (c_int, POINTER(cu_event), c_uint),
# CUresult cuEventDestroy ( CUevent hEvent )
'cuEventDestroy': (c_int, cu_event),
# CUresult cuEventElapsedTime ( float * pMilliseconds,
# CUevent hStart,
# CUevent hEnd )
'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event),
# CUresult cuEventQuery ( CUevent hEvent )
'cuEventQuery': (c_int, cu_event),
# CUresult cuEventRecord ( CUevent hEvent,
# CUstream hStream )
'cuEventRecord': (c_int, cu_event, cu_stream),
# CUresult cuEventSynchronize ( CUevent hEvent )
'cuEventSynchronize': (c_int, cu_event),
# CUresult cuStreamWaitEvent ( CUstream hStream,
# CUevent hEvent,
# unsigned int Flags )
'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint),
# CUresult cuPointerGetAttribute (
# void *data,
# CUpointer_attribute attribute,
# CUdeviceptr ptr)
'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr),
# CUresult cuMemGetAddressRange ( CUdeviceptr * pbase,
# size_t * psize,
# CUdeviceptr dptr
# )
'cuMemGetAddressRange': (c_int,
POINTER(cu_device_ptr),
POINTER(c_size_t),
cu_device_ptr),
# CUresult cuMemHostGetFlags ( unsigned int * pFlags,
# void * p )
'cuMemHostGetFlags': (c_int,
POINTER(c_uint),
c_void_p),
# CUresult cuCtxSynchronize ( void )
'cuCtxSynchronize' : (c_int,),
# CUresult
# cuLinkCreate(unsigned int numOptions, CUjit_option *options,
# void **optionValues, CUlinkState *stateOut);
'cuLinkCreate': (c_int,
c_uint, POINTER(cu_jit_option),
POINTER(c_void_p), POINTER(cu_link_state)),
# CUresult
# cuLinkAddData(CUlinkState state, CUjitInputType type, void *data,
# size_t size, const char *name, unsigned
# int numOptions, CUjit_option *options,
# void **optionValues);
'cuLinkAddData': (c_int,
cu_link_state, cu_jit_input_type, c_void_p,
c_size_t, c_char_p, c_uint, POINTER(cu_jit_option),
POINTER(c_void_p)),
# CUresult
# cuLinkAddFile(CUlinkState state, CUjitInputType type,
# const char *path, unsigned int numOptions,
# CUjit_option *options, void **optionValues);
'cuLinkAddFile': (c_int,
cu_link_state, cu_jit_input_type, c_char_p, c_uint,
POINTER(cu_jit_option), POINTER(c_void_p)),
# CUresult CUDAAPI
# cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut)
'cuLinkComplete': (c_int,
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)),
# CUresult CUDAAPI
# cuLinkDestroy(CUlinkState state)
'cuLinkDestroy': (c_int, cu_link_state),
# cuProfilerStart ( void )
'cuProfilerStart': (c_int,),
# cuProfilerStop ( void )
'cuProfilerStop': (c_int,),
# CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib,
# CUfunction hfunc )
'cuFuncGetAttribute': (c_int,
POINTER(c_int), cu_function_attribute, cu_function),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize);
'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int),
cu_function, c_size_t,
c_uint),
# CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
# int *numBlocks,
# CUfunction func,
# int blockSize,
# size_t dynamicSMemSize,
# unsigned int flags);
'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int,
POINTER(c_int),
cu_function,
c_size_t, c_uint),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit);
'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int),
cu_function, cu_occupancy_b2d_size,
c_size_t, c_int),
# CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
# int *minGridSize, int *blockSize,
# CUfunction func,
# CUoccupancyB2DSize blockSizeToDynamicSMemSize,
# size_t dynamicSMemSize, int blockSizeLimit,
# unsigned int flags);
'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int),
POINTER(c_int), cu_function,
cu_occupancy_b2d_size,
c_size_t, c_int, c_uint),
# CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr )
'cuIpcGetMemHandle': (c_int,
POINTER(cu_ipc_mem_handle), cu_device_ptr),
# CUresult cuIpcOpenMemHandle(
# CUdeviceptr* pdptr,
# CUipcMemHandle handle,
# unsigned int Flags)
'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle,
c_uint),
# CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr )
'cuIpcCloseMemHandle': (c_int, cu_device_ptr),
# CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags)
'cuCtxEnablePeerAccess': (c_int, cu_context, c_int),
# CUresult cuDeviceCanAccessPeer ( int* canAccessPeer,
# CUdevice dev, CUdevice peerDev )
'cuDeviceCanAccessPeer': (c_int,
POINTER(c_int), cu_device, cu_device),
# CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev )
'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device),
}
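# Sketch of how one of these prototypes could be bound by hand with ctypes,
# outside the driver wrapper that normally consumes API_PROTOTYPES. The
# library name below is an assumption (Linux); other platforms differ.
def _example_bind_prototype(name='cuDriverGetVersion'):
    import ctypes
    lib = ctypes.CDLL('libcuda.so.1')    # assumed driver library on Linux
    fn = getattr(lib, name)
    restype, *argtypes = API_PROTOTYPES[name]
    fn.restype = restype
    fn.argtypes = list(argtypes)
    version = c_int(0)
    # cuDriverGetVersion needs no prior cuInit(); 0 is CUDA_SUCCESS
    assert fn(ctypes.byref(version)) == 0
    return version.value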

View File

@@ -0,0 +1,452 @@
from collections import namedtuple
import itertools
import functools
import operator
import ctypes
import numpy as np
from numba import _helperlib
Extent = namedtuple("Extent", ["begin", "end"])
attempt_nocopy_reshape = ctypes.CFUNCTYPE(
ctypes.c_int,
ctypes.c_long, # nd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # dims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # strides
ctypes.c_long, # newnd
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newdims
np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides
ctypes.c_long, # itemsize
ctypes.c_int, # is_f_order
)(_helperlib.c_helpers['attempt_nocopy_reshape'])
class Dim(object):
"""A single dimension of the array
Attributes
----------
start:
start offset
stop:
stop offset
size:
number of items
stride:
item stride
"""
__slots__ = 'start', 'stop', 'size', 'stride', 'single'
def __init__(self, start, stop, size, stride, single):
self.start = start
self.stop = stop
self.size = size
self.stride = stride
self.single = single
assert not single or size == 1
def __getitem__(self, item):
if isinstance(item, slice):
start, stop, step = item.indices(self.size)
stride = step * self.stride
start = self.start + start * abs(self.stride)
stop = self.start + stop * abs(self.stride)
if stride == 0:
size = 1
else:
size = _compute_size(start, stop, stride)
ret = Dim(
start=start,
stop=stop,
size=size,
stride=stride,
single=False
)
return ret
else:
sliced = self[item:item + 1] if item != -1 else self[-1:]
if sliced.size != 1:
raise IndexError
return Dim(
start=sliced.start,
stop=sliced.stop,
size=sliced.size,
stride=sliced.stride,
single=True,
)
def get_offset(self, idx):
return self.start + idx * self.stride
def __repr__(self):
strfmt = "Dim(start=%s, stop=%s, size=%s, stride=%s)"
return strfmt % (self.start, self.stop, self.size, self.stride)
def normalize(self, base):
return Dim(start=self.start - base, stop=self.stop - base,
size=self.size, stride=self.stride, single=self.single)
def copy(self, start=None, stop=None, size=None, stride=None, single=None):
if start is None:
start = self.start
if stop is None:
stop = self.stop
if size is None:
size = self.size
if stride is None:
stride = self.stride
if single is None:
single = self.single
return Dim(start, stop, size, stride, single)
def is_contiguous(self, itemsize):
return self.stride == itemsize
def compute_index(indices, dims):
return sum(d.get_offset(i) for i, d in zip(indices, dims))
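# Example: for a C-contiguous 2x3 array of 8-byte items based at offset 0,
# the dims carry strides (24, 8), so index (1, 2) maps to byte offset 40.
def _example_compute_index():
    dims = [Dim(0, 48, 2, 24, single=False),
            Dim(0, 24, 3, 8, single=False)]
    assert compute_index((1, 2), dims) == 1 * 24 + 2 * 8  # == 40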
class Element(object):
is_array = False
def __init__(self, extent):
self.extent = extent
def iter_contiguous_extent(self):
yield self.extent
class Array(object):
"""A dummy numpy array-like object. Consider it an array without the
actual data, but offset from the base data pointer.
Attributes
----------
dims: tuple of Dim
describing each dimension of the array
ndim: int
number of dimensions
shape: tuple of int
size of each dimension
strides: tuple of int
stride of each dimension
itemsize: int
size of one array item in bytes
extent: (start, end)
start and end offset containing the memory region
"""
is_array = True
@classmethod
def from_desc(cls, offset, shape, strides, itemsize):
dims = []
for ashape, astride in zip(shape, strides):
dim = Dim(offset, offset + ashape * astride, ashape, astride,
single=False)
dims.append(dim)
offset = 0 # offset only applies to the first dimension
return cls(dims, itemsize)
def __init__(self, dims, itemsize):
self.dims = tuple(dims)
self.ndim = len(self.dims)
self.shape = tuple(dim.size for dim in self.dims)
self.strides = tuple(dim.stride for dim in self.dims)
self.itemsize = itemsize
self.size = functools.reduce(operator.mul, self.shape, 1)
self.extent = self._compute_extent()
self.flags = self._compute_layout()
def _compute_layout(self):
# The logic here is based on that in _UpdateContiguousFlags from
# numpy/core/src/multiarray/flagsobject.c in NumPy v1.19.1 (commit
# 13661ac70).
# https://github.com/numpy/numpy/blob/maintenance/1.19.x/numpy/core/src/multiarray/flagsobject.c#L123-L191
# Records have no dims, and we can treat them as contiguous
if not self.dims:
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# If this is a broadcast array then it is not contiguous
if any([dim.stride == 0 for dim in self.dims]):
return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False}
flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
# Check C contiguity
sd = self.itemsize
for dim in reversed(self.dims):
if dim.size == 0:
# Contiguous by definition
return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True}
if dim.size != 1:
if dim.stride != sd:
flags['C_CONTIGUOUS'] = False
sd *= dim.size
# Check F contiguity
sd = self.itemsize
for dim in self.dims:
if dim.size != 1:
if dim.stride != sd:
flags['F_CONTIGUOUS'] = False
return flags
sd *= dim.size
return flags
def _compute_extent(self):
firstidx = [0] * self.ndim
lastidx = [s - 1 for s in self.shape]
start = compute_index(firstidx, self.dims)
stop = compute_index(lastidx, self.dims) + self.itemsize
stop = max(stop, start) # ensure positive extent
return Extent(start, stop)
def __repr__(self):
return '<Array dims=%s itemsize=%s>' % (self.dims, self.itemsize)
def __getitem__(self, item):
if not isinstance(item, tuple):
item = [item]
else:
item = list(item)
nitem = len(item)
ndim = len(self.dims)
if nitem > ndim:
raise IndexError("%d extra indices given" % (nitem - ndim,))
# Add empty slices for missing indices
while len(item) < ndim:
item.append(slice(None, None))
dims = [dim.__getitem__(it) for dim, it in zip(self.dims, item)]
newshape = [d.size for d in dims if not d.single]
arr = Array(dims, self.itemsize)
if newshape:
return arr.reshape(*newshape)[0]
else:
return Element(arr.extent)
@property
def is_c_contig(self):
return self.flags['C_CONTIGUOUS']
@property
def is_f_contig(self):
return self.flags['F_CONTIGUOUS']
def iter_contiguous_extent(self):
""" Generates extents
"""
if self.is_c_contig or self.is_f_contig:
yield self.extent
else:
if self.dims[0].stride < self.dims[-1].stride:
innerdim = self.dims[0]
outerdims = self.dims[1:]
outershape = self.shape[1:]
else:
innerdim = self.dims[-1]
outerdims = self.dims[:-1]
outershape = self.shape[:-1]
if innerdim.is_contiguous(self.itemsize):
oslen = [range(s) for s in outershape]
for indices in itertools.product(*oslen):
base = compute_index(indices, outerdims)
yield base + innerdim.start, base + innerdim.stop
else:
oslen = [range(s) for s in self.shape]
for indices in itertools.product(*oslen):
offset = compute_index(indices, self.dims)
yield offset, offset + self.itemsize
def reshape(self, *newdims, **kws):
oldnd = self.ndim
newnd = len(newdims)
if newdims == self.shape:
return self, None
order = kws.pop('order', 'C')
if kws:
raise TypeError('unknown keyword arguments %s' % kws.keys())
if order not in 'CFA':
raise ValueError('order not C|F|A')
# check for exactly one instance of -1 in newdims
# https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501
unknownidx = -1
knownsize = 1
for i, dim in enumerate(newdims):
if dim < 0:
if unknownidx == -1:
unknownidx = i
else:
raise ValueError("can only specify one unknown dimension")
else:
knownsize *= dim
# compute the missing dimension
if unknownidx >= 0:
if knownsize == 0 or self.size % knownsize != 0:
raise ValueError("cannot infer valid shape "
"for unknown dimension")
else:
newdims = newdims[0:unknownidx] \
+ (self.size // knownsize,) \
+ newdims[unknownidx + 1:]
newsize = functools.reduce(operator.mul, newdims, 1)
if order == 'A':
order = 'F' if self.is_f_contig else 'C'
if newsize != self.size:
raise ValueError("reshape changes the size of the array")
if self.is_c_contig or self.is_f_contig:
if order == 'C':
newstrides = list(iter_strides_c_contig(self, newdims))
elif order == 'F':
newstrides = list(iter_strides_f_contig(self, newdims))
else:
raise AssertionError("unreachable")
else:
newstrides = np.empty(newnd, np.ctypeslib.c_intp)
# need to keep these around in variables, not temporaries, so they
# don't get GC'ed before we call into the C code
olddims = np.array(self.shape, dtype=np.ctypeslib.c_intp)
oldstrides = np.array(self.strides, dtype=np.ctypeslib.c_intp)
newdims = np.array(newdims, dtype=np.ctypeslib.c_intp)
if not attempt_nocopy_reshape(
oldnd,
olddims,
oldstrides,
newnd,
newdims,
newstrides,
self.itemsize,
order == 'F',
):
raise NotImplementedError('reshape would require copy')
ret = self.from_desc(self.extent.begin, shape=newdims,
strides=newstrides, itemsize=self.itemsize)
return ret, list(self.iter_contiguous_extent())
def squeeze(self, axis=None):
newshape, newstrides = [], []
if axis is None:
for length, stride in zip(self.shape, self.strides):
if length != 1:
newshape.append(length)
newstrides.append(stride)
else:
if not isinstance(axis, tuple):
axis = (axis,)
for ax in axis:
if self.shape[ax] != 1:
raise ValueError(
"cannot select an axis to squeeze out which has size "
"not equal to one"
)
for i, (length, stride) in enumerate(zip(self.shape, self.strides)):
if i not in axis:
newshape.append(length)
newstrides.append(stride)
newarr = self.from_desc(
self.extent.begin,
shape=newshape,
strides=newstrides,
itemsize=self.itemsize,
)
return newarr, list(self.iter_contiguous_extent())
def ravel(self, order='C'):
if order not in 'CFA':
raise ValueError('order not C|F|A')
if (order in 'CA' and self.is_c_contig
or order in 'FA' and self.is_f_contig):
newshape = (self.size,)
newstrides = (self.itemsize,)
arr = self.from_desc(self.extent.begin, newshape, newstrides,
self.itemsize)
return arr, list(self.iter_contiguous_extent())
else:
raise NotImplementedError("ravel on non-contiguous array")
def iter_strides_f_contig(arr, shape=None):
"""yields the f-contiguous strides
"""
shape = arr.shape if shape is None else shape
itemsize = arr.itemsize
yield itemsize
sum = 1
for s in shape[:-1]:
sum *= s
yield sum * itemsize
def iter_strides_c_contig(arr, shape=None):
"""yields the c-contiguous strides
"""
shape = arr.shape if shape is None else shape
itemsize = arr.itemsize
def gen():
yield itemsize
sum = 1
for s in reversed(shape[1:]):
sum *= s
yield sum * itemsize
for i in reversed(list(gen())):
yield i
def is_element_indexing(item, ndim):
if isinstance(item, slice):
return False
elif isinstance(item, tuple):
if len(item) == ndim:
if not any(isinstance(it, slice) for it in item):
return True
else:
return True
return False
def _compute_size(start, stop, step):
"""Algorithm adapted from cpython rangeobject.c
"""
if step > 0:
lo = start
hi = stop
else:
lo = stop
hi = start
step = -step
if lo >= hi:
return 0
return (hi - lo - 1) // step + 1
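# _compute_size mirrors CPython's range() length computation: for any
# (start, stop, step) with step != 0 it should agree with len(range(...)).
def _example_compute_size():
    for args in [(0, 10, 3), (10, 0, -3), (5, 5, 1), (0, 1, 100)]:
        assert _compute_size(*args) == len(range(*args))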

View File

@@ -0,0 +1,607 @@
"""
Enum values for the CUDA driver. Information about the values
can be found on the official NVIDIA documentation website.
ref: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
anchor: #group__CUDA__TYPES
"""
# Error codes
CUDA_SUCCESS = 0
CUDA_ERROR_INVALID_VALUE = 1
CUDA_ERROR_OUT_OF_MEMORY = 2
CUDA_ERROR_NOT_INITIALIZED = 3
CUDA_ERROR_DEINITIALIZED = 4
CUDA_ERROR_PROFILER_DISABLED = 5
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
CUDA_ERROR_STUB_LIBRARY = 34
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
CUDA_ERROR_NO_DEVICE = 100
CUDA_ERROR_INVALID_DEVICE = 101
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
CUDA_ERROR_INVALID_IMAGE = 200
CUDA_ERROR_INVALID_CONTEXT = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
CUDA_ERROR_MAP_FAILED = 205
CUDA_ERROR_UNMAP_FAILED = 206
CUDA_ERROR_ARRAY_IS_MAPPED = 207
CUDA_ERROR_ALREADY_MAPPED = 208
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
CUDA_ERROR_ALREADY_ACQUIRED = 210
CUDA_ERROR_NOT_MAPPED = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
CUDA_ERROR_ECC_UNCORRECTABLE = 214
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
CUDA_ERROR_INVALID_PTX = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225
CUDA_ERROR_INVALID_SOURCE = 300
CUDA_ERROR_FILE_NOT_FOUND = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
CUDA_ERROR_OPERATING_SYSTEM = 304
CUDA_ERROR_INVALID_HANDLE = 400
CUDA_ERROR_ILLEGAL_STATE = 401
CUDA_ERROR_NOT_FOUND = 500
CUDA_ERROR_NOT_READY = 600
CUDA_ERROR_ILLEGAL_ADDRESS = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
CUDA_ERROR_LAUNCH_TIMEOUT = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
CUDA_ERROR_ASSERT = 710
CUDA_ERROR_TOO_MANY_PEERS = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
CUDA_ERROR_MISALIGNED_ADDRESS = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
CUDA_ERROR_INVALID_PC = 718
CUDA_ERROR_LAUNCH_FAILED = 719
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
CUDA_ERROR_NOT_PERMITTED = 800
CUDA_ERROR_NOT_SUPPORTED = 801
CUDA_ERROR_SYSTEM_NOT_READY = 802
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
CUDA_ERROR_MPS_RPC_FAILURE = 806
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
CUDA_ERROR_CDP_NOT_SUPPORTED = 811
CUDA_ERROR_CDP_VERSION_MISMATCH = 812
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
CUDA_ERROR_CAPTURED_EVENT = 907
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
CUDA_ERROR_TIMEOUT = 909
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
CUDA_ERROR_EXTERNAL_DEVICE = 911
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
CUDA_ERROR_UNKNOWN = 999
# Function cache configurations
# no preference for shared memory or L1 (default)
CU_FUNC_CACHE_PREFER_NONE = 0x00
# prefer larger shared memory and smaller L1 cache
CU_FUNC_CACHE_PREFER_SHARED = 0x01
# prefer larger L1 cache and smaller shared memory
CU_FUNC_CACHE_PREFER_L1 = 0x02
# prefer equal sized L1 cache and shared memory
CU_FUNC_CACHE_PREFER_EQUAL = 0x03
# Context creation flags
# Automatic scheduling
CU_CTX_SCHED_AUTO = 0x00
# Set spin as default scheduling
CU_CTX_SCHED_SPIN = 0x01
# Set yield as default scheduling
CU_CTX_SCHED_YIELD = 0x02
# Set blocking synchronization as default scheduling
CU_CTX_SCHED_BLOCKING_SYNC = 0x04
CU_CTX_SCHED_MASK = 0x07
# Support mapped pinned allocations
# This flag was deprecated as of CUDA 11.0 and it no longer has any effect.
# All contexts as of CUDA 3.2 behave as though the flag is enabled.
CU_CTX_MAP_HOST = 0x08
# Keep local memory allocation after launch
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10
# Trigger coredumps from exceptions in this context
CU_CTX_COREDUMP_ENABLE = 0x20
# Enable user pipe to trigger coredumps in this context
CU_CTX_USER_COREDUMP_ENABLE = 0x40
# Force synchronous blocking on cudaMemcpy/cudaMemset
CU_CTX_SYNC_MEMOPS = 0x80
CU_CTX_FLAGS_MASK = 0xff
# DEFINES
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_DEVICEMAP = 0x02
# If set, host memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 streaming load instruction
# (MOVNTDQA).
# Flag for cuMemHostAlloc()
CU_MEMHOSTALLOC_WRITECOMBINED = 0x04
# If set, host memory is portable between CUDA contexts.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_PORTABLE = 0x01
# If set, host memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer() may be called on the host pointer.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_DEVICEMAP = 0x02
# If set, the passed memory pointer is treated as pointing to some
# memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
# On Windows the flag is a no-op. On Linux that memory is marked
# as non cache-coherent for the GPU and is expected
# to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED
# if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older
# Linux kernel versions. On all other platforms, it is not supported
# and CUDA_ERROR_NOT_SUPPORTED is returned.
# Flag for cuMemHostRegister()
CU_MEMHOSTREGISTER_IOMEMORY = 0x04
# If set, the passed memory pointer is treated as pointing to memory
# that is considered read-only by the device. On platforms without
# CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
# this flag is required in order to register memory mapped
# to the CPU as read-only. Support for the use of this flag can be
# queried from the device attribute
# CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.
# Using this flag with a current context associated with a device
# that does not have this attribute set will cause cuMemHostRegister
# to error with CUDA_ERROR_NOT_SUPPORTED.
CU_MEMHOSTREGISTER_READ_ONLY = 0x08
# CUDA Mem Attach Flags
# If set, managed memory is accessible from all streams on all devices.
CU_MEM_ATTACH_GLOBAL = 0x01
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory is
# only accessible on the host (unless explicitly attached to a stream
# with cudaStreamAttachMemAsync, in which case it can be used in kernels
# launched on that stream).
CU_MEM_ATTACH_HOST = 0x02
# If set on a platform where the device attribute
# cudaDevAttrConcurrentManagedAccess is zero, then managed memory accesses
# on the associated device must only be from a single stream.
CU_MEM_ATTACH_SINGLE = 0x04
# Event creation flags
# Default event flag
CU_EVENT_DEFAULT = 0x0
# Event uses blocking synchronization
CU_EVENT_BLOCKING_SYNC = 0x1
# Event will not record timing data
CU_EVENT_DISABLE_TIMING = 0x2
# Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
CU_EVENT_INTERPROCESS = 0x4
# Pointer information
# The CUcontext on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_CONTEXT = 1
# The CUmemorytype describing the physical location of a pointer
CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2
# The address at which a pointer's memory may be accessed on the device
CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3
# The address at which a pointer's memory may be accessed on the host
CU_POINTER_ATTRIBUTE_HOST_POINTER = 4
# A pair of tokens for use with the nv-p2p.h Linux kernel interface
CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5
# Synchronize every synchronous memory operation initiated on this region
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6
# A process-wide unique ID for an allocated memory region
CU_POINTER_ATTRIBUTE_BUFFER_ID = 7
# Indicates if the pointer points to managed memory
CU_POINTER_ATTRIBUTE_IS_MANAGED = 8
# A device ordinal of a device on which a pointer was allocated or registered
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
# 1 if this pointer maps to an allocation
# that is suitable for cudaIpcGetMemHandle, 0 otherwise
CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10
# Starting address for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11
# Size of the address range for this requested pointer
CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12
# 1 if this pointer is in a valid address range
# that is mapped to a backing allocation, 0 otherwise
CU_POINTER_ATTRIBUTE_MAPPED = 13
# Bitmask of allowed CUmemAllocationHandleType for this allocation
CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14
# 1 if the memory this pointer is referencing
# can be used with the GPUDirect RDMA API
CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15
# Returns the access flags the device associated
# with the current context has on the corresponding
# memory referenced by the pointer given
CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16
# Returns the mempool handle for the allocation
# if it was allocated from a mempool. Otherwise returns NULL
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17
# Size of the actual underlying mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18
# The start address of the mapping that the pointer belongs to
CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19
# A process-wide unique id corresponding to the
# physical allocation the pointer belongs to
CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20
# Memory types
# Host memory
CU_MEMORYTYPE_HOST = 0x01
# Device memory
CU_MEMORYTYPE_DEVICE = 0x02
# Array memory
CU_MEMORYTYPE_ARRAY = 0x03
# Unified device or host memory
CU_MEMORYTYPE_UNIFIED = 0x04
# Device code formats
# Compiled device-class-specific device code
# Applicable options: none
CU_JIT_INPUT_CUBIN = 0
# PTX source code
# Applicable options: PTX compiler options
CU_JIT_INPUT_PTX = 1
# Bundle of multiple cubins and/or PTX of some device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_FATBINARY = 2
# Host object with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_OBJECT = 3
# Archive of host objects with embedded device code
# Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
CU_JIT_INPUT_LIBRARY = 4
CU_JIT_NUM_INPUT_TYPES = 6
# Online compiler and linker options
# Max number of registers that a thread may use.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_MAX_REGISTERS = 0
# IN: Specifies minimum number of threads per block to target compilation
# for
# OUT: Returns the number of threads the compiler actually targeted.
# This restricts the resource utilization of the compiler (e.g. max
# registers) such that a block with the given number of threads should be
# able to launch based on register limitations. Note, this option does not
# currently take into account any other resource limitations, such as
# shared memory utilization.
# Cannot be combined with ::CU_JIT_TARGET.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_THREADS_PER_BLOCK = 1
# Overwrites the option value with the total wall clock time, in
# milliseconds, spent in the compiler and linker
# Option type: float
# Applies to: compiler and linker
CU_JIT_WALL_TIME = 2
# Pointer to a buffer in which to print any log messages
# that are informational in nature (the buffer size is specified via
# option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER = 3
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4
# Pointer to a buffer in which to print any log messages that
# reflect errors (the buffer size is specified via option
# ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
# Option type: char *
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER = 5
# IN: Log buffer size in bytes. Log messages will be capped at this size
# (including null terminator)
# OUT: Amount of log buffer filled with messages
# Option type: unsigned int
# Applies to: compiler and linker
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6
# Level of optimizations to apply to generated code (0 - 4), with 4
# being the default and highest level of optimizations.
# Option type: unsigned int
# Applies to: compiler only
CU_JIT_OPTIMIZATION_LEVEL = 7
# No option value required. Determines the target based on the current
# attached context (default)
# Option type: No option value needed
# Applies to: compiler and linker
CU_JIT_TARGET_FROM_CUCONTEXT = 8
# Target is chosen based on supplied ::CUjit_target. Cannot be
# combined with ::CU_JIT_THREADS_PER_BLOCK.
# Option type: unsigned int for enumerated type ::CUjit_target
# Applies to: compiler and linker
CU_JIT_TARGET = 9
# Specifies choice of fallback strategy if matching cubin is not found.
# Choice is based on supplied ::CUjit_fallback.
# Option type: unsigned int for enumerated type ::CUjit_fallback
# Applies to: compiler only
CU_JIT_FALLBACK_STRATEGY = 10
# Specifies whether to create debug information in output (-g)
# (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_GENERATE_DEBUG_INFO = 11
# Generate verbose log messages (0: false, default)
# Option type: int
# Applies to: compiler and linker
CU_JIT_LOG_VERBOSE = 12
# Generate line number information (-lineinfo) (0: false, default)
# Option type: int
# Applies to: compiler only
CU_JIT_GENERATE_LINE_INFO = 13
# Specifies whether to enable caching explicitly (-dlcm)
# Choice is based on supplied ::CUjit_cacheMode_enum.
# Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum
# Applies to: compiler only
CU_JIT_CACHE_MODE = 14
# CUfunction_attribute
# The maximum number of threads per block, beyond which a launch of the
# function would fail. This number depends on both the function and the
# device on which the function is currently loaded.
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0
# The size in bytes of statically-allocated shared memory required by
# this function. This does not include dynamically-allocated shared
# memory requested by the user at runtime.
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1
# The size in bytes of user-allocated constant memory required by this
# function.
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2
# The size in bytes of local memory used by each thread of this function.
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3
# The number of registers used by each thread of this function.
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
# The PTX virtual architecture version for which the function was
# compiled. This value is the major PTX version * 10 + the minor PTX
# version, so a PTX version 1.3 function would return the value 13.
# Note that this may return the undefined value of 0 for cubins
# compiled prior to CUDA 3.0.
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5
# The binary architecture version for which the function was compiled.
# This value is the major binary version * 10 + the minor binary version,
# so a binary version 1.3 function would return the value 13. Note that
# this will return a value of 10 for legacy cubins that do not have a
# properly-encoded binary architecture version.
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6
# The attribute to indicate whether the function has been compiled
# with user specified option "-Xptxas --dlcm=ca" set
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7
# The maximum size in bytes of dynamically-allocated shared memory
# that can be used by this function. If the user-specified
# dynamic shared memory size is larger than this value,
# the launch will fail. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8
# On devices where the L1 cache and shared memory use the same
# hardware resources, this sets the shared memory carveout preference,
# in percent of the total shared memory. Refer to
# CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
# This is only a hint, and the driver can choose a different ratio
# if required to execute the function.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9
# If this attribute is set, the kernel must launch with a valid cluster
# size specified. See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10
# The required cluster width in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11
# The required cluster height in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12
# The required cluster depth in blocks. The values must either all be 0
# or all be positive. The validity of the cluster dimensions
# is otherwise checked at launch time. If the value is set during
# compile time, it cannot be set at runtime.
# Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13
# Whether the function can be launched with non-portable cluster size.
# 1 is allowed, 0 is disallowed. A non-portable cluster size may only
# function on the specific SKUs the program is tested on.
# The launch might fail if the program is run on a different hardware platform.
# For more details, refer to:
# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES
CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14
# The block scheduling policy of a function.
# The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
# See cuFuncSetAttribute, cuKernelSetAttribute
CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15
# Device attributes
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_WIDTH = 21
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_WIDTH = 22
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_HEIGHT = 23
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH = 24
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT = 25
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH = 26
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_WIDTH = 27
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_HEIGHT = 28
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LAYERED_LAYERS = 29
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTI_PROCESSOR = 39
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_WIDTH = 42
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LAYERED_LAYERS = 43
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_WIDTH = 45
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_GATHER_HEIGHT = 46
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_WIDTH_ALT = 47
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_HEIGHT_ALT = 48
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_3D_DEPTH_ALT = 49
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_WIDTH = 52
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_WIDTH = 53
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_CUBEMAP_LAYERED_LAYERS = 54
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_WIDTH = 55
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_WIDTH = 56
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_HEIGHT = 57
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_WIDTH = 58
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_HEIGHT = 59
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_3D_DEPTH = 60
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_WIDTH = 61
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_1D_LAYERED_LAYERS = 62
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_WIDTH = 63
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_HEIGHT = 64
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_2D_LAYERED_LAYERS = 65
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_WIDTH = 66
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_WIDTH = 67
CU_DEVICE_ATTRIBUTE_MAX_SURFACE_CUBEMAP_LAYERED_LAYERS = 68
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_LINEAR_WIDTH = 69
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_WIDTH = 70
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_HEIGHT = 71
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_LINEAR_PITCH = 72
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_WIDTH = 73
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_2D_MIPMAPPED_HEIGHT = 74
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
CU_DEVICE_ATTRIBUTE_MAX_TEXTURE_1D_MIPMAPPED_WIDTH = 77
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83
CU_DEVICE_ATTRIBUTE_IS_MULTI_GPU_BOARD = 84
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97

View File

@@ -0,0 +1,36 @@
class CudaDriverError(Exception):
pass
class CudaRuntimeError(Exception):
pass
class CudaSupportError(ImportError):
pass
class NvvmError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvvmSupportError(ImportError):
pass
class NvvmWarning(Warning):
pass
class NvrtcError(Exception):
def __str__(self):
return '\n'.join(map(str, self.args))
class NvrtcCompilationError(NvrtcError):
pass
class NvrtcSupportError(ImportError):
pass
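# Example (a sketch): NvvmError and NvrtcError join their args across lines:
#   str(NvvmError('Failed to compile', 'NVVM_ERROR_COMPILATION'))
#   # -> 'Failed to compile\nNVVM_ERROR_COMPILATION'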

View File

@@ -0,0 +1,176 @@
"""CUDA Toolkit libraries lookup utilities.
CUDA Toolkit libraries can be available via either:
- the `cuda-nvcc` and `cuda-nvrtc` conda packages for CUDA 12,
- the `cudatoolkit` conda package for CUDA 11,
- a user supplied location from CUDA_HOME,
- a system wide location,
- package-specific locations (e.g. the Debian NVIDIA packages),
- or can be discovered by the system loader.
"""
import os
import sys
import ctypes
from numba.misc.findlib import find_lib
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
from numba.cuda.cudadrv.error import CudaSupportError
if sys.platform == 'win32':
_dllnamepattern = '%s.dll'
_staticnamepattern = '%s.lib'
elif sys.platform == 'darwin':
_dllnamepattern = 'lib%s.dylib'
_staticnamepattern = 'lib%s.a'
else:
_dllnamepattern = 'lib%s.so'
_staticnamepattern = 'lib%s.a'
def get_libdevice():
d = get_cuda_paths()
paths = d['libdevice'].info
return paths
def open_libdevice():
with open(get_libdevice(), 'rb') as bcfile:
return bcfile.read()
def get_cudalib(lib, static=False):
"""
Find the path of a CUDA library based on a search of known locations. If
the search fails, return a generic filename for the library (e.g.
'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system
loader's search mechanism.
"""
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm'
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
libdir = get_cuda_paths()[dir_type].info
candidates = find_lib(lib, libdir, static=static)
namepattern = _staticnamepattern if static else _dllnamepattern
return max(candidates) if candidates else namepattern % lib
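# For example (paths illustrative), get_cudalib('nvrtc') returns an absolute
# path such as '/usr/local/cuda/lib64/libnvrtc.so' when the search succeeds,
# and the bare 'libnvrtc.so' otherwise, deferring to the system loader.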
def open_cudalib(lib):
path = get_cudalib(lib)
return ctypes.CDLL(path)
def check_static_lib(path):
if not os.path.isfile(path):
raise FileNotFoundError(f'{path} not found')
def _get_source_variable(lib, static=False):
if lib == 'nvvm':
return get_cuda_paths()['nvvm'].by
elif lib == 'libdevice':
return get_cuda_paths()['libdevice'].by
else:
dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
return get_cuda_paths()[dir_type].by
def test():
"""Test library lookup. Path info is printed to stdout.
"""
failed = False
# Check for the driver
try:
dlloader, candidates = locate_driver_and_loader()
print('Finding driver from candidates:')
for location in candidates:
print(f'\t{location}')
print(f'Using loader {dlloader}')
print('\tTrying to load driver', end='...')
dll, path = load_driver(dlloader, candidates)
print('\tok')
print(f'\t\tLoaded from {path}')
except CudaSupportError as e:
print(f'\tERROR: failed to open driver: {e}')
failed = True
# Find the absolute location of the driver on Linux. Various driver-related
# issues have been reported by WSL2 users, and it is almost always due to a
    # Linux (i.e. non-WSL2) driver being installed in a WSL2 system.
# Providing the absolute location of the driver indicates its version
# number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to
# look up whether the driver was intended for "native" Linux.
if sys.platform == 'linux' and not failed:
pid = os.getpid()
mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps')
try:
with open(mapsfile) as f:
maps = f.read()
# It's difficult to predict all that might go wrong reading the maps
# file - in case various error conditions ensue (the file is not found,
# not readable, etc.) we use OSError to hopefully catch any of them.
except OSError:
# It's helpful to report that this went wrong to the user, but we
# don't set failed to True because this doesn't have any connection
# to actual CUDA functionality.
print(f'\tERROR: Could not open {mapsfile} to determine absolute '
'path to libcuda.so')
else:
# In this case we could read the maps, so we can report the
# relevant ones to the user
locations = set(s for s in maps.split() if 'libcuda.so' in s)
print('\tMapped libcuda.so paths:')
for location in locations:
print(f'\t\t{location}')
# Checks for dynamic libraries
libs = 'nvvm nvrtc cudart'.split()
for lib in libs:
path = get_cudalib(lib)
print('Finding {} from {}'.format(lib, _get_source_variable(lib)))
print('\tLocated at', path)
try:
print('\tTrying to open library', end='...')
open_cudalib(lib)
print('\tok')
except OSError as e:
print('\tERROR: failed to open %s:\n%s' % (lib, e))
failed = True
# Check for cudadevrt (the only static library)
lib = 'cudadevrt'
path = get_cudalib(lib, static=True)
print('Finding {} from {}'.format(lib, _get_source_variable(lib,
static=True)))
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
print('\tERROR: failed to find %s:\n%s' % (lib, e))
failed = True
# Check for libdevice
where = _get_source_variable('libdevice')
print(f'Finding libdevice from {where}')
path = get_libdevice()
print('\tLocated at', path)
try:
print('\tChecking library', end='...')
check_static_lib(path)
print('\tok')
except FileNotFoundError as e:
        print('\tERROR: failed to find libdevice:\n%s' % e)
failed = True
return not failed
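# To exercise the lookup manually (a sketch):
#   from numba.cuda.cudadrv.libs import test
#   test()  # prints path info; returns True only if everything was found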

View File

@@ -0,0 +1,20 @@
from numba.cuda.cudadrv import devices, driver
from numba.core.registry import cpu_target
def _calc_array_sizeof(ndim):
"""
Use the ABI size in the CPU target
"""
ctx = cpu_target.target_context
return ctx.calc_array_sizeof(ndim)
def ndarray_device_allocate_data(ary):
"""
Allocate gpu data buffer
"""
datasize = driver.host_memory_size(ary)
# allocate
gpu_data = devices.get_context().memalloc(datasize)
return gpu_data
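# Example (a sketch, assuming a usable CUDA context):
#   import numpy as np
#   ary = np.zeros(16, dtype=np.float64)
#   gpu_data = ndarray_device_allocate_data(ary)  # 128-byte device buffer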

View File

@@ -0,0 +1,260 @@
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
from enum import IntEnum
from numba.core import config
from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
NvrtcSupportError)
import functools
import os
import threading
import warnings
# Opaque handle for compilation unit
nvrtc_program = c_void_p
# Result code
nvrtc_result = c_int
class NvrtcResult(IntEnum):
NVRTC_SUCCESS = 0
NVRTC_ERROR_OUT_OF_MEMORY = 1
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
NVRTC_ERROR_INVALID_INPUT = 3
NVRTC_ERROR_INVALID_PROGRAM = 4
NVRTC_ERROR_INVALID_OPTION = 5
NVRTC_ERROR_COMPILATION = 6
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
NVRTC_ERROR_INTERNAL_ERROR = 11
_nvrtc_lock = threading.Lock()
class NvrtcProgram:
"""
A class for managing the lifetime of nvrtcProgram instances. Instances of
the class own an nvrtcProgram; when an instance is deleted, the underlying
nvrtcProgram is destroyed using the appropriate NVRTC API.
"""
def __init__(self, nvrtc, handle):
self._nvrtc = nvrtc
self._handle = handle
@property
def handle(self):
return self._handle
def __del__(self):
if self._handle:
self._nvrtc.destroy_program(self)
class NVRTC:
"""
Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
calls.
The sole instance of this class is a process-wide singleton, similar to the
NVVM interface. Initialization is protected by a lock and uses the standard
(for Numba) open_cudalib function to load the NVRTC library.
"""
_PROTOTYPES = {
# nvrtcResult nvrtcVersion(int *major, int *minor)
'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
# nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
# const char *src,
# const char *name,
# int numHeaders,
# const char * const *headers,
# const char * const *includeNames)
'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p,
c_int, POINTER(c_char_p), POINTER(c_char_p)),
# nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)),
# nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
# int numOptions,
# const char * const *options)
'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int,
POINTER(c_char_p)),
# nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
# size_t *cubinSizeRet);
'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
# size_t *logSizeRet);
'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program,
POINTER(c_size_t)),
# nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p),
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvrtc_lock:
if cls.__INSTANCE is None:
from numba.cuda.cudadrv.libs import open_cudalib
cls.__INSTANCE = inst = object.__new__(cls)
try:
lib = open_cudalib('nvrtc')
except OSError as e:
cls.__INSTANCE = None
raise NvrtcSupportError("NVRTC cannot be loaded") from e
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(lib, name)
func.restype = proto[0]
func.argtypes = proto[1:]
@functools.wraps(func)
def checked_call(*args, func=func, name=name):
error = func(*args)
if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
raise NvrtcCompilationError()
elif error != NvrtcResult.NVRTC_SUCCESS:
try:
error_name = NvrtcResult(error).name
except ValueError:
error_name = ('Unknown nvrtc_result '
f'(error code: {error})')
msg = f'Failed to call {name}: {error_name}'
raise NvrtcError(msg)
setattr(inst, name, checked_call)
return cls.__INSTANCE
def get_version(self):
"""
Get the NVRTC version as a tuple (major, minor).
"""
major = c_int()
minor = c_int()
self.nvrtcVersion(byref(major), byref(minor))
return major.value, minor.value
def create_program(self, src, name):
"""
Create an NVRTC program with managed lifetime.
"""
if isinstance(src, str):
src = src.encode()
if isinstance(name, str):
name = name.encode()
handle = nvrtc_program()
# The final three arguments are for passing the contents of headers -
# this is not supported, so there are 0 headers and the header names
# and contents are null.
self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
return NvrtcProgram(self, handle)
def compile_program(self, program, options):
"""
Compile an NVRTC program. Compilation may fail due to a user error in
the source; this function returns ``True`` if there is a compilation
error and ``False`` on success.
"""
# We hold a list of encoded options to ensure they can't be collected
# prior to the call to nvrtcCompileProgram
encoded_options = [opt.encode() for opt in options]
option_pointers = [c_char_p(opt) for opt in encoded_options]
c_options_type = (c_char_p * len(options))
c_options = c_options_type(*option_pointers)
try:
self.nvrtcCompileProgram(program.handle, len(options), c_options)
return False
except NvrtcCompilationError:
return True
def destroy_program(self, program):
"""
Destroy an NVRTC program.
"""
self.nvrtcDestroyProgram(byref(program.handle))
def get_compile_log(self, program):
"""
Get the compile log as a Python string.
"""
log_size = c_size_t()
self.nvrtcGetProgramLogSize(program.handle, byref(log_size))
log = (c_char * log_size.value)()
self.nvrtcGetProgramLog(program.handle, log)
return log.value.decode()
def get_ptx(self, program):
"""
Get the compiled PTX as a Python string.
"""
ptx_size = c_size_t()
self.nvrtcGetPTXSize(program.handle, byref(ptx_size))
ptx = (c_char * ptx_size.value)()
self.nvrtcGetPTX(program.handle, ptx)
return ptx.value.decode()
def compile(src, name, cc):
"""
Compile a CUDA C/C++ source to PTX for a given compute capability.
:param src: The source code to compile
:type src: str
:param name: The filename of the source (for information only)
:type name: str
:param cc: A tuple ``(major, minor)`` of the compute capability
:type cc: tuple
:return: The compiled PTX and compilation log
:rtype: tuple
"""
nvrtc = NVRTC()
program = nvrtc.create_program(src, name)
# Compilation options:
# - Compile for the current device's compute capability.
# - The CUDA include path is added.
# - Relocatable Device Code (rdc) is needed to prevent device functions
# being optimized away.
major, minor = cc
arch = f'--gpu-architecture=compute_{major}{minor}'
include = f'-I{config.CUDA_INCLUDE_PATH}'
cudadrv_path = os.path.dirname(os.path.abspath(__file__))
numba_cuda_path = os.path.dirname(cudadrv_path)
numba_include = f'-I{numba_cuda_path}'
options = [arch, include, numba_include, '-rdc', 'true']
# Compile the program
compile_error = nvrtc.compile_program(program, options)
# Get log from compilation
log = nvrtc.get_compile_log(program)
# If the compile failed, provide the log in an exception
if compile_error:
msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}')
raise NvrtcError(msg)
# Otherwise, if there's any content in the log, present it as a warning
if log:
msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
warnings.warn(msg)
ptx = nvrtc.get_ptx(program)
return ptx, log
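# Example usage (a sketch; assumes NVRTC is available and
# config.CUDA_INCLUDE_PATH is valid):
#   src = 'extern "C" __global__ void f(int *x) { *x = 1; }'
#   ptx, log = compile(src, 'f.cu', (7, 5))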

View File

@@ -0,0 +1,707 @@
"""
This is a direct translation of nvvm.h
"""
import logging
import re
import sys
import warnings
from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
c_char)
import threading
from llvmlite import ir
from .error import NvvmError, NvvmSupportError, NvvmWarning
from .libs import get_libdevice, open_libdevice, open_cudalib
from numba.core import cgutils, config
logger = logging.getLogger(__name__)
ADDRSPACE_GENERIC = 0
ADDRSPACE_GLOBAL = 1
ADDRSPACE_SHARED = 3
ADDRSPACE_CONSTANT = 4
ADDRSPACE_LOCAL = 5
# Opaque handle for compilation unit
nvvm_program = c_void_p
# Result code
nvvm_result = c_int
RESULT_CODE_NAMES = '''
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
'''.split()
for i, k in enumerate(RESULT_CODE_NAMES):
setattr(sys.modules[__name__], k, i)
# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
'v64:64:64-v128:128:128-n16:32:64')
def is_available():
"""
    Return whether libNVVM is available.
"""
try:
NVVM()
except NvvmSupportError:
return False
else:
return True
_nvvm_lock = threading.Lock()
class NVVM(object):
'''Process-wide singleton.
'''
_PROTOTYPES = {
# nvvmResult nvvmVersion(int *major, int *minor)
'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmCreateProgram(nvvmProgram *cu)
'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
# nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
# size_t size, const char *name)
'nvvmAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
# const char* buffer,
# size_t size,
# const char *name)
'nvvmLazyAddModuleToProgram': (
nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
# nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
# const char **options)
'nvvmCompileProgram': (
nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
# nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetCompiledResultSize': (
nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
# size_t *bufferSizeRet)
'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
# nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
# const char** options)
'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
POINTER(c_char_p))
}
# Singleton reference
__INSTANCE = None
def __new__(cls):
with _nvvm_lock:
if cls.__INSTANCE is None:
cls.__INSTANCE = inst = object.__new__(cls)
try:
inst.driver = open_cudalib('nvvm')
except OSError as e:
cls.__INSTANCE = None
errmsg = ("libNVVM cannot be found. Do `conda install "
"cudatoolkit`:\n%s")
raise NvvmSupportError(errmsg % e)
# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(inst.driver, name)
func.restype = proto[0]
func.argtypes = proto[1:]
setattr(inst, name, func)
return cls.__INSTANCE
def __init__(self):
ir_versions = self.get_ir_version()
self._majorIR = ir_versions[0]
self._minorIR = ir_versions[1]
self._majorDbg = ir_versions[2]
self._minorDbg = ir_versions[3]
self._supported_ccs = get_supported_ccs()
@property
def data_layout(self):
if (self._majorIR, self._minorIR) < (1, 8):
return _datalayout_original
else:
return _datalayout_i128
@property
def supported_ccs(self):
return self._supported_ccs
def get_version(self):
major = c_int()
minor = c_int()
err = self.nvvmVersion(byref(major), byref(minor))
self.check_error(err, 'Failed to get version.')
return major.value, minor.value
def get_ir_version(self):
majorIR = c_int()
minorIR = c_int()
majorDbg = c_int()
minorDbg = c_int()
err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
byref(majorDbg), byref(minorDbg))
self.check_error(err, 'Failed to get IR version.')
return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value
def check_error(self, error, msg, exit=False):
if error:
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
if exit:
print(exc)
sys.exit(1)
else:
raise exc
class CompilationUnit(object):
def __init__(self):
self.driver = NVVM()
self._handle = nvvm_program()
err = self.driver.nvvmCreateProgram(byref(self._handle))
self.driver.check_error(err, 'Failed to create CU')
def __del__(self):
driver = NVVM()
err = driver.nvvmDestroyProgram(byref(self._handle))
driver.check_error(err, 'Failed to destroy CU', exit=True)
def add_module(self, buffer):
"""
Add a module level NVVM IR to a compilation unit.
- The buffer should contain an NVVM module IR either in the bitcode
representation (LLVM3.0) or in the text representation.
"""
err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def lazy_add_module(self, buffer):
"""
Lazily add an NVVM IR module to a compilation unit.
The buffer should contain NVVM module IR either in the bitcode
representation or in the text representation.
"""
err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
len(buffer), None)
self.driver.check_error(err, 'Failed to add module')
def compile(self, **options):
"""Perform Compilation.
Compilation options are accepted as keyword arguments, with the
following considerations:
- Underscores (`_`) in option names are converted to dashes (`-`), to
match NVVM's option name format.
- Options that take a value will be emitted in the form
"-<name>=<value>".
- Booleans passed as option values will be converted to integers.
- Options which take no value (such as `-gen-lto`) should have a value
of `None` passed in and will be emitted in the form "-<name>".
For documentation on NVVM compilation options, see the CUDA Toolkit
Documentation:
https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
"""
def stringify_option(k, v):
k = k.replace('_', '-')
if v is None:
return f'-{k}'
if isinstance(v, bool):
v = int(v)
return f'-{k}={v}'
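        # For example (a sketch), compile(opt=3, ftz=True, gen_lto=None)
        # passes the options ['-opt=3', '-ftz=1', '-gen-lto'] to NVVM.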
options = [stringify_option(k, v) for k, v in options.items()]
c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
for x in options])
# verify
err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to verify\n')
# compile
err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
self._try_error(err, 'Failed to compile\n')
# get result
reslen = c_size_t()
err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
self._try_error(err, 'Failed to get size of compiled result.')
output_buffer = (c_char * reslen.value)()
err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
self._try_error(err, 'Failed to get compiled result.')
# get log
self.log = self.get_log()
if self.log:
warnings.warn(self.log, category=NvvmWarning)
return output_buffer[:]
def _try_error(self, err, msg):
self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))
def get_log(self):
reslen = c_size_t()
err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
self.driver.check_error(err, 'Failed to get compilation log size.')
if reslen.value > 1:
logbuf = (c_char * reslen.value)()
err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
self.driver.check_error(err, 'Failed to get compilation log.')
return logbuf.value.decode('utf8') # populate log attribute
return ''
COMPUTE_CAPABILITIES = (
(3, 5), (3, 7),
(5, 0), (5, 2), (5, 3),
(6, 0), (6, 1), (6, 2),
(7, 0), (7, 2), (7, 5),
(8, 0), (8, 6), (8, 7), (8, 9),
(9, 0)
)
# Maps CTK version -> (min supported cc, max supported cc) inclusive
CTK_SUPPORTED = {
(11, 2): ((3, 5), (8, 6)),
(11, 3): ((3, 5), (8, 6)),
(11, 4): ((3, 5), (8, 7)),
(11, 5): ((3, 5), (8, 7)),
(11, 6): ((3, 5), (8, 7)),
(11, 7): ((3, 5), (8, 7)),
(11, 8): ((3, 5), (9, 0)),
(12, 0): ((5, 0), (9, 0)),
(12, 1): ((5, 0), (9, 0)),
(12, 2): ((5, 0), (9, 0)),
(12, 3): ((5, 0), (9, 0)),
(12, 4): ((5, 0), (9, 0)),
}
def ccs_supported_by_ctk(ctk_version):
try:
# For supported versions, we look up the range of supported CCs
min_cc, max_cc = CTK_SUPPORTED[ctk_version]
return tuple([cc for cc in COMPUTE_CAPABILITIES
if min_cc <= cc <= max_cc])
except KeyError:
# For unsupported CUDA toolkit versions, all we can do is assume all
# non-deprecated versions we are aware of are supported.
return tuple([cc for cc in COMPUTE_CAPABILITIES
if cc >= config.CUDA_DEFAULT_PTX_CC])
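# For example, ccs_supported_by_ctk((11, 8)) yields every CC from (3, 5)
# through (9, 0), while an unknown (newer) toolkit version falls back to all
# known CCs at or above config.CUDA_DEFAULT_PTX_CC.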
def get_supported_ccs():
try:
from numba.cuda.cudadrv.runtime import runtime
cudart_version = runtime.get_version()
except: # noqa: E722
# We can't support anything if there's an error getting the runtime
# version (e.g. if it's not present or there's another issue)
_supported_cc = ()
return _supported_cc
# Ensure the minimum CTK version requirement is met
min_cudart = min(CTK_SUPPORTED)
if cudart_version < min_cudart:
_supported_cc = ()
ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
"required version.")
warnings.warn(unsupported_ver)
return _supported_cc
_supported_cc = ccs_supported_by_ctk(cudart_version)
return _supported_cc
def find_closest_arch(mycc):
"""
Given a compute capability, return the closest compute capability supported
by the CUDA toolkit.
:param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
:return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
"""
supported_ccs = NVVM().supported_ccs
if not supported_ccs:
msg = "No supported GPU compute capabilities found. " \
"Please check your cudatoolkit version matches your CUDA version."
raise NvvmSupportError(msg)
for i, cc in enumerate(supported_ccs):
if cc == mycc:
# Matches
return cc
elif cc > mycc:
# Exceeded
if i == 0:
# CC lower than supported
msg = "GPU compute capability %d.%d is not supported" \
"(requires >=%d.%d)" % (mycc + cc)
raise NvvmSupportError(msg)
else:
# return the previous CC
return supported_ccs[i - 1]
# CC higher than supported
return supported_ccs[-1] # Choose the highest
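# For example (assuming a toolkit supporting the CCs listed above),
# find_closest_arch((8, 1)) returns (8, 0): there is no exact match, so the
# closest capability not exceeding the device's is chosen.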
def get_arch_option(major, minor):
"""Matches with the closest architecture option
"""
if config.FORCE_CUDA_CC:
arch = config.FORCE_CUDA_CC
else:
arch = find_closest_arch((major, minor))
return 'compute_%d%d' % arch
MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
Please ensure you have CUDA Toolkit 11.2 or higher.
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
$ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
For CUDA 11, ``cudatoolkit`` is required:
$ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
'''
class LibDevice(object):
_cache_ = None
def __init__(self):
if self._cache_ is None:
if get_libdevice() is None:
raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
self._cache_ = open_libdevice()
self.bc = self._cache_
def get(self):
return self.bc
cas_nvvm = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501
# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%iptr = bitcast {T}* %ptr to {Ti}*
%old2 = load volatile {Ti}, {Ti}* %iptr
br label %attempt
attempt:
%old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
{CAS}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
%result = bitcast {Ti} %old to {T}
ret {T} %result
}}
""" # noqa: E501
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
done:
ret {T} %old
}}
""" # noqa: E501
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%ptrval = load volatile {T}, {T}* %ptr
; Return early when:
; - For nanmin / nanmax when val is a NaN
; - For min / max when val or ptr is a NaN
%early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
br i1 %early_return, label %done, label %lt_check
lt_check:
%dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
; Continue attempts if dold less or greater than val (depending on whether min or max)
; or if dold is NaN (for nanmin / nanmax)
%cmp = fcmp {OP} {T} %dold, %val
br i1 %cmp, label %attempt, label %done
attempt:
; Attempt to swap in the value
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%new = bitcast {T} %val to {Ti}
{CAS}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
done:
ret {T} %ptrval
}}
""" # noqa: E501
def ir_cas(Ti):
return cas_nvvm.format(Ti=Ti)
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_binary_template.format(**params)
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL,
FUNC=FUNC, CAS=ir_cas(Ti))
return ir_numba_atomic_minmax_template.format(**params)
def ir_numba_atomic_inc(T, Tu):
return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
def ir_numba_atomic_dec(T, Tu):
return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
def llvm_replace(llvmir):
replacements = [
('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
ir_numba_atomic_inc(T='i64', Tu='u64')),
('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
ir_numba_atomic_dec(T='i64', Tu='u64')),
('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('immarg', '')
]
for decl, fn in replacements:
llvmir = llvmir.replace(decl, fn)
llvmir = llvm140_to_70_ir(llvmir)
return llvmir
def compile_ir(llvmir, **opts):
if isinstance(llvmir, str):
llvmir = [llvmir]
if opts.pop('fastmath', False):
opts.update({
'ftz': True,
'fma': True,
'prec_div': False,
'prec_sqrt': False,
})
cu = CompilationUnit()
libdevice = LibDevice()
for mod in llvmir:
mod = llvm_replace(mod)
cu.add_module(mod.encode('utf8'))
cu.lazy_add_module(libdevice.get())
return cu.compile(**opts)
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
def llvm140_to_70_ir(ir):
"""
    Convert LLVM 14.0 IR to LLVM 7.0 IR (the version consumed by NVVM).
"""
buf = []
for line in ir.splitlines():
if line.startswith('attributes #'):
# Remove function attributes unsupported by LLVM 7.0
m = re_attributes_def.match(line)
attrs = m.group(1).split()
attrs = ' '.join(a for a in attrs if a != 'willreturn')
line = line.replace(m.group(1), attrs)
buf.append(line)
return '\n'.join(buf)
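# For example, a line such as
#   attributes #0 = { nounwind willreturn }
# is rewritten as
#   attributes #0 = { nounwind }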
def set_cuda_kernel(function):
"""
Mark a function as a CUDA kernel. Kernels have the following requirements:
- Metadata that marks them as a kernel.
- Addition to the @llvm.used list, so that they will not be discarded.
- The noinline attribute is not permitted, because this causes NVVM to emit
a warning, which counts as failing IR verification.
Presently it is assumed that there is one kernel per module, which holds
for Numba-jitted functions. If this changes in future or this function is
to be used externally, this function may need modification to add to the
@llvm.used list rather than creating it.
"""
module = function.module
# Add kernel metadata
mdstr = ir.MetaDataString(module, "kernel")
mdvalue = ir.Constant(ir.IntType(32), 1)
md = module.add_metadata((function, mdstr, mdvalue))
nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
nmd.add(md)
# Create the used list
ptrty = ir.IntType(8).as_pointer()
usedty = ir.ArrayType(ptrty, 1)
fnptr = function.bitcast(ptrty)
llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
llvm_used.linkage = 'appending'
llvm_used.section = 'llvm.metadata'
llvm_used.initializer = ir.Constant(usedty, [fnptr])
# Remove 'noinline' if it is present.
function.attributes.discard('noinline')
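# After this transformation the module contains, illustratively, for a
# hypothetical kernel @my_kernel:
#   @llvm.used = appending global [1 x i8*] [...], section "llvm.metadata"
#   !nvvm.annotations = !{!0}
#   !0 = !{void ()* @my_kernel, !"kernel", i32 1}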
def add_ir_version(mod):
"""Add NVVM IR version to module"""
# We specify the IR version to match the current NVVM's IR version
i32 = ir.IntType(32)
ir_versions = [i32(v) for v in NVVM().get_ir_version()]
md_ver = mod.add_metadata(ir_versions)
mod.add_named_metadata('nvvmir.version', md_ver)

View File

@@ -0,0 +1,10 @@
"""
Declarations of the Runtime API functions.
"""
from ctypes import c_int, POINTER
API_PROTOTYPES = {
# cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion )
'cudaRuntimeGetVersion': (c_int, POINTER(c_int)),
}
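# Each entry maps a Runtime API name to (restype, *argtypes); the Runtime
# object in runtime.py uses these prototypes to bind the ctypes functions
# lazily on first attribute access.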

View File

@@ -0,0 +1,142 @@
"""
CUDA Runtime wrapper.
This provides a very minimal set of bindings, since the Runtime API is not
really used in Numba except for querying the Runtime version.
"""
import ctypes
import functools
import sys
from numba.core import config
from numba.cuda.cudadrv.driver import ERROR_MAP, make_logger
from numba.cuda.cudadrv.error import CudaSupportError, CudaRuntimeError
from numba.cuda.cudadrv.libs import open_cudalib
from numba.cuda.cudadrv.rtapi import API_PROTOTYPES
from numba.cuda.cudadrv import enums
class CudaRuntimeAPIError(CudaRuntimeError):
"""
Raised when there is an error accessing a C API from the CUDA Runtime.
"""
def __init__(self, code, msg):
self.code = code
self.msg = msg
super().__init__(code, msg)
def __str__(self):
return "[%s] %s" % (self.code, self.msg)
class Runtime:
"""
Runtime object that lazily binds runtime API functions.
"""
def __init__(self):
self.is_initialized = False
def _initialize(self):
# lazily initialize logger
global _logger
_logger = make_logger()
if config.DISABLE_CUDA:
msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
"in the environment, or because CUDA is unsupported on "
"32-bit systems.")
raise CudaSupportError(msg)
self.lib = open_cudalib('cudart')
self.is_initialized = True
def __getattr__(self, fname):
# First request of a runtime API function
try:
proto = API_PROTOTYPES[fname]
except KeyError:
raise AttributeError(fname)
restype = proto[0]
argtypes = proto[1:]
if not self.is_initialized:
self._initialize()
# Find function in runtime library
libfn = self._find_api(fname)
libfn.restype = restype
libfn.argtypes = argtypes
safe_call = self._wrap_api_call(fname, libfn)
setattr(self, fname, safe_call)
return safe_call
def _wrap_api_call(self, fname, libfn):
@functools.wraps(libfn)
def safe_cuda_api_call(*args):
_logger.debug('call runtime api: %s', libfn.__name__)
retcode = libfn(*args)
self._check_error(fname, retcode)
return safe_cuda_api_call
def _check_error(self, fname, retcode):
if retcode != enums.CUDA_SUCCESS:
errname = ERROR_MAP.get(retcode, "cudaErrorUnknown")
msg = "Call to %s results in %s" % (fname, errname)
_logger.error(msg)
raise CudaRuntimeAPIError(retcode, msg)
def _find_api(self, fname):
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Delay missing function error to use
def absent_function(*args, **kws):
msg = "runtime missing function: %s."
raise CudaRuntimeError(msg % fname)
setattr(self, fname, absent_function)
return absent_function
def get_version(self):
"""
Returns the CUDA Runtime version as a tuple (major, minor).
"""
rtver = ctypes.c_int()
self.cudaRuntimeGetVersion(ctypes.byref(rtver))
# The version is encoded as (1000 * major) + (10 * minor)
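        # For example, a raw value of 11080 decodes to (11, 8).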
major = rtver.value // 1000
minor = (rtver.value - (major * 1000)) // 10
return (major, minor)
def is_supported_version(self):
"""
Returns True if the CUDA Runtime is a supported version.
"""
return self.get_version() in self.supported_versions
@property
def supported_versions(self):
"""A tuple of all supported CUDA toolkit versions. Versions are given in
the form ``(major_version, minor_version)``."""
if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64:
# Only 64-bit Linux and Windows are supported
return ()
return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
(11, 7))
runtime = Runtime()
def get_version():
"""
Return the runtime version as a tuple of (major, minor)
"""
return runtime.get_version()