This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,38 @@
import sys
from .api import *
from .vector_types import vector_types
from .reduction import Reduce
from .cudadrv.devicearray import (device_array, device_array_like, pinned,
pinned_array, pinned_array_like,
mapped_array, to_device, auto_device)
from .cudadrv import devicearray
from .cudadrv.devices import require_context, gpus
from .cudadrv.devices import get_context as current_context
from .cudadrv.runtime import runtime
from numba.core import config
# ``cuda.reduce`` is the Reduce class, mirroring the real CUDA target's API.
reduce = Reduce

# Register simulated vector types as module level variables
for name, svty in vector_types.items():
    setattr(sys.modules[__name__], name, svty)
    for alias in svty.aliases:
        setattr(sys.modules[__name__], alias, svty)
# Avoid leaking the loop variables into the module namespace.
del vector_types, name, svty, alias

# Ensure that any user code attempting to import cudadrv etc. gets the
# simulator's version and not the real version if the simulator is enabled.
if config.ENABLE_CUDASIM:
    import sys
    from numba.cuda.simulator import cudadrv
    sys.modules['numba.cuda.cudadrv'] = cudadrv
    sys.modules['numba.cuda.cudadrv.devicearray'] = cudadrv.devicearray
    sys.modules['numba.cuda.cudadrv.devices'] = cudadrv.devices
    sys.modules['numba.cuda.cudadrv.driver'] = cudadrv.driver
    sys.modules['numba.cuda.cudadrv.runtime'] = cudadrv.runtime
    sys.modules['numba.cuda.cudadrv.drvapi'] = cudadrv.drvapi
    sys.modules['numba.cuda.cudadrv.error'] = cudadrv.error
    sys.modules['numba.cuda.cudadrv.nvvm'] = cudadrv.nvvm

    from . import compiler
    sys.modules['numba.cuda.compiler'] = compiler

View File

@@ -0,0 +1,110 @@
'''
Contains CUDA API functions
'''
# Imports here bring together parts of the API from other modules, so some of
# them appear unused.
from contextlib import contextmanager
from .cudadrv.devices import require_context, reset, gpus # noqa: F401
from .kernel import FakeCUDAKernel
from numba.core.sigutils import is_signature
from warnings import warn
from ..args import In, Out, InOut # noqa: F401
def select_device(dev=0):
    """Pretend to select a device; only device 0 exists under simulation."""
    assert dev == 0, 'Only a single device supported by the simulator'


def is_float16_supported():
    """float16 operations are always available in the simulator."""
    return True
class stream(object):
    '''
    Simulated CUDA stream. All simulator execution happens synchronously,
    so synchronization is a no-op.
    '''

    @contextmanager
    def auto_synchronize(self):
        # Nothing to wait for; hand control straight back to the caller.
        yield

    def synchronize(self):
        # Execution is already synchronous.
        pass


def synchronize():
    """Device-wide synchronization: a no-op in the simulator."""
    pass


def close():
    """Mark the (single) simulated GPU context as closed."""
    gpus.closed = True


def declare_device(*args, **kwargs):
    """Stub for declaring an external device function; ignored."""
    pass
def detect():
    """Print a summary describing the single simulated device."""
    print('Found 1 CUDA devices')
    print('id %d %20s %40s' % (0, 'SIMULATOR', '[SUPPORTED]'))
    print('%40s: 5.0' % 'compute capability')


def list_devices():
    """Return the simulated device list."""
    return gpus
# Events
class Event(object):
    '''
    Simulated event. The event API is supported for compatibility, but no
    timing information is recorded because all execution is synchronous.
    '''

    def record(self, stream=0):
        pass

    def wait(self, stream=0):
        pass

    def synchronize(self):
        pass

    def elapsed_time(self, event):
        # Timing cannot be meaningful when everything runs synchronously.
        warn('Simulator timings are bogus')
        return 0.0


# Lower-case alias matching the public numba.cuda API.
event = Event
def jit(func_or_sig=None, device=False, debug=False, argtypes=None,
        inline=False, restype=None, fastmath=False, link=None,
        boundscheck=None, opt=True, cache=None
        ):
    """
    Simulated version of ``numba.cuda.jit``.

    ``argtypes``, ``inline``, ``restype``, ``opt`` and ``cache`` are accepted
    only for API compatibility and have no effect under simulation.

    :raises NotImplementedError: if ``boundscheck`` is truthy or ``link`` is
        given — neither is supported by the simulator.
    """
    # Here for API compatibility
    if boundscheck:
        raise NotImplementedError("bounds checking is not supported for CUDA")

    if link is not None:
        raise NotImplementedError('Cannot link PTX in the simulator')

    # Check for first argument specifying types - in that case the
    # decorator is not being passed a function
    if (func_or_sig is None or is_signature(func_or_sig)
            or isinstance(func_or_sig, list)):
        def jitwrapper(fn):
            return FakeCUDAKernel(fn,
                                  device=device,
                                  fastmath=fastmath,
                                  debug=debug)
        return jitwrapper
    # Fix: forward fastmath on the direct-decoration path too, consistently
    # with jitwrapper above (it was previously dropped here).
    return FakeCUDAKernel(func_or_sig, device=device, debug=debug,
                          fastmath=fastmath)
@contextmanager
def defer_cleanup():
    """Context manager for deferring cleanup: no effect in the simulator."""
    yield

View File

@@ -0,0 +1,9 @@
'''
The compiler is not implemented in the simulator. This module provides a stub
to allow tests to import successfully.
'''
# Each public compiler entry point is replaced by None. The name ``compile``
# intentionally mirrors the real module's API (it shadows the builtin here).
compile = None
compile_for_current_device = None
compile_ptx = None
compile_ptx_for_current_device = None

View File

@@ -0,0 +1,2 @@
from numba.cuda.simulator.cudadrv import (devicearray, devices, driver, drvapi,
error, nvvm)

View File

@@ -0,0 +1,436 @@
'''
The Device Array API is not implemented in the simulator. This module provides
stubs to allow tests to import correctly.
'''
from contextlib import contextmanager
from numba.np.numpy_support import numpy_version
import numpy as np
# Device-record support is unimplemented in the simulator; stubbed to None.
DeviceRecord = None
from_record_like = None

# Error message raised when an array's underlying buffer is not contiguous.
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
                            "be transferred as a single memory region. Please "
                            "ensure contiguous buffer with numpy "
                            ".ascontiguousarray()")
class FakeShape(tuple):
    '''
    A tuple subclass standing in for a device array's shape. Unlike a NumPy
    shape tuple, negative indices are rejected, matching the behaviour of
    shapes in CUDA Python.
    '''

    def __getitem__(self, k):
        # Only negative integer indices are rejected; slices and
        # non-negative indices behave like a plain tuple.
        if isinstance(k, int) and k < 0:
            raise IndexError('tuple index out of range')
        return super().__getitem__(k)
class FakeWithinKernelCUDAArray(object):
    '''
    Created to emulate the behavior of arrays within kernels, where either
    array.item or array['item'] is valid (that is, give all structured
    arrays `numpy.recarray`-like semantics). This behaviour does not follow
    the semantics of Python and NumPy with non-jitted code, and will be
    deprecated and removed.
    '''

    def __init__(self, item):
        # `item` is the FakeCUDAArray being wrapped. Stored via __dict__
        # directly to avoid triggering our own __setattr__ below.
        assert isinstance(item, FakeCUDAArray)
        self.__dict__['_item'] = item

    def __wrap_if_fake(self, item):
        # Re-wrap any FakeCUDAArray result so that further attribute/item
        # access keeps the in-kernel semantics.
        if isinstance(item, FakeCUDAArray):
            return FakeWithinKernelCUDAArray(item)
        else:
            return item

    def __getattr__(self, attrname):
        # Prefer real ndarray attributes (size, dtype, ...); otherwise treat
        # the name as a structured-array field lookup.
        try:
            if attrname in dir(self._item._ary):  # For e.g. array size.
                return self.__wrap_if_fake(getattr(self._item._ary, attrname))
            else:
                return self.__wrap_if_fake(self._item.__getitem__(attrname))
        except Exception as e:
            # Normalise failures to AttributeError so hasattr() etc. work.
            # NOTE(review): when `e` already is an AttributeError it is
            # swallowed and None is returned instead of re-raised — confirm
            # this is intended.
            if not isinstance(e, AttributeError):
                raise AttributeError(attrname) from e

    def __setattr__(self, nm, val):
        # Attribute assignment writes the named field of the wrapped array.
        self._item.__setitem__(nm, val)

    def __getitem__(self, idx):
        return self.__wrap_if_fake(self._item.__getitem__(idx))

    def __setitem__(self, idx, val):
        self._item.__setitem__(idx, val)

    def __len__(self):
        return len(self._item)

    def __array_ufunc__(self, ufunc, method, *args, **kwargs):
        # ufuncs can only be called directly on instances of numpy.ndarray (not
        # things that implement its interfaces, like the FakeCUDAArray or
        # FakeWithinKernelCUDAArray). For other objects, __array_ufunc__ is
        # called when they are arguments to ufuncs, to provide an opportunity
        # to somehow implement the ufunc. Since the FakeWithinKernelCUDAArray
        # is just a thin wrapper over an ndarray, we can implement all ufuncs
        # by passing the underlying ndarrays to a call to the intended ufunc.
        call = getattr(ufunc, method)

        def convert_fakes(obj):
            if isinstance(obj, FakeWithinKernelCUDAArray):
                obj = obj._item._ary
            return obj

        out = kwargs.get('out')
        if out:
            kwargs['out'] = tuple(convert_fakes(o) for o in out)
        args = tuple(convert_fakes(a) for a in args)
        return call(*args, **kwargs)
class FakeCUDAArray(object):
    '''
    Implements the interface of a DeviceArray/DeviceRecord, but mostly just
    wraps a NumPy array.
    '''

    __cuda_ndarray__ = True  # There must be gpu_data attribute

    def __init__(self, ary, stream=0):
        # ary: the wrapped NumPy array. stream is stored for API parity only.
        self._ary = ary
        self.stream = stream

    @property
    def alloc_size(self):
        # The simulated allocation is exactly the NumPy buffer.
        return self._ary.nbytes

    @property
    def nbytes(self):
        # return nbytes -- FakeCUDAArray is a wrapper around NumPy
        return self._ary.nbytes

    def __getattr__(self, attrname):
        # Delegate unknown attributes to the wrapped ndarray.
        try:
            attr = getattr(self._ary, attrname)
            return attr
        except AttributeError as e:
            msg = "Wrapped array has no attribute '%s'" % attrname
            raise AttributeError(msg) from e

    def bind(self, stream=0):
        # New wrapper over the same data, bound to `stream`.
        return FakeCUDAArray(self._ary, stream)

    @property
    def T(self):
        return self.transpose()

    def transpose(self, axes=None):
        return FakeCUDAArray(np.transpose(self._ary, axes=axes))

    def __getitem__(self, idx):
        # Scalars pass through unwrapped; sub-arrays and records stay
        # wrapped so device-array semantics are preserved.
        ret = self._ary.__getitem__(idx)
        if type(ret) not in [np.ndarray, np.void]:
            return ret
        else:
            return FakeCUDAArray(ret, stream=self.stream)

    def __setitem__(self, idx, val):
        return self._ary.__setitem__(idx, val)

    def copy_to_host(self, ary=None, stream=0):
        # Copy device data into `ary` (freshly allocated if None) and
        # return the host array.
        if ary is None:
            ary = np.empty_like(self._ary)
        else:
            check_array_compatibility(self, ary)
        np.copyto(ary, self._ary)
        return ary

    def copy_to_device(self, ary, stream=0):
        '''
        Copy from the provided array into this array.

        This may be less forgiving than the CUDA Python implementation, which
        will copy data up to the length of the smallest of the two arrays,
        whereas this expects the size of the arrays to be equal.
        '''
        sentry_contiguous(self)
        self_core, ary_core = array_core(self), array_core(ary)
        if isinstance(ary, FakeCUDAArray):
            sentry_contiguous(ary)
            check_array_compatibility(self_core, ary_core)
        else:
            # Coerce the host source to match this array's memory order.
            ary_core = np.array(
                ary_core,
                order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
                subok=True,
                copy=False if numpy_version < (2, 0) else None)
            check_array_compatibility(self_core, ary_core)
        np.copyto(self_core._ary, ary_core)

    @property
    def shape(self):
        # FakeShape rejects negative indices, like real device array shapes.
        return FakeShape(self._ary.shape)

    def ravel(self, *args, **kwargs):
        return FakeCUDAArray(self._ary.ravel(*args, **kwargs))

    def reshape(self, *args, **kwargs):
        return FakeCUDAArray(self._ary.reshape(*args, **kwargs))

    def view(self, *args, **kwargs):
        return FakeCUDAArray(self._ary.view(*args, **kwargs))

    def is_c_contiguous(self):
        return self._ary.flags.c_contiguous

    def is_f_contiguous(self):
        return self._ary.flags.f_contiguous

    def __str__(self):
        return str(self._ary)

    def __repr__(self):
        return repr(self._ary)

    def __len__(self):
        return len(self._ary)

    # TODO: Add inplace, bitwise, unary magic methods
    # (or maybe inherit this class from numpy)?

    # Comparison and arithmetic operators delegate to NumPy and re-wrap the
    # result so kernel code keeps seeing FakeCUDAArrays.
    def __eq__(self, other):
        return FakeCUDAArray(self._ary == other)

    def __ne__(self, other):
        return FakeCUDAArray(self._ary != other)

    def __lt__(self, other):
        return FakeCUDAArray(self._ary < other)

    def __le__(self, other):
        return FakeCUDAArray(self._ary <= other)

    def __gt__(self, other):
        return FakeCUDAArray(self._ary > other)

    def __ge__(self, other):
        return FakeCUDAArray(self._ary >= other)

    def __add__(self, other):
        return FakeCUDAArray(self._ary + other)

    def __sub__(self, other):
        return FakeCUDAArray(self._ary - other)

    def __mul__(self, other):
        return FakeCUDAArray(self._ary * other)

    def __floordiv__(self, other):
        return FakeCUDAArray(self._ary // other)

    def __truediv__(self, other):
        return FakeCUDAArray(self._ary / other)

    def __mod__(self, other):
        return FakeCUDAArray(self._ary % other)

    def __pow__(self, other):
        return FakeCUDAArray(self._ary ** other)

    def split(self, section, stream=0):
        # Split along axis 0 into chunks of length `section`.
        return [
            FakeCUDAArray(a)
            for a in np.split(self._ary, range(section, len(self), section))
        ]
def array_core(ary):
"""
Extract the repeated core of a broadcast array.
Broadcast arrays are by definition non-contiguous due to repeated
dimensions, i.e., dimensions with stride 0. In order to ascertain memory
contiguity and copy the underlying data from such arrays, we must create
a view without the repeated dimensions.
"""
if not ary.strides or not ary.size:
return ary
core_index = []
for stride in ary.strides:
core_index.append(0 if stride == 0 else slice(None))
return ary[tuple(core_index)]
def is_contiguous(ary):
    """
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    """
    expected = ary.dtype.itemsize
    # Walk dimensions innermost-first, skipping broadcast (stride 0) and
    # unit-extent dimensions.
    for extent, stride in zip(ary.shape[::-1], ary.strides[::-1]):
        if extent > 1 and stride != 0:
            if stride != expected:
                return False
            expected *= extent
    return True
def sentry_contiguous(ary):
    """Raise ValueError when `ary`'s non-broadcast core is not contiguous."""
    flags = array_core(ary).flags
    if not (flags['C_CONTIGUOUS'] or flags['F_CONTIGUOUS']):
        raise ValueError(errmsg_contiguous_buffer)
def check_array_compatibility(ary1, ary2):
    """
    Check that two arrays agree on dtype, and on shape and strides after
    squeezing out unit dimensions; raise TypeError / ValueError otherwise.
    """
    sq1, sq2 = ary1.squeeze(), ary2.squeeze()
    if ary1.dtype != ary2.dtype:
        raise TypeError('incompatible dtype: %s vs. %s' %
                        (ary1.dtype, ary2.dtype))
    if sq1.shape != sq2.shape:
        raise ValueError('incompatible shape: %s vs. %s' %
                         (ary1.shape, ary2.shape))
    if sq1.strides != sq2.strides:
        raise ValueError('incompatible strides: %s vs. %s' %
                         (ary1.strides, ary2.strides))
def to_device(ary, stream=0, copy=True, to=None):
    """
    Copy `ary` to the simulated device.

    When `to` is None, a new FakeCUDAArray backed by a copy of `ary`'s core
    data is returned; otherwise the data is copied into `to`, which is
    returned.

    :raises ValueError: if the array's core buffer is not contiguous.
    """
    ary = np.array(ary,
                   copy=False if numpy_version < (2, 0) else None,
                   subok=True)
    sentry_contiguous(ary)
    if to is None:
        # datetime64/timedelta64 ('M'/'m') are backed by an int64 view —
        # presumably to sidestep buffer-protocol limitations; TODO confirm.
        buffer_dtype = np.int64 if ary.dtype.char in 'Mm' else ary.dtype
        return FakeCUDAArray(
            np.ndarray(
                buffer=np.copy(array_core(ary)).view(buffer_dtype),
                dtype=ary.dtype,
                shape=ary.shape,
                strides=ary.strides,
            ).view(type=type(ary)),
        )
    else:
        to.copy_to_device(ary, stream=stream)
        # Fix: return the destination so callers always receive the device
        # array (previously this branch implicitly returned None).
        return to
@contextmanager
def pinned(arg):
    """Pinned-memory context manager: no effect under simulation."""
    yield


def mapped_array(*args, **kwargs):
    """Create a 'mapped' array; identical to device_array in the simulator."""
    # Drop host-mapping options that have no simulator equivalent.
    for ignored in ('portable', 'wc'):
        kwargs.pop(ignored, None)
    return device_array(*args, **kwargs)
def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
    """Allocate a 'pinned' host array — simply a regular ndarray here."""
    return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order)


def managed_array(shape, dtype=np.float64, strides=None, order='C'):
    """Allocate a 'managed' array — simply a regular ndarray here."""
    return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order)
def device_array(*args, **kwargs):
    """Allocate a simulated device array (an uninitialised FakeCUDAArray)."""
    stream = kwargs.pop('stream', 0)
    return FakeCUDAArray(np.ndarray(*args, **kwargs), stream=stream)
def _contiguous_strides_like_array(ary):
"""
Given an array, compute strides for a new contiguous array of the same
shape.
"""
# Don't recompute strides if the default strides will be sufficient to
# create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
return None
# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.
# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = [ x for x in enumerate(ary.strides) ]
strideperm.sort(key=lambda x: x[1])
# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
return tuple(strides)
def _order_like_array(ary):
if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
return 'F'
else:
return 'C'
def device_array_like(ary, stream=0):
    """Allocate a device array with the same shape/dtype/layout as `ary`."""
    return device_array(shape=ary.shape, dtype=ary.dtype,
                        strides=_contiguous_strides_like_array(ary),
                        order=_order_like_array(ary))


def pinned_array_like(ary):
    """Allocate a pinned array with the same shape/dtype/layout as `ary`."""
    return pinned_array(shape=ary.shape, dtype=ary.dtype,
                        strides=_contiguous_strides_like_array(ary),
                        order=_order_like_array(ary))
def auto_device(ary, stream=0, copy=True):
    """
    Ensure `ary` lives on the 'device', returning ``(devary, converted)``
    where `converted` indicates whether a transfer was performed.
    """
    if isinstance(ary, FakeCUDAArray):
        return ary, False

    # np.void (record) scalars pass through untouched; everything else is
    # coerced to an ndarray first.
    if not isinstance(ary, np.void):
        ary = np.array(
            ary,
            copy=False if numpy_version < (2, 0) else None,
            subok=True)
    return to_device(ary, stream, copy), True
def is_cuda_ndarray(obj):
    """Return the object's ``__cuda_ndarray__`` flag (False if absent)."""
    return getattr(obj, '__cuda_ndarray__', False)


def verify_cuda_ndarray_interface(obj):
    """Verify that `obj` exposes the CUDA ndarray interface."""
    require_cuda_ndarray(obj)

    def requires_attr(attr, typ):
        # Missing attribute and wrong type produce distinct errors.
        if not hasattr(obj, attr):
            raise AttributeError(attr)
        if not isinstance(getattr(obj, attr), typ):
            raise AttributeError('%s must be of type %s' % (attr, typ))

    requires_attr('shape', tuple)
    requires_attr('strides', tuple)
    requires_attr('dtype', np.dtype)
    requires_attr('size', int)


def require_cuda_ndarray(obj):
    """Raise ValueError if ``is_cuda_ndarray(obj)`` evaluates False."""
    if not is_cuda_ndarray(obj):
        raise ValueError('require an cuda ndarray object')

View File

@@ -0,0 +1,117 @@
import numpy as np
from collections import namedtuple
_MemoryInfo = namedtuple("_MemoryInfo", "free,total")
_SIMULATOR_CC = (5, 2)
class FakeCUDADevice:
def __init__(self):
self.uuid = 'GPU-00000000-0000-0000-0000-000000000000'
@property
def compute_capability(self):
return _SIMULATOR_CC
class FakeCUDAContext:
    '''
    This stub implements functionality only for simulating a single GPU
    at the moment.
    '''

    def __init__(self, device_id):
        self._device_id = device_id
        self._device = FakeCUDADevice()

    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __str__(self):
        return "<Managed Device {}>".format(self.id)

    @property
    def id(self):
        return self._device_id

    @property
    def device(self):
        return self._device

    @property
    def compute_capability(self):
        return _SIMULATOR_CC

    def reset(self):
        pass

    def get_memory_info(self):
        """
        Cross-platform free / total host memory is hard without external
        dependencies, e.g. `psutil` - so return infinite memory to maintain API
        type compatibility
        """
        return _MemoryInfo(free=float('inf'), total=float('inf'))

    def memalloc(self, sz):
        """
        Allocates memory on the simulated device
        At present, there is no division between simulated
        host memory and simulated device memory.
        """
        return np.ndarray(sz, dtype='u1')

    def memhostalloc(self, sz, mapped=False, portable=False, wc=False):
        '''Allocates memory on the host'''
        return self.memalloc(sz)
class FakeDeviceList:
    '''
    This stub implements a device list containing a single GPU. It also
    keeps track of the GPU status, i.e. whether the context is closed or not,
    which may have been set by the user calling reset()
    '''

    def __init__(self):
        self.lst = (FakeCUDAContext(0),)
        self.closed = False

    def __getitem__(self, devnum):
        # Accessing a device re-opens the list.
        self.closed = False
        return self.lst[devnum]

    def __str__(self):
        return ', '.join(str(d) for d in self.lst)

    def __iter__(self):
        return iter(self.lst)

    def __len__(self):
        return len(self.lst)

    @property
    def current(self):
        # The active context, or None when the list is closed.
        return None if self.closed else self.lst[0]
# The single, module-level device list shared by the whole simulator.
gpus = FakeDeviceList()


def reset():
    # NOTE(review): gpus[0] returns the FakeCUDAContext (and, as a side
    # effect of __getitem__, sets gpus.closed back to False); the `closed`
    # attribute is then set on the context, not on the list — confirm this
    # is the intended behaviour.
    gpus[0].closed = True
def get_context(devnum=0):
    """Return a fresh FakeCUDAContext for device `devnum`."""
    return FakeCUDAContext(devnum)


def require_context(func):
    '''
    In the simulator, a context is always "available", so this is a no-op.
    '''
    return func

View File

@@ -0,0 +1,62 @@
'''
Most of the driver API is unsupported in the simulator, but some stubs are
provided to allow tests to import correctly.
'''
def device_memset(dst, val, size, stream=0):
    """Set the first `size` bytes of `dst` to the byte value `val`."""
    dst.view('u1')[:size].fill(bytes([val])[0])


def host_to_device(dst, src, size, stream=0):
    """Copy `size` bytes from host array `src` into device array `dst`."""
    dst.view('u1')[:size] = src.view('u1')[:size]


def device_to_host(dst, src, size, stream=0):
    """Device-to-host copy; identical to host_to_device in the simulator."""
    host_to_device(dst, src, size)


def device_memory_size(obj):
    """Size in bytes of an array's data."""
    return obj.itemsize * obj.size


def device_to_device(dst, src, size, stream=0):
    """Device-to-device copy; identical to host_to_device in the simulator."""
    host_to_device(dst, src, size)
class FakeDriver(object):
    """Driver stub reporting exactly one simulated device."""

    def get_device_count(self):
        return 1


driver = FakeDriver()


class Linker:
    """Linker stub; linking is unsupported but the interface must exist."""

    @classmethod
    def new(cls, max_registers=0, lineinfo=False, cc=None):
        return Linker()

    @property
    def lto(self):
        # Link-time optimisation is never available under simulation.
        return False
class LinkerError(RuntimeError):
    """Raised for link failures."""


class NvrtcError(RuntimeError):
    """Raised for NVRTC compilation failures."""


class CudaAPIError(RuntimeError):
    """Raised for CUDA driver API failures."""


def launch_kernel(*args, **kwargs):
    """Direct kernel launches are unsupported under simulation."""
    msg = 'Launching kernels directly is not supported in the simulator'
    raise RuntimeError(msg)


# The simulator always behaves like the ctypes (non-NVIDIA) binding.
USE_NV_BINDING = False

View File

@@ -0,0 +1,4 @@
'''
drvapi is not implemented in the simulator, but this module exists to allow
tests to import correctly.
'''

View File

@@ -0,0 +1,4 @@
# Dummy arrays are not implemented in the simulator. This file allows the dummy
# array tests to be imported, but they are skipped on the simulator.
# The name exists purely so imports resolve.
Array = None

View File

@@ -0,0 +1,6 @@
class CudaSupportError(RuntimeError):
    """Raised when CUDA support is unavailable or misconfigured."""
    pass


class NvrtcError(Exception):
    """Raised for NVRTC (runtime compilation) errors."""
    pass

View File

@@ -0,0 +1,2 @@
def check_static_lib(lib):
    """Static libraries are never linkable under simulation: always raise."""
    raise FileNotFoundError('Linking libraries not supported by cudasim')

View File

@@ -0,0 +1,29 @@
'''
NVVM is not supported in the simulator, but stubs are provided to allow tests
to import correctly.
'''
class NvvmSupportError(ImportError):
    """Raised whenever NVVM functionality is requested under simulation."""
    pass


class NVVM(object):
    """Stub NVVM interface; instantiation always fails in the simulator."""

    def __init__(self):
        raise NvvmSupportError('NVVM not supported in the simulator')


# Entry points that exist only so that imports resolve.
CompilationUnit = None
compile_ir = None
set_cuda_kernel = None
get_arch_option = None
LibDevice = None
NvvmError = None


def is_available():
    """NVVM is never available under simulation."""
    return False


def get_supported_ccs():
    """The stub supports no compute capabilities."""
    return ()

View File

@@ -0,0 +1,19 @@
'''
The runtime API is unsupported in the simulator, but some stubs are
provided to allow tests to import correctly.
'''
class FakeRuntime(object):
    """Runtime API stub reporting a sentinel version of (-1, -1)."""

    def get_version(self):
        return (-1, -1)

    def is_supported_version(self):
        # The sentinel version is trivially 'supported'.
        return True

    @property
    def supported_versions(self):
        return ((-1, -1),)


runtime = FakeRuntime()

View File

@@ -0,0 +1,312 @@
from contextlib import contextmanager
import functools
import sys
import threading
import numpy as np
from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray
from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module
from ..errors import normalize_kernel_dimensions
from ..args import wrap_arg, ArgHint
"""
Global variable to keep track of the current "kernel context", i.e the
FakeCUDAModule. We only support one kernel launch at a time.
No support for concurrent kernel launch.
"""
_kernel_context = None
@contextmanager
def _push_kernel_context(mod):
"""
Push the current kernel context.
"""
global _kernel_context
assert _kernel_context is None, "concurrent simulated kernel not supported"
_kernel_context = mod
try:
yield
finally:
_kernel_context = None
def _get_kernel_context():
"""
Get the current kernel context. This is usually done by a device function.
"""
return _kernel_context
class FakeOverload:
    '''
    Used only to provide the max_cooperative_grid_blocks method
    '''

    def max_cooperative_grid_blocks(self, blockdim):
        # Only one block may run in a cooperative grid: the simulator has
        # no mechanism for synchronization between different blocks.
        return 1


class FakeOverloadDict(dict):
    '''
    A mapping that yields a FakeOverload for any requested signature, since
    the simulator does not track real overloads.
    '''

    def __getitem__(self, key):
        return FakeOverload()
class FakeCUDAKernel(object):
    '''
    Wraps a @cuda.jit-ed function.
    '''

    def __init__(
        self, fn, device, fastmath=False, extensions=None, debug=False
    ):
        if extensions is None:
            extensions = []
        # The wrapped Python function.
        self.fn = fn
        # True when this wraps a device function rather than a kernel.
        self._device = device
        # Stored for API parity; not read anywhere in this class.
        self._fastmath = fastmath
        self._debug = debug
        self.extensions = list(extensions)  # defensive copy
        # Initial configuration: grid unconfigured, stream 0, no dynamic shared
        # memory.
        self.grid_dim = None
        self.block_dim = None
        self.stream = 0
        self.dynshared_size = 0
        functools.update_wrapper(self, fn)

    def __call__(self, *args):
        # Device functions run inline in the caller's kernel context.
        if self._device:
            with swapped_cuda_module(self.fn, _get_kernel_context()):
                return self.fn(*args)

        # Ensure we've been given a valid grid configuration
        grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim,
                                                          self.block_dim)

        fake_cuda_module = FakeCUDAModule(grid_dim, block_dim,
                                          self.dynshared_size)
        with _push_kernel_context(fake_cuda_module):
            # fake_args substitutes all numpy arrays for FakeCUDAArrays
            # because they implement some semantics differently
            retr = []  # callables invoked after the launch (see below)

            def fake_arg(arg):
                # map the arguments using any extension you've registered
                _, arg = functools.reduce(
                    lambda ty_val, extension: extension.prepare_args(
                        *ty_val,
                        stream=0,
                        retr=retr),
                    self.extensions,
                    (None, arg)
                )

                if isinstance(arg, np.ndarray) and arg.ndim > 0:
                    ret = wrap_arg(arg).to_device(retr)
                elif isinstance(arg, ArgHint):
                    ret = arg.to_device(retr)
                elif isinstance(arg, np.void):
                    ret = FakeCUDAArray(arg)  # In case a np record comes in.
                else:
                    ret = arg
                # Wrap device arrays so kernels see in-kernel semantics.
                if isinstance(ret, FakeCUDAArray):
                    return FakeWithinKernelCUDAArray(ret)
                return ret

            fake_args = [fake_arg(arg) for arg in args]
            with swapped_cuda_module(self.fn, fake_cuda_module):
                # Execute one block at a time
                for grid_point in np.ndindex(*grid_dim):
                    bm = BlockManager(self.fn, grid_dim, block_dim, self._debug)
                    bm.run(grid_point, *fake_args)

            # Run the deferred callables collected during argument
            # preparation (presumably host write-back — confirm against
            # wrap_arg / ArgHint implementations).
            for wb in retr:
                wb()

    def __getitem__(self, configuration):
        # kernel[griddim, blockdim(, stream, dynshared)] launch syntax.
        # NOTE(review): configuration[2] (the stream) is ignored here.
        self.grid_dim, self.block_dim = \
            normalize_kernel_dimensions(*configuration[:2])

        if len(configuration) == 4:
            self.dynshared_size = configuration[3]

        return self

    def bind(self):
        pass

    def specialize(self, *args):
        return self

    def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
        if ntasks < 0:
            raise ValueError("Can't create ForAll with negative task count: %s"
                             % ntasks)
        return self[ntasks, 1, stream, sharedmem]

    @property
    def overloads(self):
        # Pretend an overload exists for any requested signature.
        return FakeOverloadDict()

    @property
    def py_func(self):
        return self.fn
# Thread emulation
class BlockThread(threading.Thread):
    '''
    Manages the execution of a function for a single CUDA thread.
    '''

    def __init__(self, f, manager, blockIdx, threadIdx, debug):
        if debug:
            def debug_wrapper(*args, **kwargs):
                # Turn NumPy divide warnings into exceptions while
                # debugging.
                np.seterr(divide='raise')
                f(*args, **kwargs)
            target = debug_wrapper
        else:
            target = f

        super(BlockThread, self).__init__(target=target)
        # Event used by the BlockManager to release this thread from a
        # syncthreads() barrier.
        self.syncthreads_event = threading.Event()
        # Set by this thread when it parks at a barrier; polled by the
        # BlockManager.
        self.syncthreads_blocked = False
        self._manager = manager
        self.blockIdx = Dim3(*blockIdx)
        self.threadIdx = Dim3(*threadIdx)
        # Populated by run() when the kernel body raises.
        self.exception = None
        self.daemon = True
        # When set, pending/future syncthreads() calls raise to unwind.
        self.abort = False
        self.debug = debug
        blockDim = Dim3(*self._manager._block_dim)
        # Linear thread index within the block, x fastest-varying.
        self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y +
                                                           blockDim.y *
                                                           self.threadIdx.z))

    def run(self):
        try:
            super(BlockThread, self).run()
        except Exception as e:
            # Prefix the exception message with this thread's coordinates so
            # failures are attributable to a specific simulated thread.
            tid = 'tid=%s' % list(self.threadIdx)
            ctaid = 'ctaid=%s' % list(self.blockIdx)
            if str(e) == '':
                msg = '%s %s' % (tid, ctaid)
            else:
                msg = '%s %s: %s' % (tid, ctaid, e)
            tb = sys.exc_info()[2]
            # Using `with_traceback` here would cause it to be mutated by
            # future raise statements, which may or may not matter.
            self.exception = (type(e)(msg), tb)

    def syncthreads(self):
        # Park at the barrier until the manager sets our event; raise if an
        # abort was requested before or during the wait.
        if self.abort:
            raise RuntimeError("abort flag set on syncthreads call")

        self.syncthreads_blocked = True
        self.syncthreads_event.wait()
        self.syncthreads_event.clear()

        if self.abort:
            raise RuntimeError("abort flag set on syncthreads clear")

    def syncthreads_count(self, value):
        # Two barriers: the first ensures every thread has published its
        # value, the second prevents overwriting block_state before all
        # threads have read the reduction.
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        count = np.count_nonzero(self._manager.block_state)
        self.syncthreads()
        return count

    def syncthreads_and(self, value):
        # Barrier-reduce: 1 iff every thread passed a truthy value.
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        test = np.all(self._manager.block_state)
        self.syncthreads()
        return 1 if test else 0

    def syncthreads_or(self, value):
        # Barrier-reduce: 1 iff any thread passed a truthy value.
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        test = np.any(self._manager.block_state)
        self.syncthreads()
        return 1 if test else 0

    def __str__(self):
        return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx)
class BlockManager(object):
    '''
    Manages the execution of a thread block.

    When run() is called, all threads are started. Each thread executes until it
    hits syncthreads(), at which point it sets its own syncthreads_blocked to
    True so that the BlockManager knows it is blocked. It then waits on its
    syncthreads_event.

    The BlockManager polls threads to determine if they are blocked in
    syncthreads(). If it finds a blocked thread, it adds it to the set of
    blocked threads. When all threads are blocked, it unblocks all the threads.
    The thread are unblocked by setting their syncthreads_blocked back to False
    and setting their syncthreads_event.

    The polling continues until no threads are alive, when execution is
    complete.
    '''

    def __init__(self, f, grid_dim, block_dim, debug):
        self._grid_dim = grid_dim
        self._block_dim = block_dim
        self._f = f
        self._debug = debug
        # Shared scratch used by syncthreads_count/and/or: one slot per
        # thread in the block.
        self.block_state = np.zeros(block_dim, dtype=np.bool_)

    def run(self, grid_point, *args):
        # Create all threads
        threads = set()
        livethreads = set()
        blockedthreads = set()
        for block_point in np.ndindex(*self._block_dim):
            def target():
                self._f(*args)
            t = BlockThread(target, self, grid_point, block_point, self._debug)
            t.start()
            threads.add(t)
            livethreads.add(t)

        # Potential optimisations:
        # 1. Continue the while loop immediately after finding a blocked thread
        # 2. Don't poll already-blocked threads
        while livethreads:
            for t in livethreads:
                if t.syncthreads_blocked:
                    blockedthreads.add(t)
                elif t.exception:
                    # Abort all other simulator threads on exception,
                    # do *not* join immediately to facilitate debugging.
                    for t_other in threads:
                        t_other.abort = True
                        t_other.syncthreads_blocked = False
                        t_other.syncthreads_event.set()
                    raise t.exception[0].with_traceback(t.exception[1])
            if livethreads == blockedthreads:
                # Every live thread reached the barrier: release them all.
                for t in blockedthreads:
                    t.syncthreads_blocked = False
                    t.syncthreads_event.set()
                blockedthreads = set()
            livethreads = set([ t for t in livethreads if t.is_alive() ])

        # Final check for exceptions in case any were set prior to thread
        # finishing, before we could check it
        for t in threads:
            if t.exception:
                raise t.exception[0].with_traceback(t.exception[1])

View File

@@ -0,0 +1,495 @@
'''
Implements the cuda module as called from within an executing kernel
(@cuda.jit-decorated function).
'''
from contextlib import contextmanager
import sys
import threading
import traceback
from numba.core import types
import numpy as np
from numba.np import numpy_support
from .vector_types import vector_types
class Dim3(object):
    '''
    Used to implement thread/block indices/dimensions
    '''

    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

    def __str__(self):
        return '(%s, %s, %s)' % (self.x, self.y, self.z)

    def __repr__(self):
        return 'Dim3(%s, %s, %s)' % (self.x, self.y, self.z)

    def __iter__(self):
        # Yield coordinates in x, y, z order so tuple()/list() work.
        return iter((self.x, self.y, self.z))
class GridGroup:
    '''
    Used to implement the grid group.
    '''

    def sync(self):
        # Synchronization of the grid group is equivalent to synchronization of
        # the thread block, because we only support cooperative grids with one
        # block.
        threading.current_thread().syncthreads()


class FakeCUDACg:
    '''
    CUDA Cooperative Groups
    '''

    def this_grid(self):
        return GridGroup()
class FakeCUDALocal(object):
    '''
    CUDA Local arrays
    '''

    def array(self, shape, dtype):
        # Accept either a Numba type or anything NumPy understands.
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
        return np.empty(shape, dtype)


class FakeCUDAConst(object):
    '''
    CUDA Const arrays
    '''

    def array_like(self, ary):
        # Constant memory is simply the host array itself in the simulator.
        return ary
class FakeCUDAShared(object):
    '''
    CUDA Shared arrays.

    Limitations: assumes that only one call to cuda.shared.array is on a line,
    and that that line is only executed once per thread. i.e.::

        a = cuda.shared.array(...); b = cuda.shared.array(...)

    will erroneously alias a and b, and::

        for i in range(10):
            sharedarrs[i] = cuda.shared.array(...)

    will alias all arrays created at that point (though it is not certain that
    this would be supported by Numba anyway).
    '''

    def __init__(self, dynshared_size):
        # Maps (filename, lineno) of the allocation site to its array.
        self._allocations = {}
        self._dynshared_size = dynshared_size
        # Backing buffer for dynamic shared memory requests (shape == 0).
        self._dynshared = np.zeros(dynshared_size, dtype=np.byte)

    def array(self, shape, dtype):
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
        # Dynamic shared memory is requested with size 0 - this all shares the
        # same underlying memory
        if shape == 0:
            # Count must be the maximum number of whole elements that fit in the
            # buffer (Numpy complains if the buffer is not a multiple of the
            # element size)
            count = self._dynshared_size // dtype.itemsize
            return np.frombuffer(self._dynshared.data, dtype=dtype, count=count)

        # Otherwise, identify allocations by source file and line number
        # We pass the reference frame explicitly to work around
        # http://bugs.python.org/issue25108
        stack = traceback.extract_stack(sys._getframe())
        caller = stack[-2][0:2]
        res = self._allocations.get(caller)
        if res is None:
            res = np.empty(shape, dtype)
            self._allocations[caller] = res
        return res
# One lock per atomic operation, serialising updates from the simulated
# block threads so each read-modify-write in FakeCUDAAtomic is atomic.
# Note both `compare_and_swaplock` and `caslock` exist: one for each of the
# two CAS-style entry points below.
addlock = threading.Lock()
sublock = threading.Lock()
andlock = threading.Lock()
orlock = threading.Lock()
xorlock = threading.Lock()
maxlock = threading.Lock()
minlock = threading.Lock()
compare_and_swaplock = threading.Lock()
caslock = threading.Lock()
inclock = threading.Lock()
declock = threading.Lock()
exchlock = threading.Lock()
class FakeCUDAAtomic(object):
    """
    Simulated atomic operations. Each method performs its read-modify-write
    under the corresponding module-level lock and returns the value that was
    loaded before modification.
    """

    def add(self, array, index, val):
        with addlock:
            old = array[index]
            array[index] += val
        return old

    def sub(self, array, index, val):
        with sublock:
            old = array[index]
            array[index] -= val
        return old

    def and_(self, array, index, val):
        with andlock:
            old = array[index]
            array[index] &= val
        return old

    def or_(self, array, index, val):
        with orlock:
            old = array[index]
            array[index] |= val
        return old

    def xor(self, array, index, val):
        with xorlock:
            old = array[index]
            array[index] ^= val
        return old

    def inc(self, array, index, val):
        # Increment with wrap-around: resets to 0 once old >= val.
        with inclock:
            old = array[index]
            if old >= val:
                array[index] = 0
            else:
                array[index] += 1
        return old

    def dec(self, array, index, val):
        # Decrement with wrap-around: resets to val when old is 0 or
        # exceeds val.
        with declock:
            old = array[index]
            if (old == 0) or (old > val):
                array[index] = val
            else:
                array[index] -= 1
        return old

    def exch(self, array, index, val):
        with exchlock:
            old = array[index]
            array[index] = val
        return old

    def max(self, array, index, val):
        with maxlock:
            old = array[index]
            array[index] = max(old, val)
        return old

    def min(self, array, index, val):
        with minlock:
            old = array[index]
            array[index] = min(old, val)
        return old

    def nanmax(self, array, index, val):
        # NaN-ignoring variant; shares maxlock with max().
        with maxlock:
            old = array[index]
            array[index] = np.nanmax([array[index], val])
        return old

    def nanmin(self, array, index, val):
        # NaN-ignoring variant; shares minlock with min().
        with minlock:
            old = array[index]
            array[index] = np.nanmin([array[index], val])
        return old

    def compare_and_swap(self, array, old, val):
        # Always operates on the first element of `array`; returns the
        # loaded value whether or not the swap happened.
        with compare_and_swaplock:
            index = (0,) * array.ndim
            loaded = array[index]
            if loaded == old:
                array[index] = val
            return loaded

    def cas(self, array, index, old, val):
        # Indexed CAS variant; returns the loaded value in either case.
        with caslock:
            loaded = array[index]
            if loaded == old:
                array[index] = val
            return loaded
class FakeCUDAFp16(object):
    """Simulates the ``cuda.fp16.*`` intrinsics.

    Arithmetic and comparison operators fall back to plain Python
    operators; transcendental functions use the corresponding numpy ufunc
    with ``dtype=np.float16`` to emulate half-precision results.
    """

    def hadd(self, a, b):
        return a + b

    def hsub(self, a, b):
        return a - b

    def hmul(self, a, b):
        return a * b

    def hdiv(self, a, b):
        return a / b

    def hfma(self, a, b, c):
        # Fused multiply-add.
        return a * b + c

    def hneg(self, a):
        return -a

    def habs(self, a):
        return abs(a)

    def hsin(self, x):
        return np.sin(x, dtype=np.float16)

    def hcos(self, x):
        return np.cos(x, dtype=np.float16)

    def hlog(self, x):
        return np.log(x, dtype=np.float16)

    def hlog2(self, x):
        return np.log2(x, dtype=np.float16)

    def hlog10(self, x):
        return np.log10(x, dtype=np.float16)

    def hexp(self, x):
        return np.exp(x, dtype=np.float16)

    def hexp2(self, x):
        return np.exp2(x, dtype=np.float16)

    def hexp10(self, x):
        # numpy has no exp10 ufunc, so compute 10**x and round to float16.
        return np.float16(10 ** x)

    def hsqrt(self, x):
        return np.sqrt(x, dtype=np.float16)

    def hrsqrt(self, x):
        # Reciprocal square root.
        return np.float16(x ** -0.5)

    def hceil(self, x):
        return np.ceil(x, dtype=np.float16)

    def hfloor(self, x):
        # Bug fix: this previously called np.ceil, rounding in the wrong
        # direction.
        return np.floor(x, dtype=np.float16)

    def hrcp(self, x):
        # Reciprocal.
        return np.reciprocal(x, dtype=np.float16)

    def htrunc(self, x):
        return np.trunc(x, dtype=np.float16)

    def hrint(self, x):
        # Round to nearest integer.
        return np.rint(x, dtype=np.float16)

    def heq(self, a, b):
        return a == b

    def hne(self, a, b):
        return a != b

    def hge(self, a, b):
        return a >= b

    def hgt(self, a, b):
        return a > b

    def hle(self, a, b):
        return a <= b

    def hlt(self, a, b):
        return a < b

    def hmax(self, a, b):
        return max(a, b)

    def hmin(self, a, b):
        return min(a, b)
class FakeCUDAModule(object):
    '''
    An instance of this class will be injected into the __globals__ for an
    executing function in order to implement calls to cuda.*. This will fail to
    work correctly if the user code does::

        from numba import cuda as something_else

    In other words, the CUDA module must be called cuda.
    '''

    def __init__(self, grid_dim, block_dim, dynshared_size):
        self.gridDim = Dim3(*grid_dim)
        self.blockDim = Dim3(*block_dim)
        self._cg = FakeCUDACg()
        self._local = FakeCUDALocal()
        self._shared = FakeCUDAShared(dynshared_size)
        self._const = FakeCUDAConst()
        self._atomic = FakeCUDAAtomic()
        self._fp16 = FakeCUDAFp16()

        # Insert the vector types into the kernel context
        # Note that we need to do this in addition to exposing them as module
        # variables in `simulator.__init__.py`, because the test cases need
        # to access the actual cuda module as well as the fake cuda module
        # for vector types.
        for name, svty in vector_types.items():
            setattr(self, name, svty)
            for alias in svty.aliases:
                setattr(self, alias, svty)

    @property
    def cg(self):
        return self._cg

    @property
    def local(self):
        return self._local

    @property
    def shared(self):
        return self._shared

    @property
    def const(self):
        return self._const

    @property
    def atomic(self):
        return self._atomic

    @property
    def fp16(self):
        return self._fp16

    @property
    def threadIdx(self):
        # Thread index is stored on the simulated thread object itself.
        return threading.current_thread().threadIdx

    @property
    def blockIdx(self):
        return threading.current_thread().blockIdx

    @property
    def warpsize(self):
        return 32

    @property
    def laneid(self):
        return threading.current_thread().thread_id % 32

    def syncthreads(self):
        threading.current_thread().syncthreads()

    def threadfence(self):
        # No-op
        pass

    def threadfence_block(self):
        # No-op
        pass

    def threadfence_system(self):
        # No-op
        pass

    def syncthreads_count(self, val):
        return threading.current_thread().syncthreads_count(val)

    def syncthreads_and(self, val):
        return threading.current_thread().syncthreads_and(val)

    def syncthreads_or(self, val):
        return threading.current_thread().syncthreads_or(val)

    def popc(self, val):
        # Population count: the number of set bits in val.
        return bin(val).count("1")

    def fma(self, a, b, c):
        # Fused multiply-add.
        return a * b + c

    def cbrt(self, a):
        # Real cube root. Bug fix: ``a ** (1 / 3)`` alone yields a complex
        # number for negative a, whereas CUDA's cbrt is defined for negative
        # inputs - so compute on the magnitude and reapply the sign.
        if a < 0:
            return -((-a) ** (1 / 3))
        return a ** (1 / 3)

    def brev(self, val):
        # Reverse the 32 bits of val.
        return int('{:032b}'.format(val)[::-1], 2)

    def clz(self, val):
        # Count leading zeros in the 32-bit representation of val.
        s = '{:032b}'.format(val)
        return len(s) - len(s.lstrip('0'))

    def ffs(self, val):
        # Find the (1-based) position of the least significant set bit.
        # The algorithm is:
        # 1. Count the number of trailing zeros.
        # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on.
        # 3. If we've counted 32 zeros (resulting in 33), there were no bits
        #    set so we need to return zero.
        s = '{:032b}'.format(val)
        r = (len(s) - len(s.rstrip('0')) + 1) % 33
        return r

    def selp(self, a, b, c):
        # Select: b if the predicate a is true, else c.
        return b if a else c

    def grid(self, n):
        """Return the absolute position of the current thread within the
        grid, in *n* dimensions (1 <= n <= 3)."""
        bdim = self.blockDim
        bid = self.blockIdx
        tid = self.threadIdx
        x = bid.x * bdim.x + tid.x
        if n == 1:
            return x
        y = bid.y * bdim.y + tid.y
        if n == 2:
            return (x, y)
        z = bid.z * bdim.z + tid.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global ID has 1-3 dimensions. %d requested" % n)

    def gridsize(self, n):
        """Return the total number of threads in the grid, in *n*
        dimensions (1 <= n <= 3)."""
        bdim = self.blockDim
        gdim = self.gridDim
        x = bdim.x * gdim.x
        if n == 1:
            return x
        y = bdim.y * gdim.y
        if n == 2:
            return (x, y)
        z = bdim.z * gdim.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global grid has 1-3 dimensions. %d requested" % n)
@contextmanager
def swapped_cuda_module(fn, fake_cuda_module):
    """Context manager that temporarily rebinds every global name in *fn*
    that refers to the real numba.cuda module to *fake_cuda_module*, and
    restores the original bindings on exit."""
    from numba import cuda

    globs = fn.__globals__
    # Remember each global name currently bound to the real cuda module.
    saved = {name: value for name, value in globs.items() if value is cuda}
    # Point every one of those names at the fake module instead.
    globs.update({name: fake_cuda_module for name in saved})
    try:
        yield
    finally:
        # Put the original bindings back, even if the body raised.
        globs.update(saved)

View File

@@ -0,0 +1,15 @@
from functools import reduce as pyreduce
def Reduce(func):
    """Simulator implementation of ``cuda.Reduce``.

    Wraps the binary function *func* in a callable with the signature
    ``wrapper(seq, res=None, init=0)``: the sequence is folded with *func*
    starting from *init*; the result is returned directly, unless *res* is
    given, in which case it is written to ``res[0]`` and None is returned.
    """
    def reduce_wrapper(seq, res=None, init=0):
        result = pyreduce(func, seq, init)
        if res is None:
            return result
        res[0] = result
        return None
    return reduce_wrapper
reduce = Reduce

View File

@@ -0,0 +1,60 @@
from numba import types
from numba.cuda.stubs import _vector_type_stubs
class SimulatedVectorType:
    """Base class for simulated CUDA vector types (e.g. float2, int4).

    Constructor arguments may mix scalars with other vector values; vector
    arguments are flattened into their components, and the total component
    count must equal the subclass's ``num_elements``. Components are bound
    to the attributes x, y, z, w in order.
    """
    attributes = ['x', 'y', 'z', 'w']

    def __init__(self, *args):
        components = []
        for arg in args:
            if isinstance(arg, SimulatedVectorType):
                components.extend(arg.as_list())
            else:
                components.append(arg)
        self._attrs = self.attributes[:len(components)]
        if len(components) != self.num_elements:
            raise TypeError(
                f"{self.name} expects {self.num_elements}"
                f" elements, got {len(components)}"
            )
        for value, attr in zip(components, self._attrs):
            setattr(self, attr, value)

    @property
    def name(self):
        # Subclasses override this with a concrete class attribute.
        raise NotImplementedError()

    @property
    def num_elements(self):
        # Subclasses override this with a concrete class attribute.
        raise NotImplementedError()

    def as_list(self):
        """Return the components as a plain list, in x, y, z, w order."""
        return [getattr(self, attr) for attr in self._attrs]
def make_simulated_vector_type(num_elements, name):
    """Create a SimulatedVectorType subclass named *name* with
    *num_elements* components of base type float32."""
    namespace = {
        "num_elements": num_elements,
        "base_type": types.float32,
        "name": name,
    }
    cls = type(name, (SimulatedVectorType,), namespace)
    # In the simulator the class stands in for its own user-facing object.
    cls.user_facing_object = cls
    return cls
def _initialize():
    """Build the mapping of vector type name -> simulated vector type,
    one entry per stub declared in numba.cuda.stubs."""
    simulated = {}
    for stub in _vector_type_stubs:
        # Stub names end with their element count, e.g. "float4" -> 4.
        count = int(stub.__name__[-1])
        svty = make_simulated_vector_type(count, stub.__name__)
        svty.aliases = stub.aliases
        simulated[stub.__name__] = svty
    return simulated
vector_types = _initialize()