This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,24 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import unittest
from numba.testing import load_testsuite
from numba import cuda
from os.path import dirname, join
def load_tests(loader, tests, pattern):
    """Assemble the CUDA test suite.

    Tests that need no GPU are always included; simulator and
    hardware-dependent suites are added only when CUDA is available,
    and the driver/kernel suites only for devices of CC >= 2.0.
    """
    suite = unittest.TestSuite()
    this_dir = dirname(__file__)
    ensure_supported_ccs_initialized()
    suite.addTests(load_testsuite(loader, join(this_dir, 'nocuda')))
    if not cuda.is_available():
        print("skipped CUDA tests")
        return suite
    suite.addTests(load_testsuite(loader, join(this_dir, 'cudasim')))
    gpus = cuda.list_devices()
    if gpus and gpus[0].compute_capability >= (2, 0):
        for subdir in ('cudadrv', 'cudapy', 'doc_examples'):
            suite.addTests(load_testsuite(loader, join(this_dir, subdir)))
    else:
        print("skipped CUDA tests because GPU CC < 2.0")
    return suite

View File

@@ -0,0 +1,8 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
    """Load every test module in this directory once supported compute
    capabilities have been initialized."""
    ensure_supported_ccs_initialized()
    this_dir = os.path.dirname(__file__)
    return load_testsuite(loader, this_dir)

View File

@@ -0,0 +1,145 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
class TestArrayAttr(CUDATestCase):
    """Device-array attribute tests: contiguity queries, ravel() and
    reshape() in C and Fortran orders, and stride-view failure modes."""

    def _check_contiguity(self, shape):
        # A C-ordered array and its Fortran-ordered copy must each
        # report exactly the matching contiguity flag on the device.
        cary = np.arange(np.prod(shape)).reshape(shape)
        fary = np.asfortranarray(cary)
        dcary = cuda.to_device(cary)
        dfary = cuda.to_device(fary)
        self.assertTrue(dcary.is_c_contiguous())
        self.assertFalse(dfary.is_c_contiguous())
        self.assertFalse(dcary.is_f_contiguous())
        self.assertTrue(dfary.is_f_contiguous())

    def test_contigous_2d(self):
        self._check_contiguity((2, 5))

    def test_contigous_3d(self):
        self._check_contiguity((2, 5, 2))

    def test_contigous_4d(self):
        self._check_contiguity((2, 5, 2, 3))

    def _check_ravel(self, host, orders):
        # ravel() must hand back a distinct, 1-D device array whose
        # contents match NumPy's ravel for each requested order.
        for order in orders:
            expect = host.ravel(order=order)
            dary = cuda.to_device(host)
            dflat = dary.ravel(order=order)
            flat = dflat.copy_to_host()
            self.assertIsNot(dary, dflat)
            self.assertEqual(flat.ndim, 1)
            self.assertPreciseEqual(expect, flat)

    def test_ravel_1d(self):
        self._check_ravel(np.arange(60), 'CFA')

    def _check_stride_view(self, dary, darystride):
        # Slicing produced a no-copy view: both device arrays expose
        # the same data pointer via the CUDA Array Interface.
        parent_ptr = dary.__cuda_array_interface__['data'][0]
        view_ptr = darystride.__cuda_array_interface__['data'][0]
        self.assertEqual(parent_ptr, view_ptr)
        # ravel() refuses to flatten a non-contiguous device array.
        with self.assertRaises(NotImplementedError):
            darystride.ravel()

    @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
    def test_ravel_stride_1d(self):
        dary = cuda.to_device(np.arange(60))
        self._check_stride_view(dary, dary[::2])

    def test_ravel_c(self):
        reshaped = np.arange(60).reshape(2, 5, 2, 3)
        # Default invocation (implicit C order).
        expect = reshaped.ravel(order='C')
        dary = cuda.to_device(reshaped)
        dflat = dary.ravel()
        flat = dflat.copy_to_host()
        self.assertIsNot(dary, dflat)
        self.assertEqual(flat.ndim, 1)
        self.assertPreciseEqual(expect, flat)
        # Explicit order kwarg.
        self._check_ravel(reshaped, 'CA')

    @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
    def test_ravel_stride_c(self):
        dary = cuda.to_device(np.arange(60).reshape(2, 5, 2, 3))
        self._check_stride_view(dary, dary[::2, ::2, ::2, ::2])

    def test_ravel_f(self):
        reshaped = np.asfortranarray(np.arange(60).reshape(2, 5, 2, 3))
        self._check_ravel(reshaped, 'FA')

    @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
    def test_ravel_stride_f(self):
        reshaped = np.asfortranarray(np.arange(60).reshape(2, 5, 2, 3))
        dary = cuda.to_device(reshaped)
        self._check_stride_view(dary, dary[::2, ::2, ::2, ::2])

    def _check_reshape(self, order):
        # Round-trip a reshape through the device and compare with
        # NumPy's result for the same order.
        ary = np.arange(10)
        expect = ary.reshape(2, 5, order=order)
        dary = cuda.to_device(ary)
        got = dary.reshape(2, 5, order=order).copy_to_host()
        self.assertPreciseEqual(expect, got)

    def test_reshape_c(self):
        self._check_reshape('C')

    def test_reshape_f(self):
        self._check_reshape('F')
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,145 @@
import numbers
from ctypes import byref
import weakref
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
from numba.cuda.cudadrv import driver
class TestContextStack(CUDATestCase):
    """Behaviour of the ``cuda.gpus`` device list / context stack."""

    def setUp(self):
        super().setUp()
        # Start each test from a clean slate with no active context.
        cuda.close()

    def test_gpus_current(self):
        # No device is current until one is explicitly entered.
        self.assertIs(cuda.gpus.current, None)
        with cuda.gpus[0]:
            self.assertEqual(int(cuda.gpus.current.id), 0)

    def test_gpus_len(self):
        self.assertGreater(len(cuda.gpus), 0)

    def test_gpus_iter(self):
        devices_seen = list(cuda.gpus)
        self.assertGreater(len(devices_seen), 0)
class TestContextAPI(CUDATestCase):
    """Context-level API tests: memory queries and GPU switching."""

    def tearDown(self):
        super().tearDown()
        cuda.close()

    def test_context_memory(self):
        try:
            mem = cuda.current_context().get_memory_info()
        except NotImplementedError:
            self.skipTest('EMM Plugin does not implement get_memory_info()')
        # Result behaves as a (free, total) named tuple of numbers.
        self.assertIsInstance(mem.free, numbers.Number)
        self.assertEqual(mem.free, mem[0])
        self.assertIsInstance(mem.total, numbers.Number)
        self.assertEqual(mem.total, mem[1])
        self.assertLessEqual(mem.free, mem.total)

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    @skip_on_cudasim('CUDA HW required')
    def test_forbidden_context_switch(self):
        # Cannot switch context inside a `cuda.require_context`
        @cuda.require_context
        def try_switch():
            with cuda.gpus[1]:
                pass

        with cuda.gpus[0]:
            with self.assertRaises(RuntimeError) as cm:
                try_switch()
            self.assertIn("Cannot switch CUDA-context.", str(cm.exception))

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_accepted_context_switch(self):
        def enter_other_gpu():
            with cuda.gpus[1]:
                return cuda.current_context().device.id

        with cuda.gpus[0]:
            other_id = enter_other_gpu()
        self.assertEqual(int(other_id), 1)
@skip_on_cudasim('CUDA HW required')
class Test3rdPartyContext(CUDATestCase):
    """Interoperation with CUDA contexts created outside Numba via the
    raw driver API (emulating a third-party library)."""

    def tearDown(self):
        super().tearDown()
        cuda.close()

    def test_attached_primary(self, extra_work=lambda: None):
        # Emulate primary context creation by 3rd party
        the_driver = driver.driver
        if driver.USE_NV_BINDING:
            dev = driver.binding.CUdevice(0)
            hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
        else:
            dev = 0
            hctx = driver.drvapi.cu_context()
            the_driver.cuDevicePrimaryCtxRetain(byref(hctx), dev)
        try:
            ctx = driver.Context(weakref.proxy(self), hctx)
            ctx.push()
            # Check that the context from numba matches the created primary
            # context.
            my_ctx = cuda.current_context()
            if driver.USE_NV_BINDING:
                self.assertEqual(int(my_ctx.handle), int(ctx.handle))
            else:
                self.assertEqual(my_ctx.handle.value, ctx.handle.value)
            # Hook for subclasses/other tests to run extra checks while
            # the externally-retained primary context is current.
            extra_work()
        finally:
            # Pop before releasing so the primary context refcount from
            # cuDevicePrimaryCtxRetain is balanced.
            ctx.pop()
            the_driver.cuDevicePrimaryCtxRelease(dev)

    def test_attached_non_primary(self):
        # Emulate non-primary context creation by 3rd party
        the_driver = driver.driver
        if driver.USE_NV_BINDING:
            flags = 0
            dev = driver.binding.CUdevice(0)
            hctx = the_driver.cuCtxCreate(flags, dev)
        else:
            hctx = driver.drvapi.cu_context()
            the_driver.cuCtxCreate(byref(hctx), 0, 0)
        try:
            cuda.current_context()
        except RuntimeError as e:
            # Expecting an error about non-primary CUDA context
            self.assertIn("Numba cannot operate on non-primary CUDA context ",
                          str(e))
        else:
            self.fail("No RuntimeError raised")
        finally:
            the_driver.cuCtxDestroy(hctx)

    def test_cudajit_in_attached_primary_context(self):
        # Compiling and launching a kernel must work while an externally
        # retained primary context is current.
        def do():
            from numba import cuda

            @cuda.jit
            def foo(a):
                for i in range(a.size):
                    a[i] = i
            a = cuda.device_array(10)
            foo[1, 1](a)
            self.assertEqual(list(a.copy_to_host()), list(range(10)))
        self.test_attached_primary(do)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,376 @@
from itertools import product
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
from unittest.mock import patch
class CudaArrayIndexing(CUDATestCase):
    """Scalar indexing of device arrays: every valid (including
    negative) index matches NumPy, and out-of-bounds indices raise
    IndexError."""

    def test_index_1d(self):
        arr = np.arange(10)
        darr = cuda.to_device(arr)
        x, = arr.shape
        for i in range(-x, x):
            self.assertEqual(arr[i], darr[i])
        for bad in (-x - 1, x):
            with self.assertRaises(IndexError):
                darr[bad]

    def test_index_2d(self):
        arr = np.arange(3 * 4).reshape(3, 4)
        darr = cuda.to_device(arr)
        x, y = arr.shape
        for i, j in product(range(-x, x), range(-y, y)):
            self.assertEqual(arr[i, j], darr[i, j])
        for bad in ((-x - 1, 0), (x, 0), (0, -y - 1), (0, y)):
            with self.assertRaises(IndexError):
                darr[bad]

    def test_index_3d(self):
        arr = np.arange(3 * 4 * 5).reshape(3, 4, 5)
        darr = cuda.to_device(arr)
        x, y, z = arr.shape
        for i, j, k in product(range(-x, x), range(-y, y), range(-z, z)):
            self.assertEqual(arr[i, j, k], darr[i, j, k])
        for bad in ((-x - 1, 0, 0), (x, 0, 0), (0, -y - 1, 0),
                    (0, y, 0), (0, 0, -z - 1), (0, 0, z)):
            with self.assertRaises(IndexError):
                darr[bad]
class CudaArrayStridedSlice(CUDATestCase):
    """Strided slicing of device arrays matches NumPy's behaviour."""

    def test_strided_index_1d(self):
        arr = np.arange(10)
        darr = cuda.to_device(arr)
        for start in range(arr.size):
            np.testing.assert_equal(arr[start::2],
                                    darr[start::2].copy_to_host())

    def test_strided_index_2d(self):
        arr = np.arange(6 * 7).reshape(6, 7)
        darr = cuda.to_device(arr)
        for i, j in product(range(arr.shape[0]), range(arr.shape[1])):
            np.testing.assert_equal(arr[i::2, j::2],
                                    darr[i::2, j::2].copy_to_host())

    def test_strided_index_3d(self):
        arr = np.arange(6 * 7 * 8).reshape(6, 7, 8)
        darr = cuda.to_device(arr)
        for i, j, k in product(*map(range, arr.shape)):
            np.testing.assert_equal(arr[i::2, j::2, k::2],
                                    darr[i::2, j::2, k::2].copy_to_host())
class CudaArraySlicing(CUDATestCase):
    """Plain (non-strided) slicing and dimension selection on device
    arrays, including empty and out-of-bound slices."""

    def test_prefix_1d(self):
        arr = np.arange(5)
        darr = cuda.to_device(arr)
        for start in range(arr.size):
            expect = arr[start:]
            got = darr[start:].copy_to_host()
            self.assertTrue(np.all(expect == got))

    def test_prefix_2d(self):
        arr = np.arange(3 ** 2).reshape(3, 3)
        darr = cuda.to_device(arr)
        for i, j in product(range(arr.shape[0]), range(arr.shape[1])):
            expect = arr[i:, j:]
            sliced = darr[i:, j:]
            # Shape and strides of the view must match NumPy's.
            self.assertEqual(expect.shape, sliced.shape)
            self.assertEqual(expect.strides, sliced.strides)
            got = sliced.copy_to_host()
            self.assertTrue(np.all(expect == got))

    def test_select_3d_first_two_dim(self):
        arr = np.arange(3 * 4 * 5).reshape(3, 4, 5)
        darr = cuda.to_device(arr)

        def check(expect, sliced):
            # Selection must agree with NumPy in shape, strides, data.
            self.assertEqual(expect.shape, sliced.shape)
            self.assertEqual(expect.strides, sliced.strides)
            got = sliced.copy_to_host()
            self.assertTrue(np.all(expect == got))

        # Select first dimension
        for i in range(arr.shape[0]):
            check(arr[i], darr[i])
        # Select second dimension
        for i, j in product(range(arr.shape[0]), range(arr.shape[1])):
            check(arr[i, j], darr[i, j])

    def _check_select(self, order):
        # Select single rows/columns along every axis pairing and
        # compare against the host array of the given memory order.
        a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order=order)
        da = cuda.to_device(a)
        for i in range(a.shape[0]):
            for j in range(a.shape[1]):
                self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(),
                                               a[i, j, :]))
            for j in range(a.shape[2]):
                self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(),
                                               a[i, :, j]))
        for i, j in product(range(a.shape[1]), range(a.shape[2])):
            self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(),
                                           a[:, i, j]))

    def test_select_f(self):
        self._check_select('F')

    def test_select_c(self):
        self._check_select('C')

    def test_prefix_select(self):
        arr = np.arange(5 * 7).reshape(5, 7, order='F')
        darr = cuda.to_device(arr)
        self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1]))

    def test_negative_slicing_1d(self):
        arr = np.arange(10)
        darr = cuda.to_device(arr)
        for lo, hi in product(range(-10, 10), repeat=2):
            np.testing.assert_array_equal(arr[lo:hi],
                                          darr[lo:hi].copy_to_host())

    def test_negative_slicing_2d(self):
        arr = np.arange(12).reshape(3, 4)
        darr = cuda.to_device(arr)
        for x, y, w, s in product(range(-4, 4), repeat=4):
            np.testing.assert_array_equal(arr[x:y, w:s],
                                          darr[x:y, w:s].copy_to_host())

    def test_empty_slice_1d(self):
        arr = np.arange(5)
        darr = cuda.to_device(arr)
        for i in range(darr.shape[0]):
            np.testing.assert_array_equal(darr[i:i].copy_to_host(), arr[i:i])
        # An empty slice of an empty slice is still empty.
        np.testing.assert_array_equal(darr[:0][:0].copy_to_host(), np.empty(0))
        # Out-of-bound slices just produce empty slices.
        np.testing.assert_array_equal(darr[:0][:1].copy_to_host(),
                                      arr[:0][:1])
        np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(),
                                      arr[:0][-1:])

    def test_empty_slice_2d(self):
        arr = np.arange(5 * 7).reshape(5, 7)
        darr = cuda.to_device(arr)
        np.testing.assert_array_equal(darr[:0].copy_to_host(), arr[:0])
        np.testing.assert_array_equal(darr[3, :0].copy_to_host(), arr[3, :0])
        # An empty slice of an empty slice is still empty.
        np.testing.assert_array_equal(darr[:0][:0].copy_to_host(),
                                      np.empty((0, 7)))
        # Out-of-bound slices just produce empty slices.
        np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1])
        np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(),
                                      arr[:0][-1:])
class CudaArraySetting(CUDATestCase):
    """
    Most of the slicing logic is tested in the cases above, so these
    tests focus on the setting logic.
    """

    def _check_setitem(self, arr, key, value):
        # Copy to the device first, then apply the same assignment on
        # host and device and compare the round-tripped result.
        darr = cuda.to_device(arr)
        arr[key] = value
        darr[key] = value
        np.testing.assert_array_equal(darr.copy_to_host(), arr)

    def test_scalar(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), (2, 2), 500)

    def test_rank(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), 2, 500)

    def test_broadcast(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7),
                            np.s_[:, 2], 500)

    def test_array_assign_column(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), 2,
                            np.full(shape=7, fill_value=400))

    def test_array_assign_row(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), np.s_[:, 2],
                            np.full(shape=5, fill_value=400))

    def test_array_assign_subarray(self):
        self._check_setitem(np.arange(5 * 6 * 7).reshape(5, 6, 7), 2,
                            np.full(shape=(6, 7), fill_value=400))

    def test_array_assign_deep_subarray(self):
        self._check_setitem(np.arange(5 * 6 * 7 * 8).reshape(5, 6, 7, 8),
                            np.s_[:, :, 2],
                            np.full(shape=(5, 6, 8), fill_value=400))

    def test_array_assign_all(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), np.s_[:],
                            np.full(shape=(5, 7), fill_value=400))

    def test_strides(self):
        self._check_setitem(np.ones(20), np.s_[::2], 500)

    def test_incompatible_highdim(self):
        darr = cuda.to_device(np.arange(5 * 7))
        with self.assertRaises(ValueError) as e:
            darr[:] = np.ones(shape=(1, 2, 3))
        # The wording differs between the hardware target and the
        # simulator (which surfaces NumPy's broadcast error).
        self.assertIn(
            member=str(e.exception),
            container=[
                "Can't assign 3-D array to 1-D self",  # device
                "could not broadcast input array from shape (2,3) "
                "into shape (35,)",  # simulator, NP >= 1.20
            ])

    def test_incompatible_shape(self):
        darr = cuda.to_device(np.arange(5))
        with self.assertRaises(ValueError) as e:
            darr[:] = [1, 3]
        self.assertIn(
            member=str(e.exception),
            container=[
                "Can't copy sequence with size 2 to array axis 0 with "
                "dimension 5",  # device
                "could not broadcast input array from shape (2,) into "
                "shape (5,)",  # simulator, NP >= 1.20
            ])

    @skip_on_cudasim('cudasim does not use streams and operates synchronously')
    def test_sync(self):
        # There should be a synchronization when no stream is supplied
        darr = cuda.to_device(np.arange(5))
        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                          return_value=None) as sync_spy:
            darr[0] = 10
        sync_spy.assert_called_once()

    @skip_on_cudasim('cudasim does not use streams and operates synchronously')
    def test_no_sync_default_stream(self):
        # There should not be a synchronization when the array has a default
        # stream, whether it is the default stream, the legacy default stream,
        # the per-thread default stream, or another stream.
        candidates = (cuda.stream(), cuda.default_stream(),
                      cuda.legacy_default_stream(),
                      cuda.per_thread_default_stream())
        for stream in candidates:
            darr = cuda.to_device(np.arange(5), stream=stream)
            with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                              return_value=None) as sync_spy:
                darr[0] = 10
            sync_spy.assert_not_called()

    @skip_on_cudasim('cudasim does not use streams and operates synchronously')
    def test_no_sync_supplied_stream(self):
        # There should not be a synchronization when a stream is supplied for
        # the setitem call, whether it is the default stream, the legacy
        # default stream, the per-thread default stream, or another stream.
        candidates = (cuda.stream(), cuda.default_stream(),
                      cuda.legacy_default_stream(),
                      cuda.per_thread_default_stream())
        for stream in candidates:
            darr = cuda.to_device(np.arange(5))
            with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                              return_value=None) as sync_spy:
                darr.setitem(0, 10, stream=stream)
            sync_spy.assert_not_called()

    @unittest.skip('Requires PR #6367')
    def test_issue_6505(self):
        # On Windows, the writes to ary_v would not be visible prior to the
        # assertion, due to the assignment being done with a kernel launch
        # that returns asynchronously - there should now be a sync after the
        # kernel launch to ensure that the writes are always visible.
        ary = cuda.mapped_array(2, dtype=np.int32)
        ary[:] = 0
        ary_v = ary.view('u1')
        ary_v[1] = 1
        ary_v[5] = 1
        self.assertEqual(sum(ary), 512)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,21 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase
class TestCudaAutoContext(CUDATestCase):
    def test_auto_context(self):
        """A customer-reported problem was that using cuda.to_device did
        not implicitly create a CUDA context. This test exercises that
        path by round-tripping an array through the device.
        """
        src = np.arange(10, dtype=np.float32)
        dst = np.empty_like(src)
        dev = cuda.to_device(src)
        dev.copy_to_host(dst)
        self.assertTrue(np.allclose(src, dst))
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,179 @@
import numpy as np
import ctypes
from numba.cuda.cudadrv.devicearray import (DeviceRecord, from_record_like,
auto_device)
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
from numba.np import numpy_support
from numba import cuda
# Width of the fixed-size string field in `recordtype`.
N_CHARS = 5

# Aligned record mixing float, int, complex and fixed-width string fields.
recordtype = np.dtype(
    [
        ('a', np.float64),
        ('b', np.int32),
        ('c', np.complex64),
        ('d', (np.str_, N_CHARS)),
    ],
    align=True,
)

# Aligned record containing a small embedded array field.
recordwitharray = np.dtype(
    [
        ('g', np.int32),
        ('h', np.float32, 2),
    ],
    align=True,
)

# Record containing a 3x3 matrix field.
recwithmat = np.dtype([('i', np.int32),
                       ('j', np.float32, (3, 3))])

# Record nesting another record that itself contains a matrix.
recwithrecwithmat = np.dtype([('x', np.int32), ('y', recwithmat)])
@skip_on_cudasim('Device Record API unsupported in the simulator')
class TestCudaDeviceRecord(CUDATestCase):
    """
    Tests the DeviceRecord class with np.void host types.
    """

    def setUp(self):
        super().setUp()
        self._create_data(np.zeros)

    def _create_data(self, array_ctor):
        # Build one all-zero and one non-zero host record of a shared
        # aligned dtype; subclasses pass a different container ctor.
        self.dtype = np.dtype([('a', np.int32), ('b', np.float32)],
                              align=True)
        self.hostz = array_ctor(1, self.dtype)[0]
        self.hostnz = array_ctor(1, self.dtype)[0]
        self.hostnz['a'] = 10
        self.hostnz['b'] = 11.0

    def _check_device_record(self, reference, rec):
        # A device record is scalar-shaped and mirrors the host dtype.
        self.assertEqual(rec.shape, tuple())
        self.assertEqual(rec.strides, tuple())
        self.assertEqual(rec.dtype, reference.dtype)
        self.assertEqual(rec.alloc_size, reference.dtype.itemsize)
        self.assertIsNotNone(rec.gpu_data)
        self.assertNotEqual(rec.device_ctypes_pointer, ctypes.c_void_p(0))
        numba_type = numpy_support.from_dtype(reference.dtype)
        self.assertEqual(rec._numba_type_, numba_type)

    def test_device_record_interface(self):
        host_rec = self.hostz.copy()
        dev_rec = DeviceRecord(self.dtype)
        self._check_device_record(host_rec, dev_rec)

    def test_device_record_copy(self):
        host_rec = self.hostz.copy()
        dev_rec = DeviceRecord(self.dtype)
        dev_rec.copy_to_device(host_rec)
        # Copy back and check values are all zeros
        back = self.hostnz.copy()
        dev_rec.copy_to_host(back)
        np.testing.assert_equal(self.hostz, back)
        # Copy non-zero values to GPU and back and check values
        nonzero = self.hostnz.copy()
        dev_rec.copy_to_device(nonzero)
        back2 = self.hostz.copy()
        dev_rec.copy_to_host(back2)
        np.testing.assert_equal(back2, self.hostnz)

    def test_from_record_like(self):
        # Create record from host record
        host_rec = self.hostz.copy()
        dev_rec = from_record_like(host_rec)
        self._check_device_record(host_rec, dev_rec)
        # Create record from device record and check for distinct data
        dev_rec2 = from_record_like(dev_rec)
        self._check_device_record(dev_rec, dev_rec2)
        self.assertNotEqual(dev_rec.gpu_data, dev_rec2.gpu_data)

    def test_auto_device(self):
        # Create record from host record
        host_rec = self.hostnz.copy()
        dev_rec, new_gpu_obj = auto_device(host_rec)
        self._check_device_record(host_rec, dev_rec)
        self.assertTrue(new_gpu_obj)
        # Copy data back and check it is equal to auto_device arg
        back = self.hostz.copy()
        dev_rec.copy_to_host(back)
        np.testing.assert_equal(back, host_rec)
class TestCudaDeviceRecordWithRecord(TestCudaDeviceRecord):
    """
    Tests the DeviceRecord class with np.record host types
    """

    def setUp(self):
        # Deliberately skip the parent's setUp: reuse its machinery but
        # build the host data from np.recarray instead of np.zeros.
        CUDATestCase.setUp(self)
        self._create_data(np.recarray)
@skip_on_cudasim('Structured array attr access not supported in simulator')
class TestRecordDtypeWithStructArrays(CUDATestCase):
    '''
    Test operation of device arrays on structured arrays.
    '''

    def _createSampleArrays(self):
        # Device allocations for each record dtype under test; the
        # scalar samples are single-element arrays indexed down.
        self.sample1d = cuda.device_array(3, dtype=recordtype)
        self.samplerec1darr = cuda.device_array(1, dtype=recordwitharray)[0]
        self.samplerecmat = cuda.device_array(1, dtype=recwithmat)[0]

    def setUp(self):
        super().setUp()
        self._createSampleArrays()
        ary = self.sample1d
        for i in range(ary.size):
            v = i + 1
            ary[i]['a'] = v / 2
            ary[i]['b'] = v
            ary[i]['c'] = v * 1j
            ary[i]['d'] = str(v) * N_CHARS

    def test_structured_array1(self):
        # Values written in setUp must read back unchanged.
        ary = self.sample1d
        for i in range(self.sample1d.size):
            v = i + 1
            self.assertEqual(ary[i]['a'], v / 2)
            self.assertEqual(ary[i]['b'], v)
            self.assertEqual(ary[i]['c'], v * 1j)
            self.assertEqual(ary[i]['d'], str(v) * N_CHARS)

    def test_structured_array2(self):
        # Field and embedded-array element assignment on a device record.
        ary = self.samplerec1darr
        ary['g'] = 2
        ary['h'][0] = 3.0
        ary['h'][1] = 4.0
        self.assertEqual(ary['g'], 2)
        self.assertEqual(ary['h'][0], 3.0)
        self.assertEqual(ary['h'][1], 4.0)

    def test_structured_array3(self):
        # Whole-matrix field assignment on a device record.
        ary = self.samplerecmat
        mat = np.array([[5.0, 10.0, 15.0],
                        [20.0, 25.0, 30.0],
                        [35.0, 40.0, 45.0]],
                       dtype=np.float32).reshape(3, 3)
        ary['j'][:] = mat
        np.testing.assert_equal(ary['j'], mat)

    def test_structured_array4(self):
        # Nested-record field access on a device array.
        arr = np.zeros(1, dtype=recwithrecwithmat)
        d_arr = cuda.to_device(arr)
        d_arr[0]['y']['i'] = 1
        self.assertEqual(d_arr[0]['y']['i'], 1)
        d_arr[0]['y']['j'][0, 0] = 2.0
        self.assertEqual(d_arr[0]['y']['j'][0, 0], 2.0)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,235 @@
from ctypes import byref, c_int, c_void_p, sizeof
from numba.cuda.cudadrv.driver import (host_to_device, device_to_host, driver,
launch_kernel)
from numba.cuda.cudadrv import devices, drvapi, driver as _driver
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
# PTX for a kernel that stores each thread's x-index into an int32
# array parameter; pre-Fermi variant (PTX ISA 1.4, sm_10).
# NOTE: the string content is loaded by the driver at runtime and must
# not be altered.
ptx1 = '''
.version 1.4
.target sm_10, map_f64_to_f32
.entry _Z10helloworldPi (
.param .u64 __cudaparm__Z10helloworldPi_A)
{
.reg .u32 %r<3>;
.reg .u64 %rd<6>;
.loc 14 4 0
$LDWbegin__Z10helloworldPi:
.loc 14 6 0
cvt.s32.u16 %r1, %tid.x;
ld.param.u64 %rd1, [__cudaparm__Z10helloworldPi_A];
cvt.u64.u16 %rd2, %tid.x;
mul.lo.u64 %rd3, %rd2, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.s32 [%rd4+0], %r1;
.loc 14 7 0
exit;
$LDWend__Z10helloworldPi:
} // _Z10helloworldPi
'''
# Same kernel for Fermi and newer devices (PTX ISA 3.0, sm_20,
# 64-bit addressing).
ptx2 = '''
.version 3.0
.target sm_20
.address_size 64
.file 1 "/tmp/tmpxft_000012c7_00000000-9_testcuda.cpp3.i"
.file 2 "testcuda.cu"
.entry _Z10helloworldPi(
.param .u64 _Z10helloworldPi_param_0
)
{
.reg .s32 %r<3>;
.reg .s64 %rl<5>;
ld.param.u64 %rl1, [_Z10helloworldPi_param_0];
cvta.to.global.u64 %rl2, %rl1;
.loc 2 6 1
mov.u32 %r1, %tid.x;
mul.wide.u32 %rl3, %r1, 4;
add.s64 %rl4, %rl2, %rl3;
st.global.u32 [%rl4], %r1;
.loc 2 7 2
ret;
}
'''
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestCudaDriver(CUDATestCase):
    """Exercises the low-level driver wrapper: PTX module loading,
    raw kernel launches, stream objects and occupancy queries."""

    def setUp(self):
        super().setUp()
        self.assertTrue(len(devices.gpus) > 0)
        self.context = devices.get_context()
        device = self.context.device
        ccmajor, _ = device.compute_capability
        # Fermi (CC 2.x) and newer devices need the PTX 3.0 kernel.
        if ccmajor >= 2:
            self.ptx = ptx2
        else:
            self.ptx = ptx1

    def tearDown(self):
        super().tearDown()
        del self.context

    def test_cuda_driver_basic(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')
        array = (c_int * 100)()
        memory = self.context.memalloc(sizeof(array))
        host_to_device(memory, array, sizeof(array))
        ptr = memory.device_ctypes_pointer
        stream = 0
        # The NVIDIA binding uses its own pointer/stream wrapper types.
        if _driver.USE_NV_BINDING:
            ptr = c_void_p(int(ptr))
            stream = _driver.binding.CUstream(stream)
        launch_kernel(function.handle,  # Kernel
                      1, 1, 1,          # gx, gy, gz
                      100, 1, 1,        # bx, by, bz
                      0,                # dynamic shared mem
                      stream,           # stream
                      [ptr])            # arguments
        device_to_host(array, memory, sizeof(array))
        # The kernel writes each thread's index into the array.
        for i, v in enumerate(array):
            self.assertEqual(i, v)
        module.unload()

    def test_cuda_driver_stream_operations(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')
        array = (c_int * 100)()
        stream = self.context.create_stream()
        # auto_synchronize() syncs the stream on exit, so the
        # device-to-host copy is complete before the assertions.
        with stream.auto_synchronize():
            memory = self.context.memalloc(sizeof(array))
            host_to_device(memory, array, sizeof(array), stream=stream)
            ptr = memory.device_ctypes_pointer
            if _driver.USE_NV_BINDING:
                ptr = c_void_p(int(ptr))
            launch_kernel(function.handle,  # Kernel
                          1, 1, 1,          # gx, gy, gz
                          100, 1, 1,        # bx, by, bz
                          0,                # dynamic shared mem
                          stream.handle,    # stream
                          [ptr])            # arguments
            device_to_host(array, memory, sizeof(array), stream=stream)
        for i, v in enumerate(array):
            self.assertEqual(i, v)

    def test_cuda_driver_default_stream(self):
        # Test properties of the default stream
        ds = self.context.get_default_stream()
        self.assertIn("Default CUDA stream", repr(ds))
        self.assertEqual(0, int(ds))
        # bool(stream) is the check that is done in memcpy to decide if async
        # version should be used. So the default (0) stream should be true-ish
        # even though 0 is usually false-ish in Python.
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_legacy_default_stream(self):
        # Test properties of the legacy default stream
        ds = self.context.get_legacy_default_stream()
        self.assertIn("Legacy default CUDA stream", repr(ds))
        self.assertEqual(1, int(ds))
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_per_thread_default_stream(self):
        # Test properties of the per-thread default stream
        ds = self.context.get_per_thread_default_stream()
        self.assertIn("Per-thread default CUDA stream", repr(ds))
        self.assertEqual(2, int(ds))
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_stream(self):
        # Test properties of non-default streams
        s = self.context.create_stream()
        self.assertIn("CUDA stream", repr(s))
        self.assertNotIn("Default", repr(s))
        self.assertNotIn("External", repr(s))
        self.assertNotEqual(0, int(s))
        self.assertTrue(s)
        self.assertFalse(s.external)

    def test_cuda_driver_external_stream(self):
        # Test properties of a stream created from an external stream object.
        # We use the driver API directly to create a stream, to emulate an
        # external library creating a stream
        if _driver.USE_NV_BINDING:
            handle = driver.cuStreamCreate(0)
            ptr = int(handle)
        else:
            handle = drvapi.cu_stream()
            driver.cuStreamCreate(byref(handle), 0)
            ptr = handle.value
        s = self.context.create_external_stream(ptr)
        self.assertIn("External CUDA stream", repr(s))
        # Ensure neither "Default" nor "default"
        self.assertNotIn("efault", repr(s))
        self.assertEqual(ptr, int(s))
        self.assertTrue(s)
        self.assertTrue(s.external)

    def test_cuda_driver_occupancy(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')
        value = self.context.get_active_blocks_per_multiprocessor(function,
                                                                  128, 128)
        self.assertTrue(value > 0)

        def b2d(bs):
            # Identity block-size-to-dynamic-shared-memory function.
            return bs
        grid, block = self.context.get_max_potential_block_size(function, b2d,
                                                                128, 128)
        self.assertTrue(grid > 0)
        self.assertTrue(block > 0)
class TestDevice(CUDATestCase):
    def test_device_get_uuid(self):
        # A device UUID looks like:
        #
        #     GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643
        #
        # Build an RE matching that form and verify the returned UUID
        # against it. Device UUIDs may not conform to the version and
        # variant bits of the UUID specification (RFC 4122), so those
        # bits are not extracted or validated.
        hex_group = '[0-9a-f]{%d}'
        widths = (8, 4, 4, 4, 12)
        body = '-'.join(hex_group % w for w in widths)
        uuid_format = '^GPU-%s$' % body
        dev = devices.get_context().device
        self.assertRegex(dev.uuid, uuid_format)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,22 @@
from numba.cuda.testing import unittest
from numba.cuda.testing import skip_on_cudasim, skip_unless_conda_cudatoolkit
from numba.misc.findlib import find_lib
@skip_on_cudasim('Library detection unsupported in the simulator')
@skip_unless_conda_cudatoolkit
class TestLibraryDetection(unittest.TestCase):
    def test_detect(self):
        """
        This test is solely present to ensure that shipped cudatoolkits have
        additional core libraries in locations that Numba scans by default.
        PyCulib (and potentially others) rely on Numba's library finding
        capacity to find and subsequently load these libraries.
        """
        for libname in ('nvvm',):
            self.assertNotEqual(find_lib(libname), [])
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,193 @@
import ctypes
import numpy as np
from numba.cuda.cudadrv import driver, drvapi, devices
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('CUDA Memory API unsupported in the simulator')
class TestCudaMemory(ContextResettingTestCase):
    """Tests for device/host memory allocation APIs and pointer ownership."""

    def setUp(self):
        super().setUp()
        self.context = devices.get_context()

    def tearDown(self):
        # Drop our context reference before the base class resets the context.
        del self.context
        super(TestCudaMemory, self).tearDown()

    def _template(self, obj):
        # Shared checks: ``obj`` is recognised as device memory and exposes a
        # ctypes pointer of the class matching the active driver binding.
        self.assertTrue(driver.is_device_memory(obj))
        driver.require_device_memory(obj)
        if driver.USE_NV_BINDING:
            expected_class = driver.binding.CUdeviceptr
        else:
            expected_class = drvapi.cu_device_ptr
        self.assertTrue(isinstance(obj.device_ctypes_pointer,
                                   expected_class))

    def test_device_memory(self):
        devmem = self.context.memalloc(1024)
        self._template(devmem)

    def test_device_view(self):
        devmem = self.context.memalloc(1024)
        self._template(devmem.view(10))

    def test_host_alloc(self):
        devmem = self.context.memhostalloc(1024, mapped=True)
        self._template(devmem)

    def test_pinned_memory(self):
        ary = np.arange(10)
        devmem = self.context.mempin(ary, ary.ctypes.data,
                                     ary.size * ary.dtype.itemsize,
                                     mapped=True)
        self._template(devmem)

    def test_managed_memory(self):
        devmem = self.context.memallocmanaged(1024)
        self._template(devmem)

    def test_derived_pointer(self):
        # Use MemoryPointer.view to create derived pointer

        def handle_val(mem):
            # Normalise a pointer handle to an int across the two bindings.
            if driver.USE_NV_BINDING:
                return int(mem.handle)
            else:
                return mem.handle.value

        def check(m, offset):
            # create view
            v1 = m.view(offset)
            self.assertEqual(handle_val(v1.owner), handle_val(m))
            self.assertEqual(m.refct, 2)
            self.assertEqual(handle_val(v1) - offset, handle_val(v1.owner))
            # create a view of the view
            v2 = v1.view(offset)
            self.assertEqual(handle_val(v2.owner), handle_val(m))
            self.assertEqual(handle_val(v2.owner), handle_val(m))
            self.assertEqual(handle_val(v2) - offset * 2,
                             handle_val(v2.owner))
            self.assertEqual(m.refct, 3)
            # Deleting each view must decrement the owner's refcount.
            del v2
            self.assertEqual(m.refct, 2)
            del v1
            self.assertEqual(m.refct, 1)

        m = self.context.memalloc(1024)
        check(m=m, offset=0)
        check(m=m, offset=1)

    def test_user_extension(self):
        # User can use MemoryPointer to wrap externally defined pointers.
        # This test checks if the finalizer is invoked at the correct time
        fake_ptr = ctypes.c_void_p(0xdeadbeef)
        dtor_invoked = [0]

        def dtor():
            dtor_invoked[0] += 1

        # Ensure finalizer is called when pointer is deleted
        ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr,
                                   size=40, finalizer=dtor)
        self.assertEqual(dtor_invoked[0], 0)
        del ptr
        self.assertEqual(dtor_invoked[0], 1)

        # Ensure removing derived pointer doesn't call finalizer
        ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr,
                                   size=40, finalizer=dtor)
        owned = ptr.own()
        del owned
        self.assertEqual(dtor_invoked[0], 1)
        del ptr
        self.assertEqual(dtor_invoked[0], 2)
class TestCudaMemoryFunctions(ContextResettingTestCase):
    """Round-trip tests for the low-level memcpy/memset driver wrappers."""

    def setUp(self):
        super().setUp()
        self.context = devices.get_context()

    def tearDown(self):
        del self.context
        super(TestCudaMemoryFunctions, self).tearDown()

    def test_memcpy(self):
        """Host -> device -> host copy preserves the data."""
        hstary = np.arange(100, dtype=np.uint32)
        hstary2 = np.arange(100, dtype=np.uint32)
        sz = hstary.size * hstary.dtype.itemsize
        devary = self.context.memalloc(sz)
        driver.host_to_device(devary, hstary, sz)
        driver.device_to_host(hstary2, devary, sz)
        self.assertTrue(np.all(hstary == hstary2))

    def test_memset(self):
        """device_memset fills every byte of the allocation with the value."""
        dtype = np.dtype('uint32')
        n = 10
        # Derive the size from ``n`` rather than repeating the literal 10, so
        # the buffer length and the expected array stay in sync if ``n`` is
        # ever changed.
        sz = dtype.itemsize * n
        devary = self.context.memalloc(sz)
        driver.device_memset(devary, 0xab, sz)

        hstary = np.empty(n, dtype=dtype)
        driver.device_to_host(hstary, devary, sz)

        hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32'))
        self.assertTrue(np.all(hstary == hstary2))

    def test_d2d(self):
        """Device-to-device copy moves data between two allocations."""
        hst = np.arange(100, dtype=np.uint32)
        hst2 = np.empty_like(hst)
        sz = hst.size * hst.dtype.itemsize
        dev1 = self.context.memalloc(sz)
        dev2 = self.context.memalloc(sz)
        driver.host_to_device(dev1, hst, sz)
        driver.device_to_device(dev2, dev1, sz)
        driver.device_to_host(hst2, dev2, sz)
        self.assertTrue(np.all(hst == hst2))
@skip_on_cudasim('CUDA Memory API unsupported in the simulator')
class TestMVExtent(ContextResettingTestCase):
    """Checks host_memory_extents / host_memory_size over host-side objects."""

    def test_c_contiguous_array(self):
        arr = np.arange(100)
        nbytes = arr.dtype.itemsize * arr.size
        start, _end = driver.host_memory_extents(arr)
        self.assertTrue(arr.ctypes.data == start)
        self.assertTrue(nbytes == driver.host_memory_size(arr))

    def test_f_contiguous_array(self):
        arr = np.asfortranarray(np.arange(100).reshape(2, 50))
        nbytes = arr.dtype.itemsize * np.prod(arr.shape)
        start, _end = driver.host_memory_extents(arr)
        self.assertTrue(arr.ctypes.data == start)
        self.assertTrue(nbytes == driver.host_memory_size(arr))

    def test_single_element_array(self):
        arr = np.asarray(np.uint32(1234))
        start, _end = driver.host_memory_extents(arr)
        self.assertTrue(arr.ctypes.data == start)
        self.assertTrue(arr.dtype.itemsize == driver.host_memory_size(arr))

    def test_ctypes_struct(self):
        class mystruct(ctypes.Structure):
            _fields_ = [('x', ctypes.c_int), ('y', ctypes.c_int)]

        data = mystruct(x=123, y=432)
        self.assertTrue(ctypes.sizeof(data) == driver.host_memory_size(data))

    def test_ctypes_double(self):
        data = ctypes.c_double(1.234)
        self.assertTrue(ctypes.sizeof(data) == driver.host_memory_size(data))
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,547 @@
import itertools
import numpy as np
from numba.cuda.cudadrv import devicearray
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
class TestCudaNDArray(CUDATestCase):
    """Tests for DeviceNDArray construction, copies, views, transposition,
    contiguity flags and typing."""

    def test_device_array_interface(self):
        # Freshly allocated, host-copied, and 0-d device arrays must all
        # pass the CUDA ndarray interface verification.
        dary = cuda.device_array(shape=100)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.empty(100)
        dary = cuda.to_device(ary)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.asarray(1.234)
        dary = cuda.to_device(ary)
        self.assertEqual(dary.ndim, 0)
        devicearray.verify_cuda_ndarray_interface(dary)

    def test_device_array_from_readonly(self):
        ary = np.arange(100, dtype=np.float32)
        # Make the array readonly
        ary.flags.writeable = False
        self.assertFalse(ary.flags.writeable)
        # Ensure that we can copy the readonly array
        dary = cuda.to_device(ary)
        retr = dary.copy_to_host()
        np.testing.assert_array_equal(retr, ary)

    def test_devicearray_dtype(self):
        dary = cuda.device_array(shape=(100,), dtype="f4")
        self.assertEqual(dary.dtype, np.dtype("f4"))

    def test_devicearray_no_copy(self):
        # copy=False must be accepted without error.
        array = np.arange(100, dtype=np.float32)
        cuda.to_device(array, copy=False)

    def test_devicearray_shape(self):
        ary = np.arange(2 * 3 * 4).reshape(2, 3, 4)
        dary = cuda.to_device(ary)
        self.assertEqual(ary.shape, dary.shape)
        self.assertEqual(ary.shape[1:], dary.shape[1:])

    def test_devicearray(self):
        # Round-trip: data copied to the device survives zeroing the host.
        array = np.arange(100, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        array[:] = 0
        gpumem.copy_to_host(array)
        np.testing.assert_array_equal(array, original)

    def test_stream_bind(self):
        stream = cuda.stream()
        with stream.auto_synchronize():
            arr = cuda.device_array(
                (3, 3),
                dtype=np.float64,
                stream=stream)
            self.assertEqual(arr.bind(stream).stream, stream)
            self.assertEqual(arr.stream, stream)

    def test_len_1d(self):
        ary = np.empty((3,))
        dary = cuda.device_array(3)
        self.assertEqual(len(ary), len(dary))

    def test_len_2d(self):
        ary = np.empty((3, 5))
        dary = cuda.device_array((3, 5))
        self.assertEqual(len(ary), len(dary))

    def test_len_3d(self):
        ary = np.empty((3, 5, 7))
        dary = cuda.device_array((3, 5, 7))
        self.assertEqual(len(ary), len(dary))

    def test_devicearray_partition(self):
        # split() halves can be copied back independently and reassemble
        # the original data.
        N = 100
        array = np.arange(N, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        left, right = gpumem.split(N // 2)

        array[:] = 0

        self.assertTrue(np.all(array == 0))

        right.copy_to_host(array[N // 2:])
        left.copy_to_host(array[:N // 2])

        self.assertTrue(np.all(array == original))

    def test_devicearray_replace(self):
        # ``to=`` reuses an existing device allocation as the destination.
        N = 100
        array = np.arange(N, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        cuda.to_device(array * 2, to=gpumem)
        gpumem.copy_to_host(array)
        np.testing.assert_array_equal(array, original * 2)

    @skip_on_cudasim('This works in the simulator')
    def test_devicearray_transpose_wrongdim(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4, 1))

        with self.assertRaises(NotImplementedError) as e:
            np.transpose(gpumem)

        self.assertEqual(
            "transposing a non-2D DeviceNDArray isn't supported",
            str(e.exception))

    def test_devicearray_transpose_identity(self):
        # any-shape identities should work
        original = np.array(np.arange(24)).reshape(3, 4, 2)
        array = np.transpose(cuda.to_device(original),
                             axes=(0, 1, 2)).copy_to_host()
        self.assertTrue(np.all(array == original))

    def test_devicearray_transpose_duplicatedaxis(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4))

        with self.assertRaises(ValueError) as e:
            np.transpose(gpumem, axes=(0, 0))

        self.assertIn(
            str(e.exception),
            container=[
                'invalid axes list (0, 0)',  # GPU
                'repeated axis in transpose',  # sim
            ])

    def test_devicearray_transpose_wrongaxis(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4))

        with self.assertRaises(ValueError) as e:
            np.transpose(gpumem, axes=(0, 2))

        self.assertIn(
            str(e.exception),
            container=[
                'invalid axes list (0, 2)',  # GPU
                'invalid axis for this array',
                'axis 2 is out of bounds for array of dimension 2',  # sim
            ])

    def test_devicearray_view_ok(self):
        # Views to same-or-larger itemsizes on a contiguous array.
        original = np.array(np.arange(12), dtype="i2").reshape(3, 4)
        array = cuda.to_device(original)
        for dtype in ("i4", "u4", "i8", "f8"):
            with self.subTest(dtype=dtype):
                np.testing.assert_array_equal(
                    array.view(dtype).copy_to_host(),
                    original.view(dtype)
                )

    def test_devicearray_view_ok_not_c_contig(self):
        # A same-itemsize view is valid even on a non-contiguous array.
        original = np.array(np.arange(32), dtype="i2").reshape(4, 8)
        array = cuda.to_device(original)[:, ::2]
        original = original[:, ::2]
        np.testing.assert_array_equal(
            array.view("u2").copy_to_host(),
            original.view("u2")
        )

    def test_devicearray_view_bad_not_c_contig(self):
        original = np.array(np.arange(32), dtype="i2").reshape(4, 8)
        array = cuda.to_device(original)[:, ::2]
        with self.assertRaises(ValueError) as e:
            array.view("i4")

        msg = str(e.exception)
        self.assertIn('To change to a dtype of a different size,', msg)

        # The message changed across NumPy versions; accept either wording.
        contiguous_pre_np123 = 'the array must be C-contiguous' in msg
        contiguous_post_np123 = 'the last axis must be contiguous' in msg
        self.assertTrue(contiguous_pre_np123 or contiguous_post_np123,
                        'Expected message to mention contiguity')

    def test_devicearray_view_bad_itemsize(self):
        original = np.array(np.arange(12), dtype="i2").reshape(4, 3)
        array = cuda.to_device(original)
        with self.assertRaises(ValueError) as e:
            array.view("i4")
        self.assertEqual(
            "When changing to a larger dtype,"
            " its size must be a divisor of the total size in bytes"
            " of the last axis of the array.",
            str(e.exception))

    def test_devicearray_transpose_ok(self):
        original = np.array(np.arange(12)).reshape(3, 4)
        array = np.transpose(cuda.to_device(original)).copy_to_host()
        self.assertTrue(np.all(array == original.T))

    def test_devicearray_transpose_T(self):
        original = np.array(np.arange(12)).reshape(3, 4)
        array = cuda.to_device(original).T.copy_to_host()
        self.assertTrue(np.all(array == original.T))

    def test_devicearray_contiguous_slice(self):
        # memcpys are dumb ranges of bytes, so trying to
        # copy to a non-contiguous range shouldn't work!
        a = np.arange(25).reshape(5, 5, order='F')
        s = np.full(fill_value=5, shape=(5,))
        d = cuda.to_device(a)
        a[2] = s

        # d is in F-order (not C-order), so d[2] is not contiguous
        # (40-byte strides). This means we can't memcpy to it!
        with self.assertRaises(ValueError) as e:
            d[2].copy_to_device(s)
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))

        # if d[2].copy_to_device(s), then this would pass:
        # self.assertTrue((a == d.copy_to_host()).all())

    def _test_devicearray_contiguous_host_copy(self, a_c, a_f):
        """
        Checks host->device memcpys
        """
        self.assertTrue(a_c.flags.c_contiguous)
        self.assertTrue(a_f.flags.f_contiguous)

        for original, copy in [
            (a_f, a_f),
            (a_f, a_c),
            (a_c, a_f),
            (a_c, a_c),
        ]:
            msg = '%s => %s' % (
                'C' if original.flags.c_contiguous else 'F',
                'C' if copy.flags.c_contiguous else 'F',
            )

            d = cuda.to_device(original)
            d.copy_to_device(copy)
            self.assertTrue(np.all(d.copy_to_host() == a_c), msg=msg)
            self.assertTrue(np.all(d.copy_to_host() == a_f), msg=msg)

    def test_devicearray_contiguous_copy_host_3d(self):
        a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5)
        a_f = np.array(a_c, order='F')
        self._test_devicearray_contiguous_host_copy(a_c, a_f)

    def test_devicearray_contiguous_copy_host_1d(self):
        a_c = np.arange(5)
        a_f = np.array(a_c, order='F')
        self._test_devicearray_contiguous_host_copy(a_c, a_f)

    def test_devicearray_contiguous_copy_device(self):
        # Device-to-device copies require matching strides.
        a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5)
        a_f = np.array(a_c, order='F')
        self.assertTrue(a_c.flags.c_contiguous)
        self.assertTrue(a_f.flags.f_contiguous)

        d = cuda.to_device(a_c)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(a_f))
        self.assertEqual(
            "incompatible strides: {} vs. {}".format(a_c.strides, a_f.strides),
            str(e.exception))

        d.copy_to_device(cuda.to_device(a_c))
        self.assertTrue(np.all(d.copy_to_host() == a_c))

        d = cuda.to_device(a_f)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(a_c))
        self.assertEqual(
            "incompatible strides: {} vs. {}".format(a_f.strides, a_c.strides),
            str(e.exception))

        d.copy_to_device(cuda.to_device(a_f))
        self.assertTrue(np.all(d.copy_to_host() == a_f))

    def test_devicearray_broadcast_host_copy(self):
        # Copies to/from arrays broadcast along each axis in turn.
        broadsize = 4
        coreshape = (2, 3)
        coresize = np.prod(coreshape)
        core_c = np.arange(coresize).reshape(coreshape, order='C')
        core_f = np.arange(coresize).reshape(coreshape, order='F')
        for dim in range(len(coreshape)):
            newindex = (slice(None),) * dim + (np.newaxis,)
            broadshape = coreshape[:dim] + (broadsize,) + coreshape[dim:]
            broad_c = np.broadcast_to(core_c[newindex], broadshape)
            broad_f = np.broadcast_to(core_f[newindex], broadshape)
            dbroad_c = cuda.to_device(broad_c)
            dbroad_f = cuda.to_device(broad_f)
            np.testing.assert_array_equal(dbroad_c.copy_to_host(), broad_c)
            np.testing.assert_array_equal(dbroad_f.copy_to_host(), broad_f)
            # Also test copying across different core orderings
            dbroad_c.copy_to_device(broad_f)
            dbroad_f.copy_to_device(broad_c)
            np.testing.assert_array_equal(dbroad_c.copy_to_host(), broad_f)
            np.testing.assert_array_equal(dbroad_f.copy_to_host(), broad_c)

    def test_devicearray_contiguous_host_strided(self):
        # A strided host source is acceptable for a host->device copy.
        a_c = np.arange(10)
        d = cuda.to_device(a_c)
        arr = np.arange(20)[::2]
        d.copy_to_device(arr)
        np.testing.assert_array_equal(d.copy_to_host(), arr)

    def test_devicearray_contiguous_device_strided(self):
        # A strided device source is rejected for a device->device copy.
        d = cuda.to_device(np.arange(20))
        arr = np.arange(20)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(arr)[::2])
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))

    @skip_on_cudasim('DeviceNDArray class not present in simulator')
    def test_devicearray_relaxed_strides(self):
        # From the reproducer in Issue #6824.

        # Construct a device array that is contiguous even though
        # the strides for the first axis (800) are not equal to
        # the strides * size (10 * 8 = 80) for the previous axis,
        # because the first axis size is 1.
        arr = devicearray.DeviceNDArray((1, 10), (800, 8), np.float64)

        # Ensure we still believe the array to be contiguous because
        # strides checking is relaxed.
        self.assertTrue(arr.flags['C_CONTIGUOUS'])
        self.assertTrue(arr.flags['F_CONTIGUOUS'])

    def test_c_f_contiguity_matches_numpy(self):
        # From the reproducer in Issue #4943.
        shapes = ((1, 4), (4, 1))
        orders = ('C', 'F')

        for shape, order in itertools.product(shapes, orders):
            arr = np.ndarray(shape, order=order)
            d_arr = cuda.to_device(arr)
            self.assertEqual(arr.flags['C_CONTIGUOUS'],
                             d_arr.flags['C_CONTIGUOUS'])
            self.assertEqual(arr.flags['F_CONTIGUOUS'],
                             d_arr.flags['F_CONTIGUOUS'])

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_simple_c(self):
        # C-order 1D array
        a = np.zeros(10, order='C')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_simple_f(self):
        # F-order array that is also C layout.
        a = np.zeros(10, order='F')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_2d_c(self):
        # C-order 2D array
        a = np.zeros((2, 10), order='C')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_2d_f(self):
        # F-order array that can only be F layout
        a = np.zeros((2, 10), order='F')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'F')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_noncontig_slice_c(self):
        # Non-contiguous slice of C-order array
        a = np.zeros((5, 5), order='C')
        d = cuda.to_device(a)[:,2]
        self.assertEqual(d._numba_type_.layout, 'A')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_noncontig_slice_f(self):
        # Non-contiguous slice of F-order array
        a = np.zeros((5, 5), order='F')
        d = cuda.to_device(a)[2,:]
        self.assertEqual(d._numba_type_.layout, 'A')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_contig_slice_c(self):
        # Contiguous slice of C-order array
        a = np.zeros((5, 5), order='C')
        d = cuda.to_device(a)[2,:]
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_contig_slice_f(self):
        # Contiguous slice of F-order array - is both C- and F-contiguous, so
        # types as 'C' layout
        a = np.zeros((5, 5), order='F')
        d = cuda.to_device(a)[:,2]
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_broadcasted(self):
        # Broadcasted array, similar to that used for passing scalars to ufuncs
        a = np.broadcast_to(np.array([1]), (10,))
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'A')

    def test_bug6697(self):
        # np.asarray on a device array must preserve the dtype.
        ary = np.arange(10, dtype=np.int16)
        dary = cuda.to_device(ary)
        got = np.asarray(dary)
        self.assertEqual(got.dtype, dary.dtype)

    @skip_on_cudasim('DeviceNDArray class not present in simulator')
    def test_issue_8477(self):
        # Ensure that we can copy a zero-length device array to a zero-length
        # host array when the strides of the device and host arrays differ -
        # this should be possible because the strides are irrelevant when the
        # length is zero. For more info see
        # https://github.com/numba/numba/issues/8477.

        # Create a device array with shape (0,) and strides (8,)
        dev_array = devicearray.DeviceNDArray(shape=(0,), strides=(8,),
                                              dtype=np.int8)

        # Create a host array with shape (0,) and strides (0,)
        host_array = np.ndarray(shape=(0,), strides=(0,), dtype=np.int8)

        # Sanity check for this test - ensure our destination has the strides
        # we expect, because strides can be ignored in some cases by the
        # ndarray constructor - checking here ensures that we haven't failed to
        # account for unexpected behaviour across different versions of NumPy
        self.assertEqual(host_array.strides, (0,))

        # Ensure that the copy succeeds in both directions
        dev_array.copy_to_host(host_array)
        dev_array.copy_to_device(host_array)

        # Ensure that a device-to-device copy also succeeds when the strides
        # differ - one way of doing this is to copy the host array across and
        # use that for copies in both directions.
        dev_array_from_host = cuda.to_device(host_array)
        self.assertEqual(dev_array_from_host.shape, (0,))
        self.assertEqual(dev_array_from_host.strides, (0,))

        dev_array.copy_to_device(dev_array_from_host)
        dev_array_from_host.copy_to_device(dev_array)
class TestRecarray(CUDATestCase):
    def test_recarray(self):
        """Record arrays can be passed to kernels (regression for #4111)."""
        a = np.recarray((16,), dtype=[
            ("value1", np.int64),
            ("value2", np.float64),
        ])
        a.value1 = np.arange(a.size, dtype=np.int64)
        a.value2 = np.arange(a.size, dtype=np.float64) / 100

        def copy_fields(x, out1, out2):
            # One thread per record; copy each field into its own output.
            i = cuda.grid(1)
            if i < x.size:
                out1[i] = x.value1[i]
                out2[i] = x.value2[i]

        got1 = np.zeros_like(a.value1)
        got2 = np.zeros_like(a.value2)
        cuda.jit(copy_fields)[1, a.size](a, got1, got2)

        np.testing.assert_array_equal(a.value1, got1)
        np.testing.assert_array_equal(a.value2, got2)
class TestCoreContiguous(CUDATestCase):
    def _test_against_array_core(self, view):
        # ``is_contiguous`` must agree with the C-contiguity flag of the
        # view's core as computed by ``array_core``.
        expected = devicearray.array_core(view).flags['C_CONTIGUOUS']
        self.assertEqual(devicearray.is_contiguous(view), expected)

    def test_device_array_like_1d(self):
        self._test_against_array_core(cuda.device_array(10, order='C'))

    def test_device_array_like_2d(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='C'))

    def test_device_array_like_2d_transpose(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='C').T)

    def test_device_array_like_3d(self):
        self._test_against_array_core(
            cuda.device_array((10, 12, 14), order='C'))

    def test_device_array_like_1d_f(self):
        self._test_against_array_core(cuda.device_array(10, order='F'))

    def test_device_array_like_2d_f(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='F'))

    def test_device_array_like_2d_f_transpose(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='F').T)

    def test_device_array_like_3d_f(self):
        self._test_against_array_core(
            cuda.device_array((10, 12, 14), order='F'))

    def test_1d_view(self):
        self._test_against_array_core(np.zeros(10)[::2])

    def test_1d_view_f(self):
        self._test_against_array_core(np.zeros(10, order='F')[::2])

    def test_2d_view(self):
        self._test_against_array_core(np.zeros((10, 12))[::2, ::2])

    def test_2d_view_f(self):
        self._test_against_array_core(np.zeros((10, 12), order='F')[::2, ::2])
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,249 @@
from contextlib import contextmanager
import numpy as np
from numba import cuda
from numba.cuda.testing import (unittest, skip_on_cudasim,
skip_if_external_memmgr, CUDATestCase)
from numba.tests.support import captured_stderr
from numba.core import config
@skip_on_cudasim('not supported on CUDASIM')
@skip_if_external_memmgr('Deallocation specific to Numba memory management')
class TestDeallocation(CUDATestCase):
    """Checks the pending-deallocation list flushes when it reaches its
    count and byte thresholds."""

    def test_max_pending_count(self):
        # get deallocation manager and flush it
        deallocs = cuda.current_context().memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        # deallocate to maximum count
        for i in range(config.CUDA_DEALLOCS_COUNT):
            cuda.to_device(np.arange(1))
            self.assertEqual(len(deallocs), i + 1)
        # one more to trigger .clear()
        cuda.to_device(np.arange(1))
        self.assertEqual(len(deallocs), 0)

    def test_max_pending_bytes(self):
        # get deallocation manager and flush it
        ctx = cuda.current_context()
        deallocs = ctx.memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        mi = ctx.get_memory_info()

        max_pending = 10**6  # 1MB
        old_ratio = config.CUDA_DEALLOCS_RATIO
        try:
            # change to a smaller ratio
            config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
            # due to round off error (floor is used in calculating
            # _max_pending_bytes) it can be off by 1.
            self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending,
                                   delta=1)

            # allocate half the max size
            # this will not trigger deallocation
            cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
            self.assertEqual(len(deallocs), 1)

            # allocate another remaining
            # this will not trigger deallocation
            cuda.to_device(np.ones(deallocs._max_pending_bytes -
                                   deallocs._size, dtype=np.int8))
            self.assertEqual(len(deallocs), 2)

            # another byte to trigger .clear()
            cuda.to_device(np.ones(1, dtype=np.int8))
            self.assertEqual(len(deallocs), 0)
        finally:
            # restore old ratio
            config.CUDA_DEALLOCS_RATIO = old_ratio
@skip_on_cudasim("defer_cleanup has no effect in CUDASIM")
@skip_if_external_memmgr('Deallocation specific to Numba memory management')
class TestDeferCleanup(CUDATestCase):
def test_basic(self):
harr = np.arange(5)
darr1 = cuda.to_device(harr)
deallocs = cuda.current_context().memory_manager.deallocations
deallocs.clear()
self.assertEqual(len(deallocs), 0)
with cuda.defer_cleanup():
darr2 = cuda.to_device(harr)
del darr1
self.assertEqual(len(deallocs), 1)
del darr2
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 0)
def test_nested(self):
harr = np.arange(5)
darr1 = cuda.to_device(harr)
deallocs = cuda.current_context().memory_manager.deallocations
deallocs.clear()
self.assertEqual(len(deallocs), 0)
with cuda.defer_cleanup():
with cuda.defer_cleanup():
darr2 = cuda.to_device(harr)
del darr1
self.assertEqual(len(deallocs), 1)
del darr2
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 0)
def test_exception(self):
harr = np.arange(5)
darr1 = cuda.to_device(harr)
deallocs = cuda.current_context().memory_manager.deallocations
deallocs.clear()
self.assertEqual(len(deallocs), 0)
class CustomError(Exception):
pass
with self.assertRaises(CustomError):
with cuda.defer_cleanup():
darr2 = cuda.to_device(harr)
del darr2
self.assertEqual(len(deallocs), 1)
deallocs.clear()
self.assertEqual(len(deallocs), 1)
raise CustomError
deallocs.clear()
self.assertEqual(len(deallocs), 0)
del darr1
self.assertEqual(len(deallocs), 1)
deallocs.clear()
self.assertEqual(len(deallocs), 0)
class TestDeferCleanupAvail(CUDATestCase):
    def test_context_manager(self):
        # Smoke test: the API must exist and be usable as a context manager.
        ctx_mgr = cuda.defer_cleanup()
        with ctx_mgr:
            pass
@skip_on_cudasim('not supported on CUDASIM')
class TestDel(CUDATestCase):
    """
    Ensure resources are deleted properly without ignored exception.
    """
    @contextmanager
    def check_ignored_exception(self, ctx):
        # Exceptions raised during finalization are printed to stderr rather
        # than propagated; fail the test if anything was written there.
        with captured_stderr() as cap:
            yield
            ctx.deallocations.clear()
        self.assertFalse(cap.getvalue())

    def test_stream(self):
        ctx = cuda.current_context()
        stream = ctx.create_stream()
        with self.check_ignored_exception(ctx):
            del stream

    def test_event(self):
        ctx = cuda.current_context()
        event = ctx.create_event()
        with self.check_ignored_exception(ctx):
            del event

    def test_pinned_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memhostalloc(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_mapped_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memhostalloc(32, mapped=True)
        with self.check_ignored_exception(ctx):
            del mem

    def test_device_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memalloc(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_managed_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_pinned_contextmanager(self):
        # Check that temporarily pinned memory is unregistered immediately,
        # such that it can be re-pinned at any time
        class PinnedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.pinned(arr):
                pass
            with cuda.pinned(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.pinned(arr):
                    pass
                with cuda.pinned(arr):
                    pass
            # Should also work when breaking out of the block due to an
            # exception
            try:
                with cuda.pinned(arr):
                    raise PinnedException
            except PinnedException:
                with cuda.pinned(arr):
                    pass

    def test_mapped_contextmanager(self):
        # Check that temporarily mapped memory is unregistered immediately,
        # such that it can be re-mapped at any time
        class MappedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.mapped(arr):
                pass
            with cuda.mapped(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.mapped(arr):
                    pass
                with cuda.mapped(arr):
                    pass
            # Should also work when breaking out of the block due to an
            # exception
            try:
                with cuda.mapped(arr):
                    raise MappedException
            except MappedException:
                with cuda.mapped(arr):
                    pass
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,81 @@
import os
import sys
import subprocess
import threading
from numba import cuda
from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
skip_under_cuda_memcheck)
from numba.tests.support import captured_stdout
class TestCudaDetect(CUDATestCase):
    def test_cuda_detect(self):
        """Smoke-test cuda.detect() and check its report mentions devices."""
        with captured_stdout() as out:
            cuda.detect()
        report = out.getvalue()
        self.assertIn('Found', report)
        self.assertIn('CUDA devices', report)
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
class TestCUDAFindLibs(CUDATestCase):
    """Tests for CUDA library path discovery driven by environment variables."""

    def run_cmd(self, cmdline, env):
        """Run ``cmdline`` with environment ``env`` and return decoded
        (stdout, stderr); the process is killed if it runs over 5 minutes."""
        popen = subprocess.Popen(cmdline,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 env=env)
        # finish in 5 minutes or kill it
        timeout = threading.Timer(5 * 60., popen.kill)
        try:
            timeout.start()
            out, err = popen.communicate()
            # the process should exit with an error
            return out.decode(), err.decode()
        finally:
            timeout.cancel()
        # NOTE: an unreachable ``return None, None`` that followed the
        # try/finally in the original has been removed - the try block
        # always returns or raises.

    def run_test_in_separate_process(self, envvar, envvar_value):
        """Launch a trivial CUDA kernel in a subprocess with ``envvar`` set
        to ``envvar_value``; returns the subprocess (stdout, stderr)."""
        env_copy = os.environ.copy()
        env_copy[envvar] = str(envvar_value)
        code = """if 1:
            from numba import cuda

            @cuda.jit('(int64,)')
            def kernel(x):
                pass

            kernel(1,)
            """
        cmdline = [sys.executable, "-c", code]
        return self.run_cmd(cmdline, env_copy)

    @skip_on_cudasim('Simulator does not hit device library search code path')
    @unittest.skipIf(not sys.platform.startswith('linux'), "linux only")
    def test_cuda_find_lib_errors(self):
        """
        This tests that the find_libs works as expected in the case of an
        environment variable being used to set the path.
        """
        # one of these is likely to exist on linux, it's also unlikely that
        # someone has extracted the contents of libdevice into here!
        locs = ['lib', 'lib64']

        looking_for = None
        for loc in locs:
            candidate = os.path.join(os.path.sep, loc)
            if os.path.exists(candidate):
                # Fix: only remember a candidate that actually exists. The
                # original assigned before the existence check, so the
                # "is not None" guard below could never skip the test even
                # when no candidate path was present.
                looking_for = candidate
                break

        # This is the testing part, the test will only run if there's a valid
        # path in which to look
        if looking_for is not None:
            out, err = self.run_test_in_separate_process("NUMBA_CUDA_DRIVER",
                                                         looking_for)
            self.assertTrue(out is not None)
            self.assertTrue(err is not None)
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,192 @@
import ctypes
import numpy as np
import weakref
from numba import cuda
from numba.core import config
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
from numba.tests.support import linux_only
# The dummy EMM plugin classes are only defined when not running on the
# simulator, since they subclass real driver memory-manager machinery.
if not config.ENABLE_CUDASIM:
    class DeviceOnlyEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        """
        Dummy EMM Plugin implementation for testing. It memorises which plugin
        API methods have been called so that the tests can check that Numba
        called into the plugin as expected.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

            # For tracking our dummy allocations
            self.allocations = {}
            self.count = 0

            # For tracking which methods have been called
            self.initialized = False
            self.memalloc_called = False
            self.reset_called = False
            self.get_memory_info_called = False
            self.get_ipc_handle_called = False

        def memalloc(self, size):
            # We maintain a list of allocations and keep track of them, so that
            # we can test that the finalizers of objects returned by memalloc
            # get called.

            # Numba should have initialized the memory manager when preparing
            # the context for use, prior to any memalloc call.
            if not self.initialized:
                raise RuntimeError("memalloc called before initialize")
            self.memalloc_called = True

            # Create an allocation and record it
            self.count += 1
            alloc_count = self.count
            self.allocations[alloc_count] = size

            # The finalizer deletes the record from our internal dict of
            # allocations.
            finalizer_allocs = self.allocations

            def finalizer():
                del finalizer_allocs[alloc_count]

            # We use an AutoFreePointer so that the finalizer will be run when
            # the reference count drops to zero.
            ctx = weakref.proxy(self.context)
            # The "pointer" is just the allocation count - it is never
            # dereferenced as a real device address by these tests.
            ptr = ctypes.c_void_p(alloc_count)
            return cuda.cudadrv.driver.AutoFreePointer(ctx, ptr, size,
                                                       finalizer=finalizer)

        def initialize(self):
            # No special initialization needed.
            self.initialized = True

        def reset(self):
            # We remove all allocations on reset, just as a real EMM Plugin
            # would do. Note that our finalizers in memalloc don't check
            # whether the allocations are still alive, so running them after
            # reset will detect any allocations that are floating around at
            # exit time; however, the atexit finalizer for weakref will only
            # print a traceback, not terminate the interpreter abnormally.
            self.reset_called = True

        def get_memory_info(self):
            # Return some dummy memory information
            self.get_memory_info_called = True
            return cuda.MemoryInfo(free=32, total=64)

        def get_ipc_handle(self, memory):
            # The dummy IPC handle is only a string, so it is important that
            # the tests don't try to do too much with it (e.g. open / close
            # it).
            self.get_ipc_handle_called = True
            return "Dummy IPC handle for alloc %s" % memory.device_pointer.value

        @property
        def interface_version(self):
            # The expected version for an EMM Plugin.
            return 1

    class BadVersionEMMPlugin(DeviceOnlyEMMPlugin):
        """A plugin that claims to implement a different interface version"""

        @property
        def interface_version(self):
            return 2
@skip_on_cudasim('EMM Plugins not supported on CUDA simulator')
class TestDeviceOnlyEMMPlugin(CUDATestCase):
    """
    Tests that the API of an EMM Plugin that implements device allocations
    only is used correctly by Numba.
    """

    def setUp(self):
        super().setUp()
        # Always start afresh with a new context and memory manager
        cuda.close()
        cuda.set_memory_manager(DeviceOnlyEMMPlugin)

    def tearDown(self):
        super().tearDown()
        # Unset the memory manager for subsequent tests
        cuda.close()
        cuda.cudadrv.driver._memory_manager = None

    def test_memalloc(self):
        mgr = cuda.current_context().memory_manager

        # Allocate an array and check that memalloc was called with the
        # correct size.
        arr_1 = np.arange(10)
        d_arr_1 = cuda.device_array_like(arr_1)
        self.assertTrue(mgr.memalloc_called)
        self.assertEqual(mgr.count, 1)
        self.assertEqual(mgr.allocations[1], arr_1.nbytes)

        # Allocate again, with a different size, and check that it is also
        # correct.
        arr_2 = np.arange(5)
        d_arr_2 = cuda.device_array_like(arr_2)
        self.assertEqual(mgr.count, 2)
        self.assertEqual(mgr.allocations[2], arr_2.nbytes)

        # Remove the first array, and check that our finalizer was called for
        # the first array only.
        del d_arr_1
        self.assertNotIn(1, mgr.allocations)
        self.assertIn(2, mgr.allocations)

        # Remove the second array and check that its finalizer was also
        # called.
        del d_arr_2
        self.assertNotIn(2, mgr.allocations)

    def test_initialized_in_context(self):
        # If we have a CUDA context, it should already have initialized its
        # memory manager.
        self.assertTrue(cuda.current_context().memory_manager.initialized)

    def test_reset(self):
        ctx = cuda.current_context()
        ctx.reset()
        self.assertTrue(ctx.memory_manager.reset_called)

    def test_get_memory_info(self):
        ctx = cuda.current_context()
        meminfo = ctx.get_memory_info()
        self.assertTrue(ctx.memory_manager.get_memory_info_called)
        # These are the dummy values hard-coded in the plugin above.
        self.assertEqual(meminfo.free, 32)
        self.assertEqual(meminfo.total, 64)

    @linux_only
    def test_get_ipc_handle(self):
        # We don't attempt to close the IPC handle in this test because Numba
        # will be expecting a real IpcHandle object to have been returned from
        # get_ipc_handle, and it would cause problems to do so.
        arr = np.arange(2)
        d_arr = cuda.device_array_like(arr)
        ipch = d_arr.get_ipc_handle()
        ctx = cuda.current_context()
        self.assertTrue(ctx.memory_manager.get_ipc_handle_called)
        self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle)
@skip_on_cudasim('EMM Plugins not supported on CUDA simulator')
class TestBadEMMPluginVersion(CUDATestCase):
    """
    Check that Numba refuses an EMM Plugin whose interface_version is not 1.
    """

    def test_bad_plugin_version(self):
        with self.assertRaisesRegex(RuntimeError, 'version 1 required'):
            cuda.set_memory_manager(BadVersionEMMPlugin)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,38 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase
class TestCudaEvent(CUDATestCase):
    """Exercise CUDA event recording and the elapsed-time code path."""

    def _transfer_and_time(self, stream=None):
        # Record events around a host->device transfer, then query the
        # elapsed time. The timing value itself is not checked; the test
        # only exercises the API.
        n = 32
        dary = cuda.device_array(n, dtype=np.double)
        start = cuda.event()
        end = cuda.event()
        if stream is None:
            start.record()
            cuda.to_device(np.arange(n, dtype=np.double), to=dary)
            end.record()
            end.wait()
        else:
            start.record(stream=stream)
            cuda.to_device(np.arange(n, dtype=np.double), to=dary,
                           stream=stream)
            end.record(stream=stream)
            end.wait(stream=stream)
        end.synchronize()
        start.elapsed_time(end)

    def test_event_elapsed(self):
        self._transfer_and_time()

    def test_event_elapsed_stream(self):
        self._transfer_and_time(stream=cuda.stream())
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,65 @@
import numpy as np
from numba.cuda.cudadrv import driver
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
class TestHostAlloc(ContextResettingTestCase):
    """Tests of host allocations: driver-level mapped memory, pinned arrays,
    and mapped arrays."""

    def test_host_alloc_driver(self):
        # Allocate mapped host memory through the context and view it as a
        # uint8 ndarray, so host and device writes to it are both visible.
        n = 32
        mem = cuda.current_context().memhostalloc(n, mapped=True)

        dtype = np.dtype(np.uint8)
        ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype,
                         buffer=mem)

        magic = 0xab
        # A device-side memset should be observable from the host view.
        driver.device_memset(mem, magic, n)

        self.assertTrue(np.all(ary == magic))

        ary.fill(n)

        recv = np.empty_like(ary)

        driver.device_to_host(recv, mem, ary.size)

        self.assertTrue(np.all(ary == recv))
        self.assertTrue(np.all(recv == n))

    def test_host_alloc_pinned(self):
        ary = cuda.pinned_array(10, dtype=np.uint32)
        ary.fill(123)
        self.assertTrue(all(ary == 123))
        devary = cuda.to_device(ary)
        # Zero only the device copy; the pinned host array is unchanged
        # until it is explicitly copied back.
        driver.device_memset(devary, 0, driver.device_memory_size(devary))
        self.assertTrue(all(ary == 123))
        devary.copy_to_host(ary)
        self.assertTrue(all(ary == 0))

    def test_host_alloc_mapped(self):
        ary = cuda.mapped_array(10, dtype=np.uint32)
        ary.fill(123)
        self.assertTrue(all(ary == 123))
        # Mapped memory: a device-side memset is directly observable from
        # the host view.
        driver.device_memset(ary, 0, driver.device_memory_size(ary))
        self.assertTrue(all(ary == 0))
        self.assertTrue(sum(ary != 0) == 0)

    def test_host_operators(self):
        # Check that NumPy arithmetic works on host-accessible arrays of
        # both kinds.
        for ary in [cuda.mapped_array(10, dtype=np.uint32),
                    cuda.pinned_array(10, dtype=np.uint32)]:
            ary[:] = range(10)
            self.assertTrue(sum(ary + 1) == 55)
            self.assertTrue(sum((ary + 1) * 2 - 1) == 100)
            self.assertTrue(sum(ary < 5) == 5)
            self.assertTrue(sum(ary <= 5) == 6)
            self.assertTrue(sum(ary > 6) == 3)
            self.assertTrue(sum(ary >= 6) == 4)
            self.assertTrue(sum(ary ** 2) == 285)
            self.assertTrue(sum(ary // 2) == 20)
            self.assertTrue(sum(ary / 2.0) == 22.5)
            self.assertTrue(sum(ary % 2) == 5)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,139 @@
import multiprocessing as mp
import os
from numba import cuda
from numba.cuda.cudadrv.driver import CudaAPIError, driver
from numba.cuda.cudadrv.error import CudaSupportError
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
# A mock of cuInit that always raises a CudaAPIError
def cuInit_raising(arg):
    """Stand-in for cuInit that unconditionally fails with an unknown error."""
    raise CudaAPIError(999, 'CUDA_ERROR_UNKNOWN')
# Test code to run in a child that patches driver.cuInit to a variant that
# always raises. We can't use mock.patch.object here because driver.cuInit is
# not assigned until we attempt to initialize - mock.patch.object cannot locate
# the non-existent original method, and so fails. Instead we patch
# driver.cuInit with our raising version prior to any attempt to initialize.
def cuInit_raising_test(result_queue):
    """Patch cuInit to fail, then report (success, exception message)."""
    driver.cuInit = cuInit_raising
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError as e:
        result_queue.put((True, e.msg))
    else:
        result_queue.put((False, None))
# Similar to cuInit_raising_test above, but for testing that the string
# returned by cuda_error() is as expected.
def initialization_error_test(result_queue):
    """Like cuInit_raising_test, but reports the cuda.cuda_error() string."""
    driver.cuInit = cuInit_raising
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError:
        result_queue.put((True, cuda.cuda_error()))
    else:
        result_queue.put((False, None))
# For testing the path where Driver.__init__() catches a CudaSupportError
def cuda_disabled_test(result_queue):
    """Report the CudaSupportError raised when CUDA is disabled by env var."""
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError as e:
        result_queue.put((True, e.msg))
    else:
        result_queue.put((False, None))
# Similar to cuda_disabled_test, but checks cuda.cuda_error() instead of the
# exception raised on initialization
def cuda_disabled_error_test(result_queue):
    """Like cuda_disabled_test, but reports the cuda.cuda_error() string."""
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError:
        result_queue.put((True, cuda.cuda_error()))
    else:
        result_queue.put((False, None))
@skip_on_cudasim('CUDA Simulator does not initialize driver')
class TestInit(CUDATestCase):
    """Tests of driver initialization failure handling.

    Each failure scenario runs in a spawned child process so that the
    patched or disabled initialization cannot poison this process's CUDA
    state.
    """

    def _test_init_failure(self, target, expected):
        # Run the initialization failure test in a separate subprocess
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=target, args=(result_queue,))
        proc.start()
        proc.join(30)  # should complete within 30s
        success, msg = result_queue.get()

        # Ensure the child process raised an exception during initialization
        # before checking the message
        if not success:
            self.fail('CudaSupportError not raised')

        self.assertIn(expected, msg)

    def test_init_failure_raising(self):
        expected = 'Error at driver init: CUDA_ERROR_UNKNOWN (999)'
        self._test_init_failure(cuInit_raising_test, expected)

    def test_init_failure_error(self):
        expected = 'CUDA_ERROR_UNKNOWN (999)'
        self._test_init_failure(initialization_error_test, expected)

    def _test_cuda_disabled(self, target):
        # Uses _test_init_failure to launch the test in a separate subprocess
        # with CUDA disabled.
        cuda_disabled = os.environ.get('NUMBA_DISABLE_CUDA')
        os.environ['NUMBA_DISABLE_CUDA'] = "1"
        try:
            expected = 'CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1'
            self._test_init_failure(cuda_disabled_test, expected)
        finally:
            # Restore the original setting (or remove it if it was unset).
            if cuda_disabled is not None:
                os.environ['NUMBA_DISABLE_CUDA'] = cuda_disabled
            else:
                os.environ.pop('NUMBA_DISABLE_CUDA')

    def test_cuda_disabled_raising(self):
        self._test_cuda_disabled(cuda_disabled_test)

    def test_cuda_disabled_error(self):
        self._test_cuda_disabled(cuda_disabled_error_test)

    def test_init_success(self):
        # Here we assume that initialization is successful (because many bad
        # things will happen with the test suite if it is not) and check that
        # there is no error recorded.
        self.assertIsNone(cuda.cuda_error())
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,37 @@
from llvmlite import ir
from numba.cuda.cudadrv import nvvm
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('Inline PTX cannot be used in the simulator')
class TestCudaInlineAsm(ContextResettingTestCase):
    """Check that inline PTX assembly survives NVVM compilation to PTX."""

    def test_inline_rsqrt(self):
        # Build an LLVM module containing a kernel that computes an
        # approximate reciprocal square root via inline PTX.
        mod = ir.Module(__name__)
        mod.triple = 'nvptx64-nvidia-cuda'
        nvvm.add_ir_version(mod)
        fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
        fn = ir.Function(mod, fnty, 'cu_rsqrt')
        bldr = ir.IRBuilder(fn.append_basic_block('entry'))

        rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
        inlineasm = ir.InlineAsm(rsqrt_approx_fnty,
                                 'rsqrt.approx.f32 $0, $1;',
                                 '=f,f', side_effect=True)
        val = bldr.load(fn.args[0])
        res = bldr.call(inlineasm, [val])

        bldr.store(res, fn.args[0])
        bldr.ret_void()

        # generate ptx
        mod.data_layout = nvvm.NVVM().data_layout
        nvvm.set_cuda_kernel(fn)
        nvvmir = str(mod)
        ptx = nvvm.compile_ir(nvvmir)
        # The inline instruction should appear verbatim in the generated PTX.
        self.assertTrue('rsqrt.approx.f32' in str(ptx))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,12 @@
from numba import cuda
from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53
class TestIsFP16Supported(CUDATestCase):
    """Checks of the float16 support queries."""

    def test_is_fp16_supported(self):
        self.assertTrue(cuda.is_float16_supported())

    # Bug fix: skip_on_cudasim is a decorator factory taking a reason string.
    # Applying it bare (`@skip_on_cudasim`) passes the test function itself
    # as the reason, replacing the method with a skip decorator object
    # instead of a test, so the test never ran correctly. It must be called
    # with a reason.
    @skip_on_cudasim('Device-side support queries are not supported on '
                     'the simulator')
    @skip_unless_cc_53
    def test_device_supports_float16(self):
        self.assertTrue(cuda.get_current_device().supports_float16)

View File

@@ -0,0 +1,317 @@
import numpy as np
import warnings
from numba.cuda.testing import unittest
from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing)
from numba.cuda.testing import CUDATestCase, test_data_dir
from numba.cuda.cudadrv.driver import (CudaAPIError, Linker,
LinkerError)
from numba.cuda.cudadrv.error import NvrtcError
from numba.cuda import require_context
from numba.tests.support import ignore_internal_warnings
from numba import cuda, void, float64, int64, int32, typeof, float32
CONST1D = np.arange(10, dtype=np.float64)
def simple_const_mem(A):
    # Kernel body (compiled with cuda.jit by the tests below): copies the
    # module-level CONST1D through constant memory, adding 1.0 to each
    # element; one element per thread.
    C = cuda.const.array_like(CONST1D)
    i = cuda.grid(1)
    A[i] = C[i] + 1.0
def func_with_lots_of_registers(x, a, b, c, d, e, f):
    # NOTE: the repetitive locals and loop body here are deliberate - this
    # kernel exists to consume many registers so that the max_registers
    # tests can observe register usage being limited. Do not simplify it.
    a1 = 1.0
    a2 = 1.0
    a3 = 1.0
    a4 = 1.0
    a5 = 1.0
    b1 = 1.0
    b2 = 1.0
    b3 = 1.0
    b4 = 1.0
    b5 = 1.0
    c1 = 1.0
    c2 = 1.0
    c3 = 1.0
    c4 = 1.0
    c5 = 1.0
    d1 = 10
    d2 = 10
    d3 = 10
    d4 = 10
    d5 = 10
    for i in range(a):
        a1 += b
        a2 += c
        a3 += d
        a4 += e
        a5 += f
        b1 *= b
        b2 *= c
        b3 *= d
        b4 *= e
        b5 *= f
        c1 /= b
        c2 /= c
        c3 /= d
        c4 /= e
        c5 /= f
        d1 <<= b
        d2 <<= c
        d3 <<= d
        d4 <<= e
        d5 <<= f
    x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
    x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
    x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
    x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5
def simple_smem(ary, dty):
    # Kernel body: thread 0 fills a 100-element shared-memory array, then
    # every thread copies one element out after the barrier.
    sm = cuda.shared.array(100, dty)
    i = cuda.grid(1)
    if i == 0:
        for j in range(100):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
def coop_smem2d(ary):
    # Kernel body: each thread writes one element of a 10x20 shared-memory
    # array and reads it back after a barrier.
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]
def simple_maxthreads(ary):
    # Minimal kernel used to probe the maximum threads per block.
    i = cuda.grid(1)
    ary[i] = i
LMEM_SIZE = 1000
def simple_lmem(A, B, dty):
    # Kernel body: copy A to B through a per-thread local array of LMEM_SIZE
    # elements, forcing local memory usage for the local-memory size tests.
    C = cuda.local.array(LMEM_SIZE, dty)
    for i in range(C.shape[0]):
        C[i] = A[i]
    for i in range(C.shape[0]):
        B[i] = C[i]
@skip_on_cudasim('Linking unsupported in the simulator')
class TestLinker(CUDATestCase):
    """Tests of linking external PTX / CU files into CUDA kernels, and of
    resource-usage queries (registers, shared / local / constant memory)."""

    _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'}

    @require_context
    def test_linker_basic(self):
        '''Simply go through the constructor and destructor
        '''
        linker = Linker.new(cc=(5, 3))
        del linker

    def _test_linking(self, eager):
        global bar  # must be a global; otherwise it is recognized as a freevar
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.ptx')

        if eager:
            args = ['void(int32[:], int32[:])']
        else:
            args = []

        @cuda.jit(*args, link=[link])
        def foo(x, y):
            i = cuda.grid(1)
            x[i] += bar(y[i])

        A = np.array([123], dtype=np.int32)
        B = np.array([321], dtype=np.int32)

        foo[1, 1](A, B)

        # bar() in jitlink.ptx doubles its argument and foo adds the result.
        self.assertTrue(A[0] == 123 + 2 * 321)

    def test_linking_lazy_compile(self):
        self._test_linking(eager=False)

    def test_linking_eager_compile(self):
        self._test_linking(eager=True)

    def test_linking_cu(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.cu')

        @cuda.jit(link=[link])
        def kernel(r, x):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = bar(x[i])

        x = np.arange(10, dtype=np.int32)
        r = np.zeros_like(x)

        kernel[1, 32](r, x)

        # Matches the operation of bar() in jitlink.cu
        expected = x * 2

        np.testing.assert_array_equal(r, expected)

    def test_linking_cu_log_warning(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'warn.cu')

        with warnings.catch_warnings(record=True) as w:
            ignore_internal_warnings()

            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        self.assertEqual(len(w), 1, 'Expected warnings from NVRTC')
        # Check the warning refers to the log messages
        self.assertIn('NVRTC log messages', str(w[0].message))
        # Check the message pertaining to the unused variable is provided
        self.assertIn('declared but never referenced', str(w[0].message))

    def test_linking_cu_error(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'error.cu')

        with self.assertRaises(NvrtcError) as e:
            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        msg = e.exception.args[0]
        # Check the error message refers to the NVRTC compile
        self.assertIn('NVRTC Compilation failure', msg)
        # Check the expected error in the CUDA source is reported
        self.assertIn('identifier "SYNTAX" is undefined', msg)
        # Check the filename is reported correctly
        self.assertIn('in the compilation of "error.cu"', msg)

    def test_linking_unknown_filetype_error(self):
        expected_err = "Don't know how to link file with extension .cuh"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['header.cuh'])
            def kernel():
                pass

    def test_linking_file_with_no_extension_error(self):
        expected_err = "Don't know how to link file with no extension"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['data'])
            def kernel():
                pass

    @skip_if_cuda_includes_missing
    def test_linking_cu_cuda_include(self):
        link = str(test_data_dir / 'cuda_include.cu')

        # An exception will be raised when linking this kernel due to the
        # compile failure if CUDA includes cannot be found by Nvrtc.
        @cuda.jit('void()', link=[link])
        def kernel():
            pass

    def test_try_to_link_nonexistent(self):
        with self.assertRaises(LinkerError) as e:
            @cuda.jit('void(int32[::1])', link=['nonexistent.a'])
            def f(x):
                x[0] = 0

        self.assertIn('nonexistent.a not found', e.exception.args)

    def test_set_registers_no_max(self):
        """Ensure that the jitted kernel used in the test_set_registers_* tests
        uses more than 57 registers - this ensures that test_set_registers_*
        are really checking that they reduced the number of registers used from
        something greater than the maximum."""
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertGreater(compiled.get_regs_per_thread(), 57)

    def test_set_registers_57(self):
        compiled = cuda.jit(max_registers=57)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 57)

    def test_set_registers_38(self):
        compiled = cuda.jit(max_registers=38)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_set_registers_eager(self):
        sig = void(float64[::1], int64, int64, int64, int64, int64, int64)
        compiled = cuda.jit(sig, max_registers=38)(func_with_lots_of_registers)
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_get_const_mem_size(self):
        sig = void(float64[::1])
        compiled = cuda.jit(sig)(simple_const_mem)
        const_mem_size = compiled.get_const_mem_size()
        self.assertGreaterEqual(const_mem_size, CONST1D.nbytes)

    def test_get_no_shared_memory(self):
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        shared_mem_size = compiled.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 0)

    def test_get_shared_mem_per_block(self):
        sig = void(int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_smem)
        shared_mem_size = compiled.get_shared_mem_per_block()
        # 100 int32 elements = 400 bytes of shared memory
        self.assertEqual(shared_mem_size, 400)

    def test_get_shared_mem_per_specialized(self):
        compiled = cuda.jit(simple_smem)
        compiled_specialized = compiled.specialize(
            np.zeros(100, dtype=np.int32), np.float64)
        shared_mem_size = compiled_specialized.get_shared_mem_per_block()
        # 100 float64 elements = 800 bytes of shared memory
        self.assertEqual(shared_mem_size, 800)

    def test_get_max_threads_per_block(self):
        compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
        max_threads = compiled.get_max_threads_per_block()
        self.assertGreater(max_threads, 0)

    def test_max_threads_exceeded(self):
        compiled = cuda.jit("void(int32[::1])")(simple_maxthreads)
        max_threads = compiled.get_max_threads_per_block()
        nelem = max_threads + 1
        ary = np.empty(nelem, dtype=np.int32)
        # NOTE(review): if the over-sized launch unexpectedly succeeds, no
        # assertion fires and the test silently passes.
        try:
            compiled[1, nelem](ary)
        except CudaAPIError as e:
            self.assertIn("cuLaunchKernel", e.msg)

    def test_get_local_mem_per_thread(self):
        sig = void(int32[::1], int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_lmem)
        local_mem_size = compiled.get_local_mem_per_thread()
        calc_size = np.dtype(np.int32).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)

    def test_get_local_mem_per_specialized(self):
        compiled = cuda.jit(simple_lmem)
        compiled_specialized = compiled.specialize(
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.float64)
        local_mem_size = compiled_specialized.get_local_mem_per_thread()
        calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,127 @@
import numpy as np
from ctypes import byref, c_size_t
from numba.cuda.cudadrv.driver import device_memset, driver, USE_NV_BINDING
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim, skip_on_arm
from numba.tests.support import linux_only
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
@linux_only
@skip_on_arm('Managed Alloc support is experimental/untested on ARM')
class TestManagedAlloc(ContextResettingTestCase):
    """Tests of CUDA managed (unified) memory allocations."""

    def get_total_gpu_memory(self):
        # We use a driver function to directly get the total GPU memory
        # because an EMM plugin may report something different (or not
        # implement get_memory_info at all).
        if USE_NV_BINDING:
            free, total = driver.cuMemGetInfo()
            return total
        else:
            free = c_size_t()
            total = c_size_t()
            driver.cuMemGetInfo(byref(free), byref(total))
            return total.value

    def skip_if_cc_major_lt(self, min_required, reason):
        """
        Skip the current test if the compute capability of the device is
        less than `min_required`.
        """
        ctx = cuda.current_context()
        cc_major = ctx.device.compute_capability[0]
        if cc_major < min_required:
            self.skipTest(reason)

    # CUDA Unified Memory comes in two flavors. For GPUs in the Kepler and
    # Maxwell generations, managed memory allocations work as opaque,
    # contiguous segments that can either be on the device or the host. For
    # GPUs in the Pascal or later generations, managed memory operates on a
    # per-page basis, so we can have arrays larger than GPU memory, where
    # only part of them is resident on the device at one time. To ensure
    # that this test works correctly on all supported GPUs, we'll select the
    # size of our memory such that we only oversubscribe the GPU memory if
    # we're on a Pascal or newer GPU (compute capability at least 6.0).

    def test_managed_alloc_driver_undersubscribe(self):
        msg = "Managed memory unsupported prior to CC 3.0"
        self.skip_if_cc_major_lt(3, msg)
        self._test_managed_alloc_driver(0.5)

    # This test is skipped by default because it is easy to hang the machine
    # for a very long time or get OOM killed if the GPU memory size is >50%
    # of the system memory size. Even if the system does have more than 2x
    # the RAM of the GPU, this test runs for a very long time (in comparison
    # to the rest of the tests in the suite).
    #
    # However, it is left in here for manual testing as required.
    @unittest.skip
    def test_managed_alloc_driver_oversubscribe(self):
        msg = "Oversubscription of managed memory unsupported prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_alloc_driver(2.0)

    def test_managed_alloc_driver_host_attach(self):
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        # Only test with a small array (0.01 * memory size) to keep the test
        # quick.
        self._test_managed_alloc_driver(0.01, attach_global=False)

    def _test_managed_alloc_driver(self, memory_factor, attach_global=True):
        # Verify that we can allocate and operate on managed
        # memory through the CUDA driver interface.
        total_mem_size = self.get_total_gpu_memory()
        n_bytes = int(memory_factor * total_mem_size)
        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(n_bytes, attach_global=attach_global)
        dtype = np.dtype(np.uint8)
        n_elems = n_bytes // dtype.itemsize
        ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem)
        magic = 0xab
        device_memset(mem, magic, n_bytes)
        ctx.synchronize()
        # Note that this assertion operates on the CPU, so this
        # test effectively drives both the CPU and the GPU on
        # managed memory.
        self.assertTrue(np.all(ary == magic))

    def _test_managed_array(self, attach_global=True):
        # Check the managed_array interface on both host and device.
        # Bug fix: attach_global was previously accepted but never forwarded
        # to the allocation, so the host-attach variant of this test
        # silently exercised the same configuration as the global-attach
        # one. Forward it to cuda.managed_array.
        ary = cuda.managed_array(100, dtype=np.double,
                                 attach_global=attach_global)
        ary.fill(123.456)
        self.assertTrue(all(ary == 123.456))

        @cuda.jit('void(double[:])')
        def kernel(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = 1.0

        kernel[10, 10](ary)
        cuda.current_context().synchronize()
        self.assertTrue(all(ary == 1.0))

    def test_managed_array_attach_global(self):
        self._test_managed_array()

    def test_managed_array_attach_host(self):
        # Bug fix: this test previously ran _test_managed_array() with the
        # default global attachment *before* checking the compute
        # capability, duplicating test_managed_array_attach_global and
        # partially defeating the skip. Check the CC first, then run only
        # the host-attached variant.
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_array(attach_global=False)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,54 @@
import multiprocessing as mp
import traceback
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck,
skip_if_mvc_libraries_unavailable)
from numba.tests.support import linux_only
def child_test():
    """Launch a trivial kernel with Minor Version Compatibility enabled.

    Intended to run in a spawned child process so that the config change
    cannot leak into the parent's CUDA state.
    """
    from numba import config, cuda

    # Change the MVC config after importing numba.cuda
    config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1

    @cuda.jit
    def f():
        pass

    f[1, 1]()
def child_test_wrapper(result_queue):
    """Run child_test and report (success, output-or-traceback) to the queue."""
    try:
        result = (True, child_test())
    # Catch anything raised so it can be propagated to the parent process
    except:  # noqa: E722
        result = (False, traceback.format_exc())
    result_queue.put(result)
@linux_only
@skip_under_cuda_memcheck('May hang CUDA memcheck')
@skip_on_cudasim('Simulator does not require or implement MVC')
@skip_if_mvc_libraries_unavailable
class TestMinorVersionCompatibility(CUDATestCase):
    """Launch an MVC-enabled kernel in a child process and check it succeeds."""

    def test_mvc(self):
        # Run test with Minor Version Compatibility enabled in a child
        # process
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
        proc.start()
        proc.join()
        success, output = result_queue.get()

        # Ensure the child process ran to completion before checking its
        # output
        if not success:
            self.fail(output)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,199 @@
import warnings
from llvmlite import ir
from numba.cuda.cudadrv import nvvm, runtime
from numba.cuda.testing import unittest
from numba.cuda.cudadrv.nvvm import LibDevice, NvvmError, NVVM
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestNvvmDriver(unittest.TestCase):
    """Tests of the NVVM driver wrapper: compiling IR to PTX, options,
    kernel metadata, and error reporting."""

    def get_nvvmir(self):
        # Fill the module-level nvvmir_generic template with the current
        # NVVM IR version and data layout.
        versions = NVVM().get_ir_version()
        data_layout = NVVM().data_layout
        return nvvmir_generic.format(data_layout=data_layout, v=versions)

    def test_nvvm_compile_simple(self):
        nvvmir = self.get_nvvmir()
        ptx = nvvm.compile_ir(nvvmir).decode('utf8')
        # Both functions from the template should survive into the PTX.
        self.assertTrue('simple' in ptx)
        self.assertTrue('ave' in ptx)

    def test_nvvm_compile_nullary_option(self):
        # Tests compilation with an option that doesn't take an argument
        # ("-gen-lto") - all other NVVM options are of the form
        # "-<name>=<value>"

        # -gen-lto is not available prior to CUDA 11.5
        if runtime.get_version() < (11, 5):
            self.skipTest("-gen-lto unavailable in this toolkit version")

        nvvmir = self.get_nvvmir()
        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch="compute_52")

        # Verify we correctly passed the option by checking if we got LTOIR
        # from NVVM (by looking for the expected magic number for LTOIR)
        self.assertEqual(ltoir[:4], b'\xed\x43\x4e\x7f')

    def test_nvvm_bad_option(self):
        # Ensure that unsupported / non-existent options are reported as such
        # to the user / caller
        msg = "-made-up-option=2 is an unsupported option"
        with self.assertRaisesRegex(NvvmError, msg):
            nvvm.compile_ir("", made_up_option=2)

    def test_nvvm_from_llvm(self):
        # Build a minimal kernel with llvmlite and compile it through NVVM.
        m = ir.Module("test_nvvm_from_llvm")
        m.triple = 'nvptx64-nvidia-cuda'
        nvvm.add_ir_version(m)
        fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
        kernel = ir.Function(m, fty, name='mycudakernel')
        bldr = ir.IRBuilder(kernel.append_basic_block('entry'))
        bldr.ret_void()
        nvvm.set_cuda_kernel(kernel)

        m.data_layout = NVVM().data_layout
        ptx = nvvm.compile_ir(str(m)).decode('utf8')
        self.assertTrue('mycudakernel' in ptx)
        self.assertTrue('.address_size 64' in ptx)

    def test_used_list(self):
        # Construct a module
        m = ir.Module("test_used_list")
        m.triple = 'nvptx64-nvidia-cuda'
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)

        # Add a function and mark it as a kernel
        fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
        kernel = ir.Function(m, fty, name='mycudakernel')
        bldr = ir.IRBuilder(kernel.append_basic_block('entry'))
        bldr.ret_void()
        nvvm.set_cuda_kernel(kernel)

        # Verify that the used list was correctly constructed
        used_lines = [line for line in str(m).splitlines()
                      if 'llvm.used' in line]
        msg = 'Expected exactly one @"llvm.used" array'
        self.assertEqual(len(used_lines), 1, msg)

        used_line = used_lines[0]
        # Kernel should be referenced in the used list
        self.assertIn("mycudakernel", used_line)
        # Check linkage of the used list
        self.assertIn("appending global", used_line)
        # Ensure used list is in the metadata section
        self.assertIn('section "llvm.metadata"', used_line)

    def test_nvvm_ir_verify_fail(self):
        m = ir.Module("test_bad_ir")
        m.triple = "unknown-unknown-unknown"
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)
        with self.assertRaisesRegex(NvvmError, 'Invalid target triple'):
            nvvm.compile_ir(str(m))

    def _test_nvvm_support(self, arch):
        compute_xx = 'compute_{0}{1}'.format(*arch)
        nvvmir = self.get_nvvmir()
        ptx = nvvm.compile_ir(nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0,
                              prec_div=0).decode('utf8')
        self.assertIn(".target sm_{0}{1}".format(*arch), ptx)
        self.assertIn('simple', ptx)
        self.assertIn('ave', ptx)

    def test_nvvm_support(self):
        """Test supported CC by NVVM
        """
        for arch in nvvm.get_supported_ccs():
            self._test_nvvm_support(arch=arch)

    def test_nvvm_warning(self):
        m = ir.Module("test_nvvm_warning")
        m.triple = 'nvptx64-nvidia-cuda'
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)

        fty = ir.FunctionType(ir.VoidType(), [])
        kernel = ir.Function(m, fty, name='inlinekernel')
        builder = ir.IRBuilder(kernel.append_basic_block('entry'))
        builder.ret_void()
        nvvm.set_cuda_kernel(kernel)

        # Add the noinline attribute to trigger NVVM to generate a warning
        kernel.attributes.add('noinline')

        with warnings.catch_warnings(record=True) as w:
            nvvm.compile_ir(str(m))

        self.assertEqual(len(w), 1)
        self.assertIn('overriding noinline attribute', str(w[0]))
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestArchOption(unittest.TestCase):
    """Tests of mapping compute capabilities to NVVM arch options."""

    def test_get_arch_option(self):
        # Test returning the nearest lowest arch.
        self.assertEqual(nvvm.get_arch_option(5, 3), 'compute_53')
        self.assertEqual(nvvm.get_arch_option(7, 5), 'compute_75')
        self.assertEqual(nvvm.get_arch_option(7, 7), 'compute_75')
        # Test known arch.
        supported_cc = nvvm.get_supported_ccs()
        for arch in supported_cc:
            self.assertEqual(nvvm.get_arch_option(*arch),
                             'compute_%d%d' % arch)
        # An arbitrarily high CC maps to the highest supported arch.
        self.assertEqual(nvvm.get_arch_option(1000, 0),
                         'compute_%d%d' % supported_cc[-1])
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestLibDevice(unittest.TestCase):
    """Sanity check for loading the libdevice bitcode."""

    def test_libdevice_load(self):
        # A valid LLVM bitcode file starts with the magic bytes BC\xc0\xde.
        header = LibDevice().bc[:4]
        self.assertEqual(header, b'BC\xc0\xde')
# Generic NVVM IR module used by the driver tests. It is a str.format
# template: {data_layout} and the NVVM IR version tuple {v[0]}..{v[3]} are
# substituted at test time (hence the doubled {{ }} braces in the IR body).
# It defines an averaging helper (@ave) and a kernel (@simple) registered
# via !nvvm.annotations and pinned by @llvm.used.
nvvmir_generic = '''\
target triple="nvptx64-nvidia-cuda"
target datalayout = "{data_layout}"
define i32 @ave(i32 %a, i32 %b) {{
entry:
%add = add nsw i32 %a, %b
%div = sdiv i32 %add, 2
ret i32 %div
}}
define void @simple(i32* %data) {{
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%mul = mul i32 %0, %1
%2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%add = add i32 %mul, %2
%call = call i32 @ave(i32 %add, i32 %add)
%idxprom = sext i32 %add to i64
%arrayidx = getelementptr inbounds i32, i32* %data, i64 %idxprom
store i32 %call, i32* %arrayidx, align 4
ret void
}}
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
!nvvmir.version = !{{!1}}
!1 = !{{i32 {v[0]}, i32 {v[1]}, i32 {v[2]}, i32 {v[3]}}}
!nvvm.annotations = !{{!2}}
!2 = !{{void (i32*)* @simple, !"kernel", i32 1}}
@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i32*)* @simple to i8*)], section "llvm.metadata"
''' # noqa: E501


# Allow running this test file directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,37 @@
import numpy as np
import platform
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
class TestPinned(ContextResettingTestCase):
    """Exercises host<->device copies with pinned and pageable host memory."""

    def _run_copies(self, arr):
        # Round-trip the array through the device on a dedicated stream and
        # check the contents survive unchanged.
        original = np.copy(arr)
        stream = cuda.stream()
        dev = cuda.to_device(arr, copy=False, stream=stream)
        dev.copy_to_device(arr, stream=stream)
        dev.copy_to_host(arr, stream=stream)
        stream.synchronize()
        self.assertTrue(np.allclose(arr, original))

    def test_pinned(self):
        # Use a smaller buffer on ARM platforms.
        if platform.machine().startswith(('arm', 'aarch64')):
            count = 262144    # 2MB
        else:
            count = 2097152   # 16MB
        host = np.arange(count)
        with cuda.pinned(host):
            self._run_copies(host)

    def test_unpinned(self):
        self._run_copies(np.arange(2 * 1024 * 1024))  # 16 MB


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,20 @@
import unittest
from numba.cuda.testing import ContextResettingTestCase
from numba import cuda
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('CUDA Profiler unsupported in the simulator')
class TestProfiler(ContextResettingTestCase):
    """Smoke test: device allocation inside cuda.profiling() must not fail."""

    def test_profiling(self):
        with cuda.profiling():
            first = cuda.device_array(10)
            del first

        with cuda.profiling():
            second = cuda.device_array(100)
            del second


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,149 @@
import multiprocessing as mp
import logging
import traceback
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import (skip_on_cudasim, skip_with_cuda_python,
skip_under_cuda_memcheck)
from numba.tests.support import linux_only
def child_test():
    """Launch a kernel from many host threads with per-thread default streams
    (PTDS) enabled, verify the results, and return the captured CUDA driver
    debug log so the parent process can inspect which driver API entry points
    were actually used.
    """
    from numba import cuda, int32, void
    from numba.core import config
    import io
    import numpy as np
    import threading

    # Enable PTDS before we make any CUDA driver calls. Enabling it first
    # ensures that PTDS APIs are used because the CUDA driver looks up API
    # functions on first use and memoizes them.
    config.CUDA_PER_THREAD_DEFAULT_STREAM = 1

    # Set up log capture for the Driver API so we can see what API calls were
    # used.
    logbuf = io.StringIO()
    handler = logging.StreamHandler(logbuf)
    cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver')
    cudadrv_logger.addHandler(handler)
    cudadrv_logger.setLevel(logging.DEBUG)

    # Set up data for our test, and copy over to the device
    N = 2 ** 16
    N_THREADS = 10
    N_ADDITIONS = 4096

    # Seed the RNG for repeatability
    np.random.seed(1)
    x = np.random.randint(low=0, high=1000, size=N, dtype=np.int32)
    r = np.zeros_like(x)

    # One input and output array for each thread
    xs = [cuda.to_device(x) for _ in range(N_THREADS)]
    rs = [cuda.to_device(r) for _ in range(N_THREADS)]

    # Compute the grid size and get the [per-thread] default stream
    n_threads = 256
    n_blocks = N // n_threads
    stream = cuda.default_stream()

    # A simple multiplication-by-addition kernel. What it does exactly is not
    # too important; only that we have a kernel that does something.
    @cuda.jit(void(int32[::1], int32[::1]))
    def f(r, x):
        i = cuda.grid(1)
        if i > len(r):
            return
        # Accumulate x into r
        for j in range(N_ADDITIONS):
            r[i] += x[i]

    # This function will be used to launch the kernel from each thread on its
    # own unique data.
    def kernel_thread(n):
        f[n_blocks, n_threads, stream](rs[n], xs[n])

    # Create threads
    threads = [threading.Thread(target=kernel_thread, args=(i,))
               for i in range(N_THREADS)]

    # Start all threads
    for thread in threads:
        thread.start()

    # Wait for all threads to finish, to ensure that we don't synchronize with
    # the device until all kernels are scheduled.
    for thread in threads:
        thread.join()

    # Synchronize with the device
    cuda.synchronize()

    # Check output is as expected
    expected = x * N_ADDITIONS
    for i in range(N_THREADS):
        np.testing.assert_equal(rs[i].copy_to_host(), expected)

    # Return the driver log output to the calling process for checking
    handler.flush()
    return logbuf.getvalue()
def child_test_wrapper(result_queue):
    """Run child_test() and report (success, output) through *result_queue*.

    On failure the formatted traceback is sent instead of the log output so
    the parent process can surface it in the test failure message.
    """
    # Catch anything raised so it can be propagated
    try:
        success, output = True, child_test()
    except:  # noqa: E722
        success, output = False, traceback.format_exc()
    result_queue.put((success, output))
# Run on Linux only until the reason for test hangs on Windows (Issue #8635,
# https://github.com/numba/numba/issues/8635) is diagnosed
@linux_only
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
@skip_on_cudasim('Streams not supported on the simulator')
class TestPTDS(CUDATestCase):
    """Verifies that enabling PTDS makes the driver use the _ptds/_ptsz
    variants of memcpy/launch API functions (checked via the driver log
    captured in a child process)."""

    @skip_with_cuda_python('Function names unchanged for PTDS with NV Binding')
    def test_ptds(self):
        # Run a test with PTDS enabled in a child process ('spawn' so that
        # the child initializes CUDA from scratch with PTDS configured).
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
        proc.start()
        proc.join()
        success, output = result_queue.get()

        # Ensure the child process ran to completion before checking its
        # output
        if not success:
            self.fail(output)

        # Functions with a per-thread default stream variant that we expect
        # to see in the output
        ptds_functions = ('cuMemcpyHtoD_v2_ptds', 'cuLaunchKernel_ptsz',
                          'cuMemcpyDtoH_v2_ptds')
        for fn in ptds_functions:
            with self.subTest(fn=fn, expected=True):
                self.assertIn(fn, output)

        # Non-PTDS versions of the functions that we should not see in the
        # output:
        legacy_functions = ('cuMemcpyHtoD_v2', 'cuLaunchKernel',
                            'cuMemcpyDtoH_v2')
        for fn in legacy_functions:
            with self.subTest(fn=fn, expected=False):
                # Ensure we only spot these function names appearing without a
                # _ptds or _ptsz suffix by checking including the end of the
                # line in the log
                fn_at_end = f'{fn}\n'
                self.assertNotIn(fn_at_end, output)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,36 @@
import threading
from numba import cuda
from numba.cuda.cudadrv.driver import driver
from numba.cuda.testing import unittest, ContextResettingTestCase
from queue import Queue
class TestResetDevice(ContextResettingTestCase):
    """Checks that devices can be selected and closed repeatedly."""

    def test_reset_device(self):
        # Do the work on a separate thread so that we don't affect the
        # current context in the main thread.
        errors = Queue()

        def worker(errors):
            try:
                device_ids = range(driver.get_device_count())
                for _ in range(2):
                    for device_id in device_ids:
                        cuda.select_device(device_id)
                        cuda.close()
            except Exception as e:
                errors.put(e)

        t = threading.Thread(target=worker, args=(errors,))
        t.start()
        t.join()

        raised = []
        while not errors.empty():
            raised.append(errors.get())
        self.assertEqual(raised, [])


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,85 @@
import multiprocessing
import os
from numba.core import config
from numba.cuda.cudadrv.runtime import runtime
from numba.cuda.testing import unittest, SerialMixin, skip_on_cudasim
from unittest.mock import patch
def set_visible_devices_and_check(q):
    """Import Numba, then restrict visibility to device 0, and report the
    number of GPUs seen (or -1 on any error) through the queue *q*.

    The environment variable is deliberately set *after* the import — that
    ordering is the point of the test that spawns this function.
    """
    try:
        from numba import cuda
        import os

        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        visible = cuda.gpus.lst
        q.put(len(visible))
    except:  # noqa: E722
        # Sentinel value for error executing test code
        q.put(-1)
# Runtime versions the tests treat as supported. Under the simulator there is
# no real CUDA runtime, so a single (-1, -1) placeholder version is used.
if config.ENABLE_CUDASIM:
    SUPPORTED_VERSIONS = (-1, -1),
else:
    SUPPORTED_VERSIONS = ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5),
                          (11, 6), (11, 7))
class TestRuntime(unittest.TestCase):
    """Checks runtime-version support detection against mocked versions."""

    def test_is_supported_version_true(self):
        for version in SUPPORTED_VERSIONS:
            with patch.object(runtime, 'get_version', return_value=version):
                self.assertTrue(runtime.is_supported_version())

    @skip_on_cudasim('The simulator always simulates a supported runtime')
    def test_is_supported_version_false(self):
        # Check with an old unsupported version and some potential future
        # versions
        unsupported = ((10, 2), (11, 8), (12, 0))
        for version in unsupported:
            with patch.object(runtime, 'get_version', return_value=version):
                self.assertFalse(runtime.is_supported_version())

    def test_supported_versions(self):
        self.assertEqual(SUPPORTED_VERSIONS, runtime.supported_versions)
class TestVisibleDevices(unittest.TestCase, SerialMixin):
    def test_visible_devices_set_after_import(self):
        # See Issue #6149. This test checks that we can set
        # CUDA_VISIBLE_DEVICES after importing Numba and have the value
        # reflected in the available list of GPUs. Prior to the fix for this
        # issue, Numba made a call to runtime.get_version() on import that
        # initialized the driver and froze the list of available devices
        # before CUDA_VISIBLE_DEVICES could be set by the user.

        # Avoid importing cuda at the top level so that
        # set_visible_devices_and_check gets to import it first in its process
        from numba import cuda

        if len(cuda.gpus.lst) in (0, 1):
            self.skipTest('This test requires multiple GPUs')

        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            msg = 'Cannot test when CUDA_VISIBLE_DEVICES already set'
            self.skipTest(msg)

        ctx = multiprocessing.get_context('spawn')
        q = ctx.Queue()
        p = ctx.Process(target=set_visible_devices_and_check, args=(q,))
        p.start()
        try:
            # Drain the queue before joining: per the multiprocessing docs,
            # joining a process before its queued items are consumed can
            # deadlock.
            visible_gpu_count = q.get()
        finally:
            p.join()

        # Make an obvious distinction between an error running the test code
        # and an incorrect number of GPUs in the list
        msg = 'Error running set_visible_devices_and_check'
        self.assertNotEqual(visible_gpu_count, -1, msg=msg)

        # The actual check that we see only one GPU
        self.assertEqual(visible_gpu_count, 1)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,41 @@
#
# Test does not work on some cards.
#
import threading
from queue import Queue
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
def newthread(exception_queue):
    """Select device 0, round-trip an array through it, then close the
    context; any exception raised is forwarded via *exception_queue*.
    """
    try:
        cuda.select_device(0)
        stream = cuda.stream()
        host_arr = np.arange(100)
        dev_arr = cuda.to_device(host_arr, stream=stream)
        stream.synchronize()
        del dev_arr
        del stream
        cuda.close()
    except Exception as e:
        exception_queue.put(e)
class TestSelectDevice(ContextResettingTestCase):
    """Repeatedly creates and tears down a context from fresh threads."""

    def test_select_device(self):
        exception_queue = Queue()
        # Run each iteration on its own thread so the context is always
        # created and destroyed from a non-main thread.
        for _ in range(10):
            worker = threading.Thread(target=newthread,
                                      args=(exception_queue,))
            worker.start()
            worker.join()

        raised = []
        while not exception_queue.empty():
            raised.append(exception_queue.get())
        self.assertEqual(raised, [])


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,122 @@
import asyncio
import functools
import threading
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
def with_asyncio_loop(f):
    """Decorator that runs coroutine function *f* to completion on a fresh
    event loop (with debug mode enabled), closing the loop afterwards.
    """
    @functools.wraps(f)
    def runner(*args, **kwds):
        event_loop = asyncio.new_event_loop()
        event_loop.set_debug(True)
        try:
            result = event_loop.run_until_complete(f(*args, **kwds))
        finally:
            event_loop.close()
        return result
    return runner
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestCudaStream(CUDATestCase):
    """Tests for stream callbacks and asyncio integration of CUDA streams."""

    def test_add_callback(self):
        # The callback signals a threading.Event so the test can wait (with a
        # timeout) for the stream to invoke it.
        def callback(stream, status, event):
            event.set()

        stream = cuda.stream()
        callback_event = threading.Event()
        stream.add_callback(callback, callback_event)
        self.assertTrue(callback_event.wait(1.0))

    def test_add_callback_with_default_arg(self):
        callback_event = threading.Event()

        # When add_callback is given no user argument, the callback should
        # receive None.
        def callback(stream, status, arg):
            self.assertIsNone(arg)
            callback_event.set()

        stream = cuda.stream()
        stream.add_callback(callback)
        self.assertTrue(callback_event.wait(1.0))

    @with_asyncio_loop
    async def test_async_done(self):
        stream = cuda.stream()
        await stream.async_done()

    @with_asyncio_loop
    async def test_parallel_tasks(self):
        # Each task round-trips a value through the device on its own stream
        # and awaits that stream's completion.
        async def async_cuda_fn(value_in: float) -> float:
            stream = cuda.stream()
            h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
            h_src[:] = value_in
            d_ary = cuda.to_device(h_src, stream=stream)
            d_ary.copy_to_host(h_dst, stream=stream)
            done_result = await stream.async_done()
            self.assertEqual(done_result, stream)
            return h_dst.mean()

        values_in = [1, 2, 3, 4]
        tasks = [asyncio.create_task(async_cuda_fn(v)) for v in values_in]
        values_out = await asyncio.gather(*tasks)
        self.assertTrue(np.allclose(values_in, values_out))

    @with_asyncio_loop
    async def test_multiple_async_done(self):
        # Multiple waiters on the same stream must all complete.
        stream = cuda.stream()
        done_aws = [stream.async_done() for _ in range(4)]
        done = await asyncio.gather(*done_aws)
        for d in done:
            self.assertEqual(d, stream)

    @with_asyncio_loop
    async def test_multiple_async_done_multiple_streams(self):
        streams = [cuda.stream() for _ in range(4)]
        done_aws = [stream.async_done() for stream in streams]
        done = await asyncio.gather(*done_aws)
        # Ensure we got the four original streams in done
        self.assertSetEqual(set(done), set(streams))

    @with_asyncio_loop
    async def test_cancelled_future(self):
        # Cancelling one future must not prevent another waiter on the same
        # stream from completing.
        stream = cuda.stream()
        done1, done2 = stream.async_done(), stream.async_done()
        done1.cancel()
        await done2
        self.assertTrue(done1.cancelled())
        self.assertTrue(done2.done())
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestFailingStream(CUDATestCase):
    # This test can only be run in isolation because it corrupts the CUDA
    # context, which cannot be recovered from within the same process. It is
    # left here so that it can be run manually for debugging / testing purposes
    # - or may be re-enabled if in future there is infrastructure added for
    # running tests in a separate process (a subprocess cannot be used because
    # CUDA will have been initialized before the fork, so it cannot be used in
    # the child process).
    @unittest.skip
    @with_asyncio_loop
    async def test_failed_stream(self):
        # Launch a kernel that traps, then check the stream's async_done()
        # future surfaces the failure as an exception.
        ctx = cuda.current_context()
        module = ctx.create_module_ptx("""
.version 6.5
.target sm_30
.address_size 64
.visible .entry failing_kernel() { trap; }
""")
        failing_kernel = module.get_function("failing_kernel")
        stream = cuda.stream()
        failing_kernel.configure((1,), (1,), stream=stream).__call__()
        done = stream.async_done()
        with self.assertRaises(Exception):
            await done
        self.assertIsNotNone(done.exception())


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,8 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
    """Standard unittest load_tests hook: initialize the supported compute
    capabilities, then collect every test in this directory."""
    ensure_supported_ccs_initialized()
    suite_dir = os.path.dirname(__file__)
    return load_testsuite(loader, suite_dir)

Some files were not shown because too many files have changed in this diff Show More