This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,24 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import unittest
from numba.testing import load_testsuite
from numba import cuda
from os.path import dirname, join
def load_tests(loader, tests, pattern):
    """Assemble the CUDA test suite.

    Tests that need no GPU are always included; simulator and
    hardware-dependent suites are added only when CUDA is available,
    and the driver/kernel suites only for devices of CC >= 2.0.
    """
    suite = unittest.TestSuite()
    this_dir = dirname(__file__)
    ensure_supported_ccs_initialized()
    suite.addTests(load_testsuite(loader, join(this_dir, 'nocuda')))
    if not cuda.is_available():
        print("skipped CUDA tests")
        return suite
    suite.addTests(load_testsuite(loader, join(this_dir, 'cudasim')))
    gpus = cuda.list_devices()
    if gpus and gpus[0].compute_capability >= (2, 0):
        for subdir in ('cudadrv', 'cudapy', 'doc_examples'):
            suite.addTests(load_testsuite(loader, join(this_dir, subdir)))
    else:
        print("skipped CUDA tests because GPU CC < 2.0")
    return suite

View File

@@ -0,0 +1,8 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
    """Load every test module in this directory once supported compute
    capabilities have been initialized."""
    ensure_supported_ccs_initialized()
    this_dir = os.path.dirname(__file__)
    return load_testsuite(loader, this_dir)

View File

@@ -0,0 +1,145 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
class TestArrayAttr(CUDATestCase):
    """Device-array attribute tests: contiguity queries, ravel() and
    reshape() in C and Fortran orders, and stride-view failure modes."""

    def _check_contiguity(self, shape):
        # A C-ordered array and its Fortran-ordered copy must each
        # report exactly the matching contiguity flag on the device.
        cary = np.arange(np.prod(shape)).reshape(shape)
        fary = np.asfortranarray(cary)
        dcary = cuda.to_device(cary)
        dfary = cuda.to_device(fary)
        self.assertTrue(dcary.is_c_contiguous())
        self.assertFalse(dfary.is_c_contiguous())
        self.assertFalse(dcary.is_f_contiguous())
        self.assertTrue(dfary.is_f_contiguous())

    def test_contigous_2d(self):
        self._check_contiguity((2, 5))

    def test_contigous_3d(self):
        self._check_contiguity((2, 5, 2))

    def test_contigous_4d(self):
        self._check_contiguity((2, 5, 2, 3))

    def _check_ravel(self, host, orders):
        # ravel() must hand back a distinct, 1-D device array whose
        # contents match NumPy's ravel for each requested order.
        for order in orders:
            expect = host.ravel(order=order)
            dary = cuda.to_device(host)
            dflat = dary.ravel(order=order)
            flat = dflat.copy_to_host()
            self.assertIsNot(dary, dflat)
            self.assertEqual(flat.ndim, 1)
            self.assertPreciseEqual(expect, flat)

    def test_ravel_1d(self):
        self._check_ravel(np.arange(60), 'CFA')

    def _check_stride_view(self, dary, darystride):
        # Slicing produced a no-copy view: both device arrays expose
        # the same data pointer via the CUDA Array Interface.
        parent_ptr = dary.__cuda_array_interface__['data'][0]
        view_ptr = darystride.__cuda_array_interface__['data'][0]
        self.assertEqual(parent_ptr, view_ptr)
        # ravel() refuses to flatten a non-contiguous device array.
        with self.assertRaises(NotImplementedError):
            darystride.ravel()

    @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
    def test_ravel_stride_1d(self):
        dary = cuda.to_device(np.arange(60))
        self._check_stride_view(dary, dary[::2])

    def test_ravel_c(self):
        reshaped = np.arange(60).reshape(2, 5, 2, 3)
        # Default invocation (implicit C order).
        expect = reshaped.ravel(order='C')
        dary = cuda.to_device(reshaped)
        dflat = dary.ravel()
        flat = dflat.copy_to_host()
        self.assertIsNot(dary, dflat)
        self.assertEqual(flat.ndim, 1)
        self.assertPreciseEqual(expect, flat)
        # Explicit order kwarg.
        self._check_ravel(reshaped, 'CA')

    @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
    def test_ravel_stride_c(self):
        dary = cuda.to_device(np.arange(60).reshape(2, 5, 2, 3))
        self._check_stride_view(dary, dary[::2, ::2, ::2, ::2])

    def test_ravel_f(self):
        reshaped = np.asfortranarray(np.arange(60).reshape(2, 5, 2, 3))
        self._check_ravel(reshaped, 'FA')

    @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
    def test_ravel_stride_f(self):
        reshaped = np.asfortranarray(np.arange(60).reshape(2, 5, 2, 3))
        dary = cuda.to_device(reshaped)
        self._check_stride_view(dary, dary[::2, ::2, ::2, ::2])

    def _check_reshape(self, order):
        # Round-trip a reshape through the device and compare with
        # NumPy's result for the same order.
        ary = np.arange(10)
        expect = ary.reshape(2, 5, order=order)
        dary = cuda.to_device(ary)
        got = dary.reshape(2, 5, order=order).copy_to_host()
        self.assertPreciseEqual(expect, got)

    def test_reshape_c(self):
        self._check_reshape('C')

    def test_reshape_f(self):
        self._check_reshape('F')
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,145 @@
import numbers
from ctypes import byref
import weakref
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
from numba.cuda.cudadrv import driver
class TestContextStack(CUDATestCase):
    """Behaviour of the ``cuda.gpus`` device list / context stack."""

    def setUp(self):
        super().setUp()
        # Start each test from a clean slate with no active context.
        cuda.close()

    def test_gpus_current(self):
        # No device is current until one is explicitly entered.
        self.assertIs(cuda.gpus.current, None)
        with cuda.gpus[0]:
            self.assertEqual(int(cuda.gpus.current.id), 0)

    def test_gpus_len(self):
        self.assertGreater(len(cuda.gpus), 0)

    def test_gpus_iter(self):
        devices_seen = list(cuda.gpus)
        self.assertGreater(len(devices_seen), 0)
class TestContextAPI(CUDATestCase):
    """Context-level API tests: memory queries and GPU switching."""

    def tearDown(self):
        super().tearDown()
        cuda.close()

    def test_context_memory(self):
        try:
            mem = cuda.current_context().get_memory_info()
        except NotImplementedError:
            self.skipTest('EMM Plugin does not implement get_memory_info()')
        # Result behaves as a (free, total) named tuple of numbers.
        self.assertIsInstance(mem.free, numbers.Number)
        self.assertEqual(mem.free, mem[0])
        self.assertIsInstance(mem.total, numbers.Number)
        self.assertEqual(mem.total, mem[1])
        self.assertLessEqual(mem.free, mem.total)

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    @skip_on_cudasim('CUDA HW required')
    def test_forbidden_context_switch(self):
        # Cannot switch context inside a `cuda.require_context`
        @cuda.require_context
        def try_switch():
            with cuda.gpus[1]:
                pass

        with cuda.gpus[0]:
            with self.assertRaises(RuntimeError) as cm:
                try_switch()
            self.assertIn("Cannot switch CUDA-context.", str(cm.exception))

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_accepted_context_switch(self):
        def enter_other_gpu():
            with cuda.gpus[1]:
                return cuda.current_context().device.id

        with cuda.gpus[0]:
            other_id = enter_other_gpu()
        self.assertEqual(int(other_id), 1)
@skip_on_cudasim('CUDA HW required')
class Test3rdPartyContext(CUDATestCase):
    """Interoperation with CUDA contexts created outside Numba via the
    raw driver API (emulating a third-party library)."""

    def tearDown(self):
        super().tearDown()
        cuda.close()

    def test_attached_primary(self, extra_work=lambda: None):
        # Emulate primary context creation by 3rd party
        the_driver = driver.driver
        if driver.USE_NV_BINDING:
            dev = driver.binding.CUdevice(0)
            hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
        else:
            dev = 0
            hctx = driver.drvapi.cu_context()
            the_driver.cuDevicePrimaryCtxRetain(byref(hctx), dev)
        try:
            ctx = driver.Context(weakref.proxy(self), hctx)
            ctx.push()
            # Check that the context from numba matches the created primary
            # context.
            my_ctx = cuda.current_context()
            if driver.USE_NV_BINDING:
                self.assertEqual(int(my_ctx.handle), int(ctx.handle))
            else:
                self.assertEqual(my_ctx.handle.value, ctx.handle.value)
            # Hook for subclasses/other tests to run extra checks while
            # the externally-retained primary context is current.
            extra_work()
        finally:
            # Pop before releasing so the primary context refcount from
            # cuDevicePrimaryCtxRetain is balanced.
            ctx.pop()
            the_driver.cuDevicePrimaryCtxRelease(dev)

    def test_attached_non_primary(self):
        # Emulate non-primary context creation by 3rd party
        the_driver = driver.driver
        if driver.USE_NV_BINDING:
            flags = 0
            dev = driver.binding.CUdevice(0)
            hctx = the_driver.cuCtxCreate(flags, dev)
        else:
            hctx = driver.drvapi.cu_context()
            the_driver.cuCtxCreate(byref(hctx), 0, 0)
        try:
            cuda.current_context()
        except RuntimeError as e:
            # Expecting an error about non-primary CUDA context
            self.assertIn("Numba cannot operate on non-primary CUDA context ",
                          str(e))
        else:
            self.fail("No RuntimeError raised")
        finally:
            the_driver.cuCtxDestroy(hctx)

    def test_cudajit_in_attached_primary_context(self):
        # Compiling and launching a kernel must work while an externally
        # retained primary context is current.
        def do():
            from numba import cuda

            @cuda.jit
            def foo(a):
                for i in range(a.size):
                    a[i] = i
            a = cuda.device_array(10)
            foo[1, 1](a)
            self.assertEqual(list(a.copy_to_host()), list(range(10)))
        self.test_attached_primary(do)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,376 @@
from itertools import product
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
from unittest.mock import patch
class CudaArrayIndexing(CUDATestCase):
    """Scalar indexing of device arrays: every valid (including
    negative) index matches NumPy, and out-of-bounds indices raise
    IndexError."""

    def test_index_1d(self):
        arr = np.arange(10)
        darr = cuda.to_device(arr)
        x, = arr.shape
        for i in range(-x, x):
            self.assertEqual(arr[i], darr[i])
        for bad in (-x - 1, x):
            with self.assertRaises(IndexError):
                darr[bad]

    def test_index_2d(self):
        arr = np.arange(3 * 4).reshape(3, 4)
        darr = cuda.to_device(arr)
        x, y = arr.shape
        for i, j in product(range(-x, x), range(-y, y)):
            self.assertEqual(arr[i, j], darr[i, j])
        for bad in ((-x - 1, 0), (x, 0), (0, -y - 1), (0, y)):
            with self.assertRaises(IndexError):
                darr[bad]

    def test_index_3d(self):
        arr = np.arange(3 * 4 * 5).reshape(3, 4, 5)
        darr = cuda.to_device(arr)
        x, y, z = arr.shape
        for i, j, k in product(range(-x, x), range(-y, y), range(-z, z)):
            self.assertEqual(arr[i, j, k], darr[i, j, k])
        for bad in ((-x - 1, 0, 0), (x, 0, 0), (0, -y - 1, 0),
                    (0, y, 0), (0, 0, -z - 1), (0, 0, z)):
            with self.assertRaises(IndexError):
                darr[bad]
class CudaArrayStridedSlice(CUDATestCase):
    """Strided slicing of device arrays matches NumPy's behaviour."""

    def test_strided_index_1d(self):
        arr = np.arange(10)
        darr = cuda.to_device(arr)
        for start in range(arr.size):
            np.testing.assert_equal(arr[start::2],
                                    darr[start::2].copy_to_host())

    def test_strided_index_2d(self):
        arr = np.arange(6 * 7).reshape(6, 7)
        darr = cuda.to_device(arr)
        for i, j in product(range(arr.shape[0]), range(arr.shape[1])):
            np.testing.assert_equal(arr[i::2, j::2],
                                    darr[i::2, j::2].copy_to_host())

    def test_strided_index_3d(self):
        arr = np.arange(6 * 7 * 8).reshape(6, 7, 8)
        darr = cuda.to_device(arr)
        for i, j, k in product(*map(range, arr.shape)):
            np.testing.assert_equal(arr[i::2, j::2, k::2],
                                    darr[i::2, j::2, k::2].copy_to_host())
class CudaArraySlicing(CUDATestCase):
    """Plain (non-strided) slicing and dimension selection on device
    arrays, including empty and out-of-bound slices."""

    def test_prefix_1d(self):
        arr = np.arange(5)
        darr = cuda.to_device(arr)
        for start in range(arr.size):
            expect = arr[start:]
            got = darr[start:].copy_to_host()
            self.assertTrue(np.all(expect == got))

    def test_prefix_2d(self):
        arr = np.arange(3 ** 2).reshape(3, 3)
        darr = cuda.to_device(arr)
        for i, j in product(range(arr.shape[0]), range(arr.shape[1])):
            expect = arr[i:, j:]
            sliced = darr[i:, j:]
            # Shape and strides of the view must match NumPy's.
            self.assertEqual(expect.shape, sliced.shape)
            self.assertEqual(expect.strides, sliced.strides)
            got = sliced.copy_to_host()
            self.assertTrue(np.all(expect == got))

    def test_select_3d_first_two_dim(self):
        arr = np.arange(3 * 4 * 5).reshape(3, 4, 5)
        darr = cuda.to_device(arr)

        def check(expect, sliced):
            # Selection must agree with NumPy in shape, strides, data.
            self.assertEqual(expect.shape, sliced.shape)
            self.assertEqual(expect.strides, sliced.strides)
            got = sliced.copy_to_host()
            self.assertTrue(np.all(expect == got))

        # Select first dimension
        for i in range(arr.shape[0]):
            check(arr[i], darr[i])
        # Select second dimension
        for i, j in product(range(arr.shape[0]), range(arr.shape[1])):
            check(arr[i, j], darr[i, j])

    def _check_select(self, order):
        # Select single rows/columns along every axis pairing and
        # compare against the host array of the given memory order.
        a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order=order)
        da = cuda.to_device(a)
        for i in range(a.shape[0]):
            for j in range(a.shape[1]):
                self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(),
                                               a[i, j, :]))
            for j in range(a.shape[2]):
                self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(),
                                               a[i, :, j]))
        for i, j in product(range(a.shape[1]), range(a.shape[2])):
            self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(),
                                           a[:, i, j]))

    def test_select_f(self):
        self._check_select('F')

    def test_select_c(self):
        self._check_select('C')

    def test_prefix_select(self):
        arr = np.arange(5 * 7).reshape(5, 7, order='F')
        darr = cuda.to_device(arr)
        self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1]))

    def test_negative_slicing_1d(self):
        arr = np.arange(10)
        darr = cuda.to_device(arr)
        for lo, hi in product(range(-10, 10), repeat=2):
            np.testing.assert_array_equal(arr[lo:hi],
                                          darr[lo:hi].copy_to_host())

    def test_negative_slicing_2d(self):
        arr = np.arange(12).reshape(3, 4)
        darr = cuda.to_device(arr)
        for x, y, w, s in product(range(-4, 4), repeat=4):
            np.testing.assert_array_equal(arr[x:y, w:s],
                                          darr[x:y, w:s].copy_to_host())

    def test_empty_slice_1d(self):
        arr = np.arange(5)
        darr = cuda.to_device(arr)
        for i in range(darr.shape[0]):
            np.testing.assert_array_equal(darr[i:i].copy_to_host(), arr[i:i])
        # An empty slice of an empty slice is still empty.
        np.testing.assert_array_equal(darr[:0][:0].copy_to_host(), np.empty(0))
        # Out-of-bound slices just produce empty slices.
        np.testing.assert_array_equal(darr[:0][:1].copy_to_host(),
                                      arr[:0][:1])
        np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(),
                                      arr[:0][-1:])

    def test_empty_slice_2d(self):
        arr = np.arange(5 * 7).reshape(5, 7)
        darr = cuda.to_device(arr)
        np.testing.assert_array_equal(darr[:0].copy_to_host(), arr[:0])
        np.testing.assert_array_equal(darr[3, :0].copy_to_host(), arr[3, :0])
        # An empty slice of an empty slice is still empty.
        np.testing.assert_array_equal(darr[:0][:0].copy_to_host(),
                                      np.empty((0, 7)))
        # Out-of-bound slices just produce empty slices.
        np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1])
        np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(),
                                      arr[:0][-1:])
class CudaArraySetting(CUDATestCase):
    """
    Most of the slicing logic is tested in the cases above, so these
    tests focus on the setting logic.
    """

    def _check_setitem(self, arr, key, value):
        # Copy to the device first, then apply the same assignment on
        # host and device and compare the round-tripped result.
        darr = cuda.to_device(arr)
        arr[key] = value
        darr[key] = value
        np.testing.assert_array_equal(darr.copy_to_host(), arr)

    def test_scalar(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), (2, 2), 500)

    def test_rank(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), 2, 500)

    def test_broadcast(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7),
                            np.s_[:, 2], 500)

    def test_array_assign_column(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), 2,
                            np.full(shape=7, fill_value=400))

    def test_array_assign_row(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), np.s_[:, 2],
                            np.full(shape=5, fill_value=400))

    def test_array_assign_subarray(self):
        self._check_setitem(np.arange(5 * 6 * 7).reshape(5, 6, 7), 2,
                            np.full(shape=(6, 7), fill_value=400))

    def test_array_assign_deep_subarray(self):
        self._check_setitem(np.arange(5 * 6 * 7 * 8).reshape(5, 6, 7, 8),
                            np.s_[:, :, 2],
                            np.full(shape=(5, 6, 8), fill_value=400))

    def test_array_assign_all(self):
        self._check_setitem(np.arange(5 * 7).reshape(5, 7), np.s_[:],
                            np.full(shape=(5, 7), fill_value=400))

    def test_strides(self):
        self._check_setitem(np.ones(20), np.s_[::2], 500)

    def test_incompatible_highdim(self):
        darr = cuda.to_device(np.arange(5 * 7))
        with self.assertRaises(ValueError) as e:
            darr[:] = np.ones(shape=(1, 2, 3))
        # The wording differs between the hardware target and the
        # simulator (which surfaces NumPy's broadcast error).
        self.assertIn(
            member=str(e.exception),
            container=[
                "Can't assign 3-D array to 1-D self",  # device
                "could not broadcast input array from shape (2,3) "
                "into shape (35,)",  # simulator, NP >= 1.20
            ])

    def test_incompatible_shape(self):
        darr = cuda.to_device(np.arange(5))
        with self.assertRaises(ValueError) as e:
            darr[:] = [1, 3]
        self.assertIn(
            member=str(e.exception),
            container=[
                "Can't copy sequence with size 2 to array axis 0 with "
                "dimension 5",  # device
                "could not broadcast input array from shape (2,) into "
                "shape (5,)",  # simulator, NP >= 1.20
            ])

    @skip_on_cudasim('cudasim does not use streams and operates synchronously')
    def test_sync(self):
        # There should be a synchronization when no stream is supplied
        darr = cuda.to_device(np.arange(5))
        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                          return_value=None) as sync_spy:
            darr[0] = 10
        sync_spy.assert_called_once()

    @skip_on_cudasim('cudasim does not use streams and operates synchronously')
    def test_no_sync_default_stream(self):
        # There should not be a synchronization when the array has a default
        # stream, whether it is the default stream, the legacy default stream,
        # the per-thread default stream, or another stream.
        candidates = (cuda.stream(), cuda.default_stream(),
                      cuda.legacy_default_stream(),
                      cuda.per_thread_default_stream())
        for stream in candidates:
            darr = cuda.to_device(np.arange(5), stream=stream)
            with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                              return_value=None) as sync_spy:
                darr[0] = 10
            sync_spy.assert_not_called()

    @skip_on_cudasim('cudasim does not use streams and operates synchronously')
    def test_no_sync_supplied_stream(self):
        # There should not be a synchronization when a stream is supplied for
        # the setitem call, whether it is the default stream, the legacy
        # default stream, the per-thread default stream, or another stream.
        candidates = (cuda.stream(), cuda.default_stream(),
                      cuda.legacy_default_stream(),
                      cuda.per_thread_default_stream())
        for stream in candidates:
            darr = cuda.to_device(np.arange(5))
            with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                              return_value=None) as sync_spy:
                darr.setitem(0, 10, stream=stream)
            sync_spy.assert_not_called()

    @unittest.skip('Requires PR #6367')
    def test_issue_6505(self):
        # On Windows, the writes to ary_v would not be visible prior to the
        # assertion, due to the assignment being done with a kernel launch
        # that returns asynchronously - there should now be a sync after the
        # kernel launch to ensure that the writes are always visible.
        ary = cuda.mapped_array(2, dtype=np.int32)
        ary[:] = 0
        ary_v = ary.view('u1')
        ary_v[1] = 1
        ary_v[5] = 1
        self.assertEqual(sum(ary), 512)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,21 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase
class TestCudaAutoContext(CUDATestCase):
    def test_auto_context(self):
        """A customer-reported problem was that using cuda.to_device did
        not implicitly create a CUDA context. This test exercises that
        path by round-tripping an array through the device.
        """
        src = np.arange(10, dtype=np.float32)
        dst = np.empty_like(src)
        dev = cuda.to_device(src)
        dev.copy_to_host(dst)
        self.assertTrue(np.allclose(src, dst))
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,179 @@
import numpy as np
import ctypes
from numba.cuda.cudadrv.devicearray import (DeviceRecord, from_record_like,
auto_device)
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
from numba.np import numpy_support
from numba import cuda
# Width of the fixed-size string field in `recordtype`.
N_CHARS = 5

# Aligned record mixing float, int, complex and fixed-width string fields.
recordtype = np.dtype(
    [
        ('a', np.float64),
        ('b', np.int32),
        ('c', np.complex64),
        ('d', (np.str_, N_CHARS)),
    ],
    align=True,
)

# Aligned record containing a small embedded array field.
recordwitharray = np.dtype(
    [
        ('g', np.int32),
        ('h', np.float32, 2),
    ],
    align=True,
)

# Record containing a 3x3 matrix field.
recwithmat = np.dtype([('i', np.int32),
                       ('j', np.float32, (3, 3))])

# Record nesting another record that itself contains a matrix.
recwithrecwithmat = np.dtype([('x', np.int32), ('y', recwithmat)])
@skip_on_cudasim('Device Record API unsupported in the simulator')
class TestCudaDeviceRecord(CUDATestCase):
    """
    Tests the DeviceRecord class with np.void host types.
    """

    def setUp(self):
        super().setUp()
        self._create_data(np.zeros)

    def _create_data(self, array_ctor):
        # Build one all-zero and one non-zero host record of a shared
        # aligned dtype; subclasses pass a different container ctor.
        self.dtype = np.dtype([('a', np.int32), ('b', np.float32)],
                              align=True)
        self.hostz = array_ctor(1, self.dtype)[0]
        self.hostnz = array_ctor(1, self.dtype)[0]
        self.hostnz['a'] = 10
        self.hostnz['b'] = 11.0

    def _check_device_record(self, reference, rec):
        # A device record is scalar-shaped and mirrors the host dtype.
        self.assertEqual(rec.shape, tuple())
        self.assertEqual(rec.strides, tuple())
        self.assertEqual(rec.dtype, reference.dtype)
        self.assertEqual(rec.alloc_size, reference.dtype.itemsize)
        self.assertIsNotNone(rec.gpu_data)
        self.assertNotEqual(rec.device_ctypes_pointer, ctypes.c_void_p(0))
        numba_type = numpy_support.from_dtype(reference.dtype)
        self.assertEqual(rec._numba_type_, numba_type)

    def test_device_record_interface(self):
        host_rec = self.hostz.copy()
        dev_rec = DeviceRecord(self.dtype)
        self._check_device_record(host_rec, dev_rec)

    def test_device_record_copy(self):
        host_rec = self.hostz.copy()
        dev_rec = DeviceRecord(self.dtype)
        dev_rec.copy_to_device(host_rec)
        # Copy back and check values are all zeros
        back = self.hostnz.copy()
        dev_rec.copy_to_host(back)
        np.testing.assert_equal(self.hostz, back)
        # Copy non-zero values to GPU and back and check values
        nonzero = self.hostnz.copy()
        dev_rec.copy_to_device(nonzero)
        back2 = self.hostz.copy()
        dev_rec.copy_to_host(back2)
        np.testing.assert_equal(back2, self.hostnz)

    def test_from_record_like(self):
        # Create record from host record
        host_rec = self.hostz.copy()
        dev_rec = from_record_like(host_rec)
        self._check_device_record(host_rec, dev_rec)
        # Create record from device record and check for distinct data
        dev_rec2 = from_record_like(dev_rec)
        self._check_device_record(dev_rec, dev_rec2)
        self.assertNotEqual(dev_rec.gpu_data, dev_rec2.gpu_data)

    def test_auto_device(self):
        # Create record from host record
        host_rec = self.hostnz.copy()
        dev_rec, new_gpu_obj = auto_device(host_rec)
        self._check_device_record(host_rec, dev_rec)
        self.assertTrue(new_gpu_obj)
        # Copy data back and check it is equal to auto_device arg
        back = self.hostz.copy()
        dev_rec.copy_to_host(back)
        np.testing.assert_equal(back, host_rec)
class TestCudaDeviceRecordWithRecord(TestCudaDeviceRecord):
    """
    Tests the DeviceRecord class with np.record host types
    """

    def setUp(self):
        # Deliberately skip the parent's setUp: reuse its machinery but
        # build the host data from np.recarray instead of np.zeros.
        CUDATestCase.setUp(self)
        self._create_data(np.recarray)
@skip_on_cudasim('Structured array attr access not supported in simulator')
class TestRecordDtypeWithStructArrays(CUDATestCase):
    '''
    Test operation of device arrays on structured arrays.
    '''

    def _createSampleArrays(self):
        # Device allocations for each record dtype under test; the
        # scalar samples are single-element arrays indexed down.
        self.sample1d = cuda.device_array(3, dtype=recordtype)
        self.samplerec1darr = cuda.device_array(1, dtype=recordwitharray)[0]
        self.samplerecmat = cuda.device_array(1, dtype=recwithmat)[0]

    def setUp(self):
        super().setUp()
        self._createSampleArrays()
        ary = self.sample1d
        for i in range(ary.size):
            v = i + 1
            ary[i]['a'] = v / 2
            ary[i]['b'] = v
            ary[i]['c'] = v * 1j
            ary[i]['d'] = str(v) * N_CHARS

    def test_structured_array1(self):
        # Values written in setUp must read back unchanged.
        ary = self.sample1d
        for i in range(self.sample1d.size):
            v = i + 1
            self.assertEqual(ary[i]['a'], v / 2)
            self.assertEqual(ary[i]['b'], v)
            self.assertEqual(ary[i]['c'], v * 1j)
            self.assertEqual(ary[i]['d'], str(v) * N_CHARS)

    def test_structured_array2(self):
        # Field and embedded-array element assignment on a device record.
        ary = self.samplerec1darr
        ary['g'] = 2
        ary['h'][0] = 3.0
        ary['h'][1] = 4.0
        self.assertEqual(ary['g'], 2)
        self.assertEqual(ary['h'][0], 3.0)
        self.assertEqual(ary['h'][1], 4.0)

    def test_structured_array3(self):
        # Whole-matrix field assignment on a device record.
        ary = self.samplerecmat
        mat = np.array([[5.0, 10.0, 15.0],
                        [20.0, 25.0, 30.0],
                        [35.0, 40.0, 45.0]],
                       dtype=np.float32).reshape(3, 3)
        ary['j'][:] = mat
        np.testing.assert_equal(ary['j'], mat)

    def test_structured_array4(self):
        # Nested-record field access on a device array.
        arr = np.zeros(1, dtype=recwithrecwithmat)
        d_arr = cuda.to_device(arr)
        d_arr[0]['y']['i'] = 1
        self.assertEqual(d_arr[0]['y']['i'], 1)
        d_arr[0]['y']['j'][0, 0] = 2.0
        self.assertEqual(d_arr[0]['y']['j'][0, 0], 2.0)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,235 @@
from ctypes import byref, c_int, c_void_p, sizeof
from numba.cuda.cudadrv.driver import (host_to_device, device_to_host, driver,
launch_kernel)
from numba.cuda.cudadrv import devices, drvapi, driver as _driver
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
# PTX for a kernel that stores each thread's x-index into an int32
# array parameter; pre-Fermi variant (PTX ISA 1.4, sm_10).
# NOTE: the string content is loaded by the driver at runtime and must
# not be altered.
ptx1 = '''
.version 1.4
.target sm_10, map_f64_to_f32
.entry _Z10helloworldPi (
.param .u64 __cudaparm__Z10helloworldPi_A)
{
.reg .u32 %r<3>;
.reg .u64 %rd<6>;
.loc 14 4 0
$LDWbegin__Z10helloworldPi:
.loc 14 6 0
cvt.s32.u16 %r1, %tid.x;
ld.param.u64 %rd1, [__cudaparm__Z10helloworldPi_A];
cvt.u64.u16 %rd2, %tid.x;
mul.lo.u64 %rd3, %rd2, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.s32 [%rd4+0], %r1;
.loc 14 7 0
exit;
$LDWend__Z10helloworldPi:
} // _Z10helloworldPi
'''
# Same kernel for Fermi and newer devices (PTX ISA 3.0, sm_20,
# 64-bit addressing).
ptx2 = '''
.version 3.0
.target sm_20
.address_size 64
.file 1 "/tmp/tmpxft_000012c7_00000000-9_testcuda.cpp3.i"
.file 2 "testcuda.cu"
.entry _Z10helloworldPi(
.param .u64 _Z10helloworldPi_param_0
)
{
.reg .s32 %r<3>;
.reg .s64 %rl<5>;
ld.param.u64 %rl1, [_Z10helloworldPi_param_0];
cvta.to.global.u64 %rl2, %rl1;
.loc 2 6 1
mov.u32 %r1, %tid.x;
mul.wide.u32 %rl3, %r1, 4;
add.s64 %rl4, %rl2, %rl3;
st.global.u32 [%rl4], %r1;
.loc 2 7 2
ret;
}
'''
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestCudaDriver(CUDATestCase):
    """Exercises the low-level driver wrapper: PTX module loading,
    raw kernel launches, stream objects and occupancy queries."""

    def setUp(self):
        super().setUp()
        self.assertTrue(len(devices.gpus) > 0)
        self.context = devices.get_context()
        device = self.context.device
        ccmajor, _ = device.compute_capability
        # Fermi (CC 2.x) and newer devices need the PTX 3.0 kernel.
        if ccmajor >= 2:
            self.ptx = ptx2
        else:
            self.ptx = ptx1

    def tearDown(self):
        super().tearDown()
        del self.context

    def test_cuda_driver_basic(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')
        array = (c_int * 100)()
        memory = self.context.memalloc(sizeof(array))
        host_to_device(memory, array, sizeof(array))
        ptr = memory.device_ctypes_pointer
        stream = 0
        # The NVIDIA binding uses its own pointer/stream wrapper types.
        if _driver.USE_NV_BINDING:
            ptr = c_void_p(int(ptr))
            stream = _driver.binding.CUstream(stream)
        launch_kernel(function.handle,  # Kernel
                      1, 1, 1,          # gx, gy, gz
                      100, 1, 1,        # bx, by, bz
                      0,                # dynamic shared mem
                      stream,           # stream
                      [ptr])            # arguments
        device_to_host(array, memory, sizeof(array))
        # The kernel writes each thread's index into the array.
        for i, v in enumerate(array):
            self.assertEqual(i, v)
        module.unload()

    def test_cuda_driver_stream_operations(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')
        array = (c_int * 100)()
        stream = self.context.create_stream()
        # auto_synchronize() syncs the stream on exit, so the
        # device-to-host copy is complete before the assertions.
        with stream.auto_synchronize():
            memory = self.context.memalloc(sizeof(array))
            host_to_device(memory, array, sizeof(array), stream=stream)
            ptr = memory.device_ctypes_pointer
            if _driver.USE_NV_BINDING:
                ptr = c_void_p(int(ptr))
            launch_kernel(function.handle,  # Kernel
                          1, 1, 1,          # gx, gy, gz
                          100, 1, 1,        # bx, by, bz
                          0,                # dynamic shared mem
                          stream.handle,    # stream
                          [ptr])            # arguments
            device_to_host(array, memory, sizeof(array), stream=stream)
        for i, v in enumerate(array):
            self.assertEqual(i, v)

    def test_cuda_driver_default_stream(self):
        # Test properties of the default stream
        ds = self.context.get_default_stream()
        self.assertIn("Default CUDA stream", repr(ds))
        self.assertEqual(0, int(ds))
        # bool(stream) is the check that is done in memcpy to decide if async
        # version should be used. So the default (0) stream should be true-ish
        # even though 0 is usually false-ish in Python.
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_legacy_default_stream(self):
        # Test properties of the legacy default stream
        ds = self.context.get_legacy_default_stream()
        self.assertIn("Legacy default CUDA stream", repr(ds))
        self.assertEqual(1, int(ds))
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_per_thread_default_stream(self):
        # Test properties of the per-thread default stream
        ds = self.context.get_per_thread_default_stream()
        self.assertIn("Per-thread default CUDA stream", repr(ds))
        self.assertEqual(2, int(ds))
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_stream(self):
        # Test properties of non-default streams
        s = self.context.create_stream()
        self.assertIn("CUDA stream", repr(s))
        self.assertNotIn("Default", repr(s))
        self.assertNotIn("External", repr(s))
        self.assertNotEqual(0, int(s))
        self.assertTrue(s)
        self.assertFalse(s.external)

    def test_cuda_driver_external_stream(self):
        # Test properties of a stream created from an external stream object.
        # We use the driver API directly to create a stream, to emulate an
        # external library creating a stream
        if _driver.USE_NV_BINDING:
            handle = driver.cuStreamCreate(0)
            ptr = int(handle)
        else:
            handle = drvapi.cu_stream()
            driver.cuStreamCreate(byref(handle), 0)
            ptr = handle.value
        s = self.context.create_external_stream(ptr)
        self.assertIn("External CUDA stream", repr(s))
        # Ensure neither "Default" nor "default"
        self.assertNotIn("efault", repr(s))
        self.assertEqual(ptr, int(s))
        self.assertTrue(s)
        self.assertTrue(s.external)

    def test_cuda_driver_occupancy(self):
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')
        value = self.context.get_active_blocks_per_multiprocessor(function,
                                                                  128, 128)
        self.assertTrue(value > 0)

        def b2d(bs):
            # Identity block-size-to-dynamic-shared-memory function.
            return bs
        grid, block = self.context.get_max_potential_block_size(function, b2d,
                                                                128, 128)
        self.assertTrue(grid > 0)
        self.assertTrue(block > 0)
class TestDevice(CUDATestCase):
    def test_device_get_uuid(self):
        # A device UUID looks like:
        #
        #     GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643
        #
        # Build an RE matching that form and verify the returned UUID
        # against it. Device UUIDs may not conform to the version and
        # variant bits of the UUID specification (RFC 4122), so those
        # bits are not extracted or validated.
        hex_group = '[0-9a-f]{%d}'
        widths = (8, 4, 4, 4, 12)
        body = '-'.join(hex_group % w for w in widths)
        uuid_format = '^GPU-%s$' % body
        dev = devices.get_context().device
        self.assertRegex(dev.uuid, uuid_format)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,22 @@
from numba.cuda.testing import unittest
from numba.cuda.testing import skip_on_cudasim, skip_unless_conda_cudatoolkit
from numba.misc.findlib import find_lib
@skip_on_cudasim('Library detection unsupported in the simulator')
@skip_unless_conda_cudatoolkit
class TestLibraryDetection(unittest.TestCase):
    def test_detect(self):
        """
        This test is solely present to ensure that shipped cudatoolkits have
        additional core libraries in locations that Numba scans by default.
        PyCulib (and potentially others) rely on Numba's library finding
        capacity to find and subsequently load these libraries.
        """
        for libname in ('nvvm',):
            self.assertNotEqual(find_lib(libname), [])
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,193 @@
import ctypes
import numpy as np
from numba.cuda.cudadrv import driver, drvapi, devices
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('CUDA Memory API unsupported in the simulator')
class TestCudaMemory(ContextResettingTestCase):
    """Tests for device/host memory allocation APIs and pointer ownership."""

    def setUp(self):
        super().setUp()
        self.context = devices.get_context()

    def tearDown(self):
        # Drop our context reference before the base class resets the context.
        del self.context
        super(TestCudaMemory, self).tearDown()

    def _template(self, obj):
        # Shared checks: ``obj`` is recognised as device memory and exposes a
        # ctypes pointer of the class matching the active driver binding.
        self.assertTrue(driver.is_device_memory(obj))
        driver.require_device_memory(obj)
        if driver.USE_NV_BINDING:
            expected_class = driver.binding.CUdeviceptr
        else:
            expected_class = drvapi.cu_device_ptr
        self.assertTrue(isinstance(obj.device_ctypes_pointer,
                                   expected_class))

    def test_device_memory(self):
        devmem = self.context.memalloc(1024)
        self._template(devmem)

    def test_device_view(self):
        devmem = self.context.memalloc(1024)
        self._template(devmem.view(10))

    def test_host_alloc(self):
        devmem = self.context.memhostalloc(1024, mapped=True)
        self._template(devmem)

    def test_pinned_memory(self):
        ary = np.arange(10)
        devmem = self.context.mempin(ary, ary.ctypes.data,
                                     ary.size * ary.dtype.itemsize,
                                     mapped=True)
        self._template(devmem)

    def test_managed_memory(self):
        devmem = self.context.memallocmanaged(1024)
        self._template(devmem)

    def test_derived_pointer(self):
        # Use MemoryPointer.view to create derived pointer

        def handle_val(mem):
            # Normalise a pointer handle to an int across the two bindings.
            if driver.USE_NV_BINDING:
                return int(mem.handle)
            else:
                return mem.handle.value

        def check(m, offset):
            # create view
            v1 = m.view(offset)
            self.assertEqual(handle_val(v1.owner), handle_val(m))
            self.assertEqual(m.refct, 2)
            self.assertEqual(handle_val(v1) - offset, handle_val(v1.owner))
            # create a view of the view
            v2 = v1.view(offset)
            self.assertEqual(handle_val(v2.owner), handle_val(m))
            self.assertEqual(handle_val(v2.owner), handle_val(m))
            self.assertEqual(handle_val(v2) - offset * 2,
                             handle_val(v2.owner))
            self.assertEqual(m.refct, 3)
            # Deleting each view must decrement the owner's refcount.
            del v2
            self.assertEqual(m.refct, 2)
            del v1
            self.assertEqual(m.refct, 1)

        m = self.context.memalloc(1024)
        check(m=m, offset=0)
        check(m=m, offset=1)

    def test_user_extension(self):
        # User can use MemoryPointer to wrap externally defined pointers.
        # This test checks if the finalizer is invoked at the correct time
        fake_ptr = ctypes.c_void_p(0xdeadbeef)
        dtor_invoked = [0]

        def dtor():
            dtor_invoked[0] += 1

        # Ensure finalizer is called when pointer is deleted
        ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr,
                                   size=40, finalizer=dtor)
        self.assertEqual(dtor_invoked[0], 0)
        del ptr
        self.assertEqual(dtor_invoked[0], 1)

        # Ensure removing derived pointer doesn't call finalizer
        ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr,
                                   size=40, finalizer=dtor)
        owned = ptr.own()
        del owned
        self.assertEqual(dtor_invoked[0], 1)
        del ptr
        self.assertEqual(dtor_invoked[0], 2)
class TestCudaMemoryFunctions(ContextResettingTestCase):
    """Round-trip tests for the low-level memcpy/memset driver wrappers."""

    def setUp(self):
        super().setUp()
        self.context = devices.get_context()

    def tearDown(self):
        del self.context
        super(TestCudaMemoryFunctions, self).tearDown()

    def test_memcpy(self):
        """Host -> device -> host copy preserves the data."""
        hstary = np.arange(100, dtype=np.uint32)
        hstary2 = np.arange(100, dtype=np.uint32)
        sz = hstary.size * hstary.dtype.itemsize
        devary = self.context.memalloc(sz)
        driver.host_to_device(devary, hstary, sz)
        driver.device_to_host(hstary2, devary, sz)
        self.assertTrue(np.all(hstary == hstary2))

    def test_memset(self):
        """device_memset fills every byte of the allocation with the value."""
        dtype = np.dtype('uint32')
        n = 10
        # Derive the size from ``n`` rather than repeating the literal 10, so
        # the buffer length and the expected array stay in sync if ``n`` is
        # ever changed.
        sz = dtype.itemsize * n
        devary = self.context.memalloc(sz)
        driver.device_memset(devary, 0xab, sz)

        hstary = np.empty(n, dtype=dtype)
        driver.device_to_host(hstary, devary, sz)

        hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32'))
        self.assertTrue(np.all(hstary == hstary2))

    def test_d2d(self):
        """Device-to-device copy moves data between two allocations."""
        hst = np.arange(100, dtype=np.uint32)
        hst2 = np.empty_like(hst)
        sz = hst.size * hst.dtype.itemsize
        dev1 = self.context.memalloc(sz)
        dev2 = self.context.memalloc(sz)
        driver.host_to_device(dev1, hst, sz)
        driver.device_to_device(dev2, dev1, sz)
        driver.device_to_host(hst2, dev2, sz)
        self.assertTrue(np.all(hst == hst2))
@skip_on_cudasim('CUDA Memory API unsupported in the simulator')
class TestMVExtent(ContextResettingTestCase):
    """Checks host_memory_extents / host_memory_size over host-side objects."""

    def test_c_contiguous_array(self):
        arr = np.arange(100)
        nbytes = arr.dtype.itemsize * arr.size
        start, _end = driver.host_memory_extents(arr)
        self.assertTrue(arr.ctypes.data == start)
        self.assertTrue(nbytes == driver.host_memory_size(arr))

    def test_f_contiguous_array(self):
        arr = np.asfortranarray(np.arange(100).reshape(2, 50))
        nbytes = arr.dtype.itemsize * np.prod(arr.shape)
        start, _end = driver.host_memory_extents(arr)
        self.assertTrue(arr.ctypes.data == start)
        self.assertTrue(nbytes == driver.host_memory_size(arr))

    def test_single_element_array(self):
        arr = np.asarray(np.uint32(1234))
        start, _end = driver.host_memory_extents(arr)
        self.assertTrue(arr.ctypes.data == start)
        self.assertTrue(arr.dtype.itemsize == driver.host_memory_size(arr))

    def test_ctypes_struct(self):
        class mystruct(ctypes.Structure):
            _fields_ = [('x', ctypes.c_int), ('y', ctypes.c_int)]

        data = mystruct(x=123, y=432)
        self.assertTrue(ctypes.sizeof(data) == driver.host_memory_size(data))

    def test_ctypes_double(self):
        data = ctypes.c_double(1.234)
        self.assertTrue(ctypes.sizeof(data) == driver.host_memory_size(data))
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,547 @@
import itertools
import numpy as np
from numba.cuda.cudadrv import devicearray
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim
class TestCudaNDArray(CUDATestCase):
    """Tests for DeviceNDArray construction, copies, views, transposition,
    contiguity flags and typing."""

    def test_device_array_interface(self):
        # Freshly allocated, host-copied, and 0-d device arrays must all
        # pass the CUDA ndarray interface verification.
        dary = cuda.device_array(shape=100)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.empty(100)
        dary = cuda.to_device(ary)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.asarray(1.234)
        dary = cuda.to_device(ary)
        self.assertEqual(dary.ndim, 0)
        devicearray.verify_cuda_ndarray_interface(dary)

    def test_device_array_from_readonly(self):
        ary = np.arange(100, dtype=np.float32)
        # Make the array readonly
        ary.flags.writeable = False
        self.assertFalse(ary.flags.writeable)
        # Ensure that we can copy the readonly array
        dary = cuda.to_device(ary)
        retr = dary.copy_to_host()
        np.testing.assert_array_equal(retr, ary)

    def test_devicearray_dtype(self):
        dary = cuda.device_array(shape=(100,), dtype="f4")
        self.assertEqual(dary.dtype, np.dtype("f4"))

    def test_devicearray_no_copy(self):
        # copy=False must be accepted without error.
        array = np.arange(100, dtype=np.float32)
        cuda.to_device(array, copy=False)

    def test_devicearray_shape(self):
        ary = np.arange(2 * 3 * 4).reshape(2, 3, 4)
        dary = cuda.to_device(ary)
        self.assertEqual(ary.shape, dary.shape)
        self.assertEqual(ary.shape[1:], dary.shape[1:])

    def test_devicearray(self):
        # Round-trip: data copied to the device survives zeroing the host.
        array = np.arange(100, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        array[:] = 0
        gpumem.copy_to_host(array)
        np.testing.assert_array_equal(array, original)

    def test_stream_bind(self):
        stream = cuda.stream()
        with stream.auto_synchronize():
            arr = cuda.device_array(
                (3, 3),
                dtype=np.float64,
                stream=stream)
            self.assertEqual(arr.bind(stream).stream, stream)
            self.assertEqual(arr.stream, stream)

    def test_len_1d(self):
        ary = np.empty((3,))
        dary = cuda.device_array(3)
        self.assertEqual(len(ary), len(dary))

    def test_len_2d(self):
        ary = np.empty((3, 5))
        dary = cuda.device_array((3, 5))
        self.assertEqual(len(ary), len(dary))

    def test_len_3d(self):
        ary = np.empty((3, 5, 7))
        dary = cuda.device_array((3, 5, 7))
        self.assertEqual(len(ary), len(dary))

    def test_devicearray_partition(self):
        # split() halves can be copied back independently and reassemble
        # the original data.
        N = 100
        array = np.arange(N, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        left, right = gpumem.split(N // 2)

        array[:] = 0

        self.assertTrue(np.all(array == 0))

        right.copy_to_host(array[N // 2:])
        left.copy_to_host(array[:N // 2])

        self.assertTrue(np.all(array == original))

    def test_devicearray_replace(self):
        # ``to=`` reuses an existing device allocation as the destination.
        N = 100
        array = np.arange(N, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        cuda.to_device(array * 2, to=gpumem)
        gpumem.copy_to_host(array)
        np.testing.assert_array_equal(array, original * 2)

    @skip_on_cudasim('This works in the simulator')
    def test_devicearray_transpose_wrongdim(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4, 1))

        with self.assertRaises(NotImplementedError) as e:
            np.transpose(gpumem)

        self.assertEqual(
            "transposing a non-2D DeviceNDArray isn't supported",
            str(e.exception))

    def test_devicearray_transpose_identity(self):
        # any-shape identities should work
        original = np.array(np.arange(24)).reshape(3, 4, 2)
        array = np.transpose(cuda.to_device(original),
                             axes=(0, 1, 2)).copy_to_host()
        self.assertTrue(np.all(array == original))

    def test_devicearray_transpose_duplicatedaxis(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4))

        with self.assertRaises(ValueError) as e:
            np.transpose(gpumem, axes=(0, 0))

        self.assertIn(
            str(e.exception),
            container=[
                'invalid axes list (0, 0)',  # GPU
                'repeated axis in transpose',  # sim
            ])

    def test_devicearray_transpose_wrongaxis(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4))

        with self.assertRaises(ValueError) as e:
            np.transpose(gpumem, axes=(0, 2))

        self.assertIn(
            str(e.exception),
            container=[
                'invalid axes list (0, 2)',  # GPU
                'invalid axis for this array',
                'axis 2 is out of bounds for array of dimension 2',  # sim
            ])

    def test_devicearray_view_ok(self):
        # Views to same-or-larger itemsizes on a contiguous array.
        original = np.array(np.arange(12), dtype="i2").reshape(3, 4)
        array = cuda.to_device(original)
        for dtype in ("i4", "u4", "i8", "f8"):
            with self.subTest(dtype=dtype):
                np.testing.assert_array_equal(
                    array.view(dtype).copy_to_host(),
                    original.view(dtype)
                )

    def test_devicearray_view_ok_not_c_contig(self):
        # A same-itemsize view is valid even on a non-contiguous array.
        original = np.array(np.arange(32), dtype="i2").reshape(4, 8)
        array = cuda.to_device(original)[:, ::2]
        original = original[:, ::2]
        np.testing.assert_array_equal(
            array.view("u2").copy_to_host(),
            original.view("u2")
        )

    def test_devicearray_view_bad_not_c_contig(self):
        original = np.array(np.arange(32), dtype="i2").reshape(4, 8)
        array = cuda.to_device(original)[:, ::2]
        with self.assertRaises(ValueError) as e:
            array.view("i4")

        msg = str(e.exception)
        self.assertIn('To change to a dtype of a different size,', msg)

        # The message changed across NumPy versions; accept either wording.
        contiguous_pre_np123 = 'the array must be C-contiguous' in msg
        contiguous_post_np123 = 'the last axis must be contiguous' in msg
        self.assertTrue(contiguous_pre_np123 or contiguous_post_np123,
                        'Expected message to mention contiguity')

    def test_devicearray_view_bad_itemsize(self):
        original = np.array(np.arange(12), dtype="i2").reshape(4, 3)
        array = cuda.to_device(original)
        with self.assertRaises(ValueError) as e:
            array.view("i4")
        self.assertEqual(
            "When changing to a larger dtype,"
            " its size must be a divisor of the total size in bytes"
            " of the last axis of the array.",
            str(e.exception))

    def test_devicearray_transpose_ok(self):
        original = np.array(np.arange(12)).reshape(3, 4)
        array = np.transpose(cuda.to_device(original)).copy_to_host()
        self.assertTrue(np.all(array == original.T))

    def test_devicearray_transpose_T(self):
        original = np.array(np.arange(12)).reshape(3, 4)
        array = cuda.to_device(original).T.copy_to_host()
        self.assertTrue(np.all(array == original.T))

    def test_devicearray_contiguous_slice(self):
        # memcpys are dumb ranges of bytes, so trying to
        # copy to a non-contiguous range shouldn't work!
        a = np.arange(25).reshape(5, 5, order='F')
        s = np.full(fill_value=5, shape=(5,))
        d = cuda.to_device(a)
        a[2] = s

        # d is in F-order (not C-order), so d[2] is not contiguous
        # (40-byte strides). This means we can't memcpy to it!
        with self.assertRaises(ValueError) as e:
            d[2].copy_to_device(s)
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))

        # if d[2].copy_to_device(s), then this would pass:
        # self.assertTrue((a == d.copy_to_host()).all())

    def _test_devicearray_contiguous_host_copy(self, a_c, a_f):
        """
        Checks host->device memcpys
        """
        self.assertTrue(a_c.flags.c_contiguous)
        self.assertTrue(a_f.flags.f_contiguous)

        for original, copy in [
            (a_f, a_f),
            (a_f, a_c),
            (a_c, a_f),
            (a_c, a_c),
        ]:
            msg = '%s => %s' % (
                'C' if original.flags.c_contiguous else 'F',
                'C' if copy.flags.c_contiguous else 'F',
            )

            d = cuda.to_device(original)
            d.copy_to_device(copy)
            self.assertTrue(np.all(d.copy_to_host() == a_c), msg=msg)
            self.assertTrue(np.all(d.copy_to_host() == a_f), msg=msg)

    def test_devicearray_contiguous_copy_host_3d(self):
        a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5)
        a_f = np.array(a_c, order='F')
        self._test_devicearray_contiguous_host_copy(a_c, a_f)

    def test_devicearray_contiguous_copy_host_1d(self):
        a_c = np.arange(5)
        a_f = np.array(a_c, order='F')
        self._test_devicearray_contiguous_host_copy(a_c, a_f)

    def test_devicearray_contiguous_copy_device(self):
        # Device-to-device copies require matching strides.
        a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5)
        a_f = np.array(a_c, order='F')
        self.assertTrue(a_c.flags.c_contiguous)
        self.assertTrue(a_f.flags.f_contiguous)

        d = cuda.to_device(a_c)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(a_f))
        self.assertEqual(
            "incompatible strides: {} vs. {}".format(a_c.strides, a_f.strides),
            str(e.exception))

        d.copy_to_device(cuda.to_device(a_c))
        self.assertTrue(np.all(d.copy_to_host() == a_c))

        d = cuda.to_device(a_f)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(a_c))
        self.assertEqual(
            "incompatible strides: {} vs. {}".format(a_f.strides, a_c.strides),
            str(e.exception))

        d.copy_to_device(cuda.to_device(a_f))
        self.assertTrue(np.all(d.copy_to_host() == a_f))

    def test_devicearray_broadcast_host_copy(self):
        # Copies to/from arrays broadcast along each axis in turn.
        broadsize = 4
        coreshape = (2, 3)
        coresize = np.prod(coreshape)
        core_c = np.arange(coresize).reshape(coreshape, order='C')
        core_f = np.arange(coresize).reshape(coreshape, order='F')
        for dim in range(len(coreshape)):
            newindex = (slice(None),) * dim + (np.newaxis,)
            broadshape = coreshape[:dim] + (broadsize,) + coreshape[dim:]
            broad_c = np.broadcast_to(core_c[newindex], broadshape)
            broad_f = np.broadcast_to(core_f[newindex], broadshape)
            dbroad_c = cuda.to_device(broad_c)
            dbroad_f = cuda.to_device(broad_f)
            np.testing.assert_array_equal(dbroad_c.copy_to_host(), broad_c)
            np.testing.assert_array_equal(dbroad_f.copy_to_host(), broad_f)
            # Also test copying across different core orderings
            dbroad_c.copy_to_device(broad_f)
            dbroad_f.copy_to_device(broad_c)
            np.testing.assert_array_equal(dbroad_c.copy_to_host(), broad_f)
            np.testing.assert_array_equal(dbroad_f.copy_to_host(), broad_c)

    def test_devicearray_contiguous_host_strided(self):
        # A strided host source is acceptable for a host->device copy.
        a_c = np.arange(10)
        d = cuda.to_device(a_c)
        arr = np.arange(20)[::2]
        d.copy_to_device(arr)
        np.testing.assert_array_equal(d.copy_to_host(), arr)

    def test_devicearray_contiguous_device_strided(self):
        # A strided device source is rejected for a device->device copy.
        d = cuda.to_device(np.arange(20))
        arr = np.arange(20)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(arr)[::2])
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))

    @skip_on_cudasim('DeviceNDArray class not present in simulator')
    def test_devicearray_relaxed_strides(self):
        # From the reproducer in Issue #6824.

        # Construct a device array that is contiguous even though
        # the strides for the first axis (800) are not equal to
        # the strides * size (10 * 8 = 80) for the previous axis,
        # because the first axis size is 1.
        arr = devicearray.DeviceNDArray((1, 10), (800, 8), np.float64)

        # Ensure we still believe the array to be contiguous because
        # strides checking is relaxed.
        self.assertTrue(arr.flags['C_CONTIGUOUS'])
        self.assertTrue(arr.flags['F_CONTIGUOUS'])

    def test_c_f_contiguity_matches_numpy(self):
        # From the reproducer in Issue #4943.
        shapes = ((1, 4), (4, 1))
        orders = ('C', 'F')

        for shape, order in itertools.product(shapes, orders):
            arr = np.ndarray(shape, order=order)
            d_arr = cuda.to_device(arr)
            self.assertEqual(arr.flags['C_CONTIGUOUS'],
                             d_arr.flags['C_CONTIGUOUS'])
            self.assertEqual(arr.flags['F_CONTIGUOUS'],
                             d_arr.flags['F_CONTIGUOUS'])

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_simple_c(self):
        # C-order 1D array
        a = np.zeros(10, order='C')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_simple_f(self):
        # F-order array that is also C layout.
        a = np.zeros(10, order='F')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_2d_c(self):
        # C-order 2D array
        a = np.zeros((2, 10), order='C')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_2d_f(self):
        # F-order array that can only be F layout
        a = np.zeros((2, 10), order='F')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'F')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_noncontig_slice_c(self):
        # Non-contiguous slice of C-order array
        a = np.zeros((5, 5), order='C')
        d = cuda.to_device(a)[:,2]
        self.assertEqual(d._numba_type_.layout, 'A')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_noncontig_slice_f(self):
        # Non-contiguous slice of F-order array
        a = np.zeros((5, 5), order='F')
        d = cuda.to_device(a)[2,:]
        self.assertEqual(d._numba_type_.layout, 'A')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_contig_slice_c(self):
        # Contiguous slice of C-order array
        a = np.zeros((5, 5), order='C')
        d = cuda.to_device(a)[2,:]
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_contig_slice_f(self):
        # Contiguous slice of F-order array - is both C- and F-contiguous, so
        # types as 'C' layout
        a = np.zeros((5, 5), order='F')
        d = cuda.to_device(a)[:,2]
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_broadcasted(self):
        # Broadcasted array, similar to that used for passing scalars to ufuncs
        a = np.broadcast_to(np.array([1]), (10,))
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'A')

    def test_bug6697(self):
        # np.asarray on a device array must preserve the dtype.
        ary = np.arange(10, dtype=np.int16)
        dary = cuda.to_device(ary)
        got = np.asarray(dary)
        self.assertEqual(got.dtype, dary.dtype)

    @skip_on_cudasim('DeviceNDArray class not present in simulator')
    def test_issue_8477(self):
        # Ensure that we can copy a zero-length device array to a zero-length
        # host array when the strides of the device and host arrays differ -
        # this should be possible because the strides are irrelevant when the
        # length is zero. For more info see
        # https://github.com/numba/numba/issues/8477.

        # Create a device array with shape (0,) and strides (8,)
        dev_array = devicearray.DeviceNDArray(shape=(0,), strides=(8,),
                                              dtype=np.int8)

        # Create a host array with shape (0,) and strides (0,)
        host_array = np.ndarray(shape=(0,), strides=(0,), dtype=np.int8)

        # Sanity check for this test - ensure our destination has the strides
        # we expect, because strides can be ignored in some cases by the
        # ndarray constructor - checking here ensures that we haven't failed to
        # account for unexpected behaviour across different versions of NumPy
        self.assertEqual(host_array.strides, (0,))

        # Ensure that the copy succeeds in both directions
        dev_array.copy_to_host(host_array)
        dev_array.copy_to_device(host_array)

        # Ensure that a device-to-device copy also succeeds when the strides
        # differ - one way of doing this is to copy the host array across and
        # use that for copies in both directions.
        dev_array_from_host = cuda.to_device(host_array)
        self.assertEqual(dev_array_from_host.shape, (0,))
        self.assertEqual(dev_array_from_host.strides, (0,))

        dev_array.copy_to_device(dev_array_from_host)
        dev_array_from_host.copy_to_device(dev_array)
class TestRecarray(CUDATestCase):
    def test_recarray(self):
        """Record arrays can be passed to kernels (regression for #4111)."""
        a = np.recarray((16,), dtype=[
            ("value1", np.int64),
            ("value2", np.float64),
        ])
        a.value1 = np.arange(a.size, dtype=np.int64)
        a.value2 = np.arange(a.size, dtype=np.float64) / 100

        def copy_fields(x, out1, out2):
            # One thread per record; copy each field into its own output.
            i = cuda.grid(1)
            if i < x.size:
                out1[i] = x.value1[i]
                out2[i] = x.value2[i]

        got1 = np.zeros_like(a.value1)
        got2 = np.zeros_like(a.value2)
        cuda.jit(copy_fields)[1, a.size](a, got1, got2)

        np.testing.assert_array_equal(a.value1, got1)
        np.testing.assert_array_equal(a.value2, got2)
class TestCoreContiguous(CUDATestCase):
    def _test_against_array_core(self, view):
        # ``is_contiguous`` must agree with the C-contiguity flag of the
        # view's core as computed by ``array_core``.
        expected = devicearray.array_core(view).flags['C_CONTIGUOUS']
        self.assertEqual(devicearray.is_contiguous(view), expected)

    def test_device_array_like_1d(self):
        self._test_against_array_core(cuda.device_array(10, order='C'))

    def test_device_array_like_2d(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='C'))

    def test_device_array_like_2d_transpose(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='C').T)

    def test_device_array_like_3d(self):
        self._test_against_array_core(
            cuda.device_array((10, 12, 14), order='C'))

    def test_device_array_like_1d_f(self):
        self._test_against_array_core(cuda.device_array(10, order='F'))

    def test_device_array_like_2d_f(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='F'))

    def test_device_array_like_2d_f_transpose(self):
        self._test_against_array_core(cuda.device_array((10, 12), order='F').T)

    def test_device_array_like_3d_f(self):
        self._test_against_array_core(
            cuda.device_array((10, 12, 14), order='F'))

    def test_1d_view(self):
        self._test_against_array_core(np.zeros(10)[::2])

    def test_1d_view_f(self):
        self._test_against_array_core(np.zeros(10, order='F')[::2])

    def test_2d_view(self):
        self._test_against_array_core(np.zeros((10, 12))[::2, ::2])

    def test_2d_view_f(self):
        self._test_against_array_core(np.zeros((10, 12), order='F')[::2, ::2])
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,249 @@
from contextlib import contextmanager
import numpy as np
from numba import cuda
from numba.cuda.testing import (unittest, skip_on_cudasim,
skip_if_external_memmgr, CUDATestCase)
from numba.tests.support import captured_stderr
from numba.core import config
@skip_on_cudasim('not supported on CUDASIM')
@skip_if_external_memmgr('Deallocation specific to Numba memory management')
class TestDeallocation(CUDATestCase):
    """Checks the pending-deallocation list flushes when it reaches its
    count and byte thresholds."""

    def test_max_pending_count(self):
        # get deallocation manager and flush it
        deallocs = cuda.current_context().memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        # deallocate to maximum count
        for i in range(config.CUDA_DEALLOCS_COUNT):
            cuda.to_device(np.arange(1))
            self.assertEqual(len(deallocs), i + 1)
        # one more to trigger .clear()
        cuda.to_device(np.arange(1))
        self.assertEqual(len(deallocs), 0)

    def test_max_pending_bytes(self):
        # get deallocation manager and flush it
        ctx = cuda.current_context()
        deallocs = ctx.memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        mi = ctx.get_memory_info()

        max_pending = 10**6  # 1MB
        old_ratio = config.CUDA_DEALLOCS_RATIO
        try:
            # change to a smaller ratio
            config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
            # due to round off error (floor is used in calculating
            # _max_pending_bytes) it can be off by 1.
            self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending,
                                   delta=1)

            # allocate half the max size
            # this will not trigger deallocation
            cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
            self.assertEqual(len(deallocs), 1)

            # allocate another remaining
            # this will not trigger deallocation
            cuda.to_device(np.ones(deallocs._max_pending_bytes -
                                   deallocs._size, dtype=np.int8))
            self.assertEqual(len(deallocs), 2)

            # another byte to trigger .clear()
            cuda.to_device(np.ones(1, dtype=np.int8))
            self.assertEqual(len(deallocs), 0)
        finally:
            # restore old ratio
            config.CUDA_DEALLOCS_RATIO = old_ratio
@skip_on_cudasim("defer_cleanup has no effect in CUDASIM")
@skip_if_external_memmgr('Deallocation specific to Numba memory management')
class TestDeferCleanup(CUDATestCase):
def test_basic(self):
harr = np.arange(5)
darr1 = cuda.to_device(harr)
deallocs = cuda.current_context().memory_manager.deallocations
deallocs.clear()
self.assertEqual(len(deallocs), 0)
with cuda.defer_cleanup():
darr2 = cuda.to_device(harr)
del darr1
self.assertEqual(len(deallocs), 1)
del darr2
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 0)
def test_nested(self):
harr = np.arange(5)
darr1 = cuda.to_device(harr)
deallocs = cuda.current_context().memory_manager.deallocations
deallocs.clear()
self.assertEqual(len(deallocs), 0)
with cuda.defer_cleanup():
with cuda.defer_cleanup():
darr2 = cuda.to_device(harr)
del darr1
self.assertEqual(len(deallocs), 1)
del darr2
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 2)
deallocs.clear()
self.assertEqual(len(deallocs), 0)
def test_exception(self):
harr = np.arange(5)
darr1 = cuda.to_device(harr)
deallocs = cuda.current_context().memory_manager.deallocations
deallocs.clear()
self.assertEqual(len(deallocs), 0)
class CustomError(Exception):
pass
with self.assertRaises(CustomError):
with cuda.defer_cleanup():
darr2 = cuda.to_device(harr)
del darr2
self.assertEqual(len(deallocs), 1)
deallocs.clear()
self.assertEqual(len(deallocs), 1)
raise CustomError
deallocs.clear()
self.assertEqual(len(deallocs), 0)
del darr1
self.assertEqual(len(deallocs), 1)
deallocs.clear()
self.assertEqual(len(deallocs), 0)
class TestDeferCleanupAvail(CUDATestCase):
    def test_context_manager(self):
        # Smoke test: the API must exist and be usable as a context manager.
        ctx_mgr = cuda.defer_cleanup()
        with ctx_mgr:
            pass
@skip_on_cudasim('not supported on CUDASIM')
class TestDel(CUDATestCase):
    """
    Ensure resources are deleted properly without ignored exception.
    """
    @contextmanager
    def check_ignored_exception(self, ctx):
        # Exceptions raised during finalization are printed to stderr rather
        # than propagated; fail the test if anything was written there.
        with captured_stderr() as cap:
            yield
            ctx.deallocations.clear()
        self.assertFalse(cap.getvalue())

    def test_stream(self):
        ctx = cuda.current_context()
        stream = ctx.create_stream()
        with self.check_ignored_exception(ctx):
            del stream

    def test_event(self):
        ctx = cuda.current_context()
        event = ctx.create_event()
        with self.check_ignored_exception(ctx):
            del event

    def test_pinned_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memhostalloc(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_mapped_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memhostalloc(32, mapped=True)
        with self.check_ignored_exception(ctx):
            del mem

    def test_device_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memalloc(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_managed_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_pinned_contextmanager(self):
        # Check that temporarily pinned memory is unregistered immediately,
        # such that it can be re-pinned at any time
        class PinnedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.pinned(arr):
                pass
            with cuda.pinned(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.pinned(arr):
                    pass
                with cuda.pinned(arr):
                    pass
            # Should also work when breaking out of the block due to an
            # exception
            try:
                with cuda.pinned(arr):
                    raise PinnedException
            except PinnedException:
                with cuda.pinned(arr):
                    pass

    def test_mapped_contextmanager(self):
        # Check that temporarily mapped memory is unregistered immediately,
        # such that it can be re-mapped at any time
        class MappedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.mapped(arr):
                pass
            with cuda.mapped(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.mapped(arr):
                    pass
                with cuda.mapped(arr):
                    pass
            # Should also work when breaking out of the block due to an
            # exception
            try:
                with cuda.mapped(arr):
                    raise MappedException
            except MappedException:
                with cuda.mapped(arr):
                    pass
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,81 @@
import os
import sys
import subprocess
import threading
from numba import cuda
from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
skip_under_cuda_memcheck)
from numba.tests.support import captured_stdout
class TestCudaDetect(CUDATestCase):
    def test_cuda_detect(self):
        """Smoke-test cuda.detect() and check its report mentions devices."""
        with captured_stdout() as out:
            cuda.detect()
        report = out.getvalue()
        self.assertIn('Found', report)
        self.assertIn('CUDA devices', report)
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
class TestCUDAFindLibs(CUDATestCase):
    """Tests for CUDA library path discovery driven by environment variables."""

    def run_cmd(self, cmdline, env):
        """Run ``cmdline`` with environment ``env`` and return decoded
        (stdout, stderr); the process is killed if it runs over 5 minutes."""
        popen = subprocess.Popen(cmdline,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 env=env)
        # finish in 5 minutes or kill it
        timeout = threading.Timer(5 * 60., popen.kill)
        try:
            timeout.start()
            out, err = popen.communicate()
            # the process should exit with an error
            return out.decode(), err.decode()
        finally:
            timeout.cancel()
        # NOTE: an unreachable ``return None, None`` that followed the
        # try/finally in the original has been removed - the try block
        # always returns or raises.

    def run_test_in_separate_process(self, envvar, envvar_value):
        """Launch a trivial CUDA kernel in a subprocess with ``envvar`` set
        to ``envvar_value``; returns the subprocess (stdout, stderr)."""
        env_copy = os.environ.copy()
        env_copy[envvar] = str(envvar_value)
        code = """if 1:
            from numba import cuda

            @cuda.jit('(int64,)')
            def kernel(x):
                pass

            kernel(1,)
            """
        cmdline = [sys.executable, "-c", code]
        return self.run_cmd(cmdline, env_copy)

    @skip_on_cudasim('Simulator does not hit device library search code path')
    @unittest.skipIf(not sys.platform.startswith('linux'), "linux only")
    def test_cuda_find_lib_errors(self):
        """
        This tests that the find_libs works as expected in the case of an
        environment variable being used to set the path.
        """
        # one of these is likely to exist on linux, it's also unlikely that
        # someone has extracted the contents of libdevice into here!
        locs = ['lib', 'lib64']

        looking_for = None
        for loc in locs:
            candidate = os.path.join(os.path.sep, loc)
            if os.path.exists(candidate):
                # Fix: only remember a candidate that actually exists. The
                # original assigned before the existence check, so the
                # "is not None" guard below could never skip the test even
                # when no candidate path was present.
                looking_for = candidate
                break

        # This is the testing part, the test will only run if there's a valid
        # path in which to look
        if looking_for is not None:
            out, err = self.run_test_in_separate_process("NUMBA_CUDA_DRIVER",
                                                         looking_for)
            self.assertTrue(out is not None)
            self.assertTrue(err is not None)
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,192 @@
import ctypes
import numpy as np
import weakref
from numba import cuda
from numba.core import config
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
from numba.tests.support import linux_only
# The dummy EMM plugin classes are only defined when not running on the
# simulator, since they subclass real driver memory-manager machinery.
if not config.ENABLE_CUDASIM:
    class DeviceOnlyEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        """
        Dummy EMM Plugin implementation for testing. It memorises which plugin
        API methods have been called so that the tests can check that Numba
        called into the plugin as expected.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

            # For tracking our dummy allocations
            self.allocations = {}
            self.count = 0

            # For tracking which methods have been called
            self.initialized = False
            self.memalloc_called = False
            self.reset_called = False
            self.get_memory_info_called = False
            self.get_ipc_handle_called = False

        def memalloc(self, size):
            # We maintain a list of allocations and keep track of them, so that
            # we can test that the finalizers of objects returned by memalloc
            # get called.

            # Numba should have initialized the memory manager when preparing
            # the context for use, prior to any memalloc call.
            if not self.initialized:
                raise RuntimeError("memalloc called before initialize")
            self.memalloc_called = True

            # Create an allocation and record it
            self.count += 1
            alloc_count = self.count
            self.allocations[alloc_count] = size

            # The finalizer deletes the record from our internal dict of
            # allocations.
            finalizer_allocs = self.allocations

            def finalizer():
                del finalizer_allocs[alloc_count]

            # We use an AutoFreePointer so that the finalizer will be run when
            # the reference count drops to zero.
            ctx = weakref.proxy(self.context)
            # The "pointer" is just the allocation count - it is never
            # dereferenced as a real device address by these tests.
            ptr = ctypes.c_void_p(alloc_count)
            return cuda.cudadrv.driver.AutoFreePointer(ctx, ptr, size,
                                                       finalizer=finalizer)

        def initialize(self):
            # No special initialization needed.
            self.initialized = True

        def reset(self):
            # We remove all allocations on reset, just as a real EMM Plugin
            # would do. Note that our finalizers in memalloc don't check
            # whether the allocations are still alive, so running them after
            # reset will detect any allocations that are floating around at
            # exit time; however, the atexit finalizer for weakref will only
            # print a traceback, not terminate the interpreter abnormally.
            self.reset_called = True

        def get_memory_info(self):
            # Return some dummy memory information
            self.get_memory_info_called = True
            return cuda.MemoryInfo(free=32, total=64)

        def get_ipc_handle(self, memory):
            # The dummy IPC handle is only a string, so it is important that
            # the tests don't try to do too much with it (e.g. open / close
            # it).
            self.get_ipc_handle_called = True
            return "Dummy IPC handle for alloc %s" % memory.device_pointer.value

        @property
        def interface_version(self):
            # The expected version for an EMM Plugin.
            return 1

    class BadVersionEMMPlugin(DeviceOnlyEMMPlugin):
        """A plugin that claims to implement a different interface version"""

        @property
        def interface_version(self):
            return 2
@skip_on_cudasim('EMM Plugins not supported on CUDA simulator')
class TestDeviceOnlyEMMPlugin(CUDATestCase):
    """
    Tests that the API of an EMM Plugin that implements device allocations
    only is used correctly by Numba.
    """

    def setUp(self):
        super().setUp()
        # Always start afresh with a new context and memory manager
        cuda.close()
        cuda.set_memory_manager(DeviceOnlyEMMPlugin)

    def tearDown(self):
        super().tearDown()
        # Unset the memory manager for subsequent tests
        cuda.close()
        cuda.cudadrv.driver._memory_manager = None

    def test_memalloc(self):
        mgr = cuda.current_context().memory_manager

        # Allocate an array and check that memalloc was called with the
        # correct size.
        arr_1 = np.arange(10)
        d_arr_1 = cuda.device_array_like(arr_1)
        self.assertTrue(mgr.memalloc_called)
        self.assertEqual(mgr.count, 1)
        self.assertEqual(mgr.allocations[1], arr_1.nbytes)

        # Allocate again, with a different size, and check that it is also
        # correct.
        arr_2 = np.arange(5)
        d_arr_2 = cuda.device_array_like(arr_2)
        self.assertEqual(mgr.count, 2)
        self.assertEqual(mgr.allocations[2], arr_2.nbytes)

        # Remove the first array, and check that our finalizer was called for
        # the first array only.
        del d_arr_1
        self.assertNotIn(1, mgr.allocations)
        self.assertIn(2, mgr.allocations)

        # Remove the second array and check that its finalizer was also
        # called.
        del d_arr_2
        self.assertNotIn(2, mgr.allocations)

    def test_initialized_in_context(self):
        # If we have a CUDA context, it should already have initialized its
        # memory manager.
        self.assertTrue(cuda.current_context().memory_manager.initialized)

    def test_reset(self):
        ctx = cuda.current_context()
        ctx.reset()
        self.assertTrue(ctx.memory_manager.reset_called)

    def test_get_memory_info(self):
        ctx = cuda.current_context()
        meminfo = ctx.get_memory_info()
        self.assertTrue(ctx.memory_manager.get_memory_info_called)
        # These are the dummy values hard-coded in the plugin above.
        self.assertEqual(meminfo.free, 32)
        self.assertEqual(meminfo.total, 64)

    @linux_only
    def test_get_ipc_handle(self):
        # We don't attempt to close the IPC handle in this test because Numba
        # will be expecting a real IpcHandle object to have been returned from
        # get_ipc_handle, and it would cause problems to do so.
        arr = np.arange(2)
        d_arr = cuda.device_array_like(arr)
        ipch = d_arr.get_ipc_handle()
        ctx = cuda.current_context()
        self.assertTrue(ctx.memory_manager.get_ipc_handle_called)
        self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle)
@skip_on_cudasim('EMM Plugins not supported on CUDA simulator')
class TestBadEMMPluginVersion(CUDATestCase):
    """
    Check that Numba refuses an EMM Plugin whose interface_version is not 1.
    """

    def test_bad_plugin_version(self):
        with self.assertRaisesRegex(RuntimeError, 'version 1 required'):
            cuda.set_memory_manager(BadVersionEMMPlugin)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,38 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase
class TestCudaEvent(CUDATestCase):
    """Exercise CUDA event recording and the elapsed-time code path."""

    def _transfer_and_time(self, stream=None):
        # Record events around a host->device transfer, then query the
        # elapsed time. The timing value itself is not checked; the test
        # only exercises the API.
        n = 32
        dary = cuda.device_array(n, dtype=np.double)
        start = cuda.event()
        end = cuda.event()
        if stream is None:
            start.record()
            cuda.to_device(np.arange(n, dtype=np.double), to=dary)
            end.record()
            end.wait()
        else:
            start.record(stream=stream)
            cuda.to_device(np.arange(n, dtype=np.double), to=dary,
                           stream=stream)
            end.record(stream=stream)
            end.wait(stream=stream)
        end.synchronize()
        start.elapsed_time(end)

    def test_event_elapsed(self):
        self._transfer_and_time()

    def test_event_elapsed_stream(self):
        self._transfer_and_time(stream=cuda.stream())
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,65 @@
import numpy as np
from numba.cuda.cudadrv import driver
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
class TestHostAlloc(ContextResettingTestCase):
    """Tests of host allocations: driver-level mapped memory, pinned arrays,
    and mapped arrays."""

    def test_host_alloc_driver(self):
        # Allocate mapped host memory through the context and view it as a
        # uint8 ndarray, so host and device writes to it are both visible.
        n = 32
        mem = cuda.current_context().memhostalloc(n, mapped=True)

        dtype = np.dtype(np.uint8)
        ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype,
                         buffer=mem)

        magic = 0xab
        # A device-side memset should be observable from the host view.
        driver.device_memset(mem, magic, n)

        self.assertTrue(np.all(ary == magic))

        ary.fill(n)

        recv = np.empty_like(ary)

        driver.device_to_host(recv, mem, ary.size)

        self.assertTrue(np.all(ary == recv))
        self.assertTrue(np.all(recv == n))

    def test_host_alloc_pinned(self):
        ary = cuda.pinned_array(10, dtype=np.uint32)
        ary.fill(123)
        self.assertTrue(all(ary == 123))
        devary = cuda.to_device(ary)
        # Zero only the device copy; the pinned host array is unchanged
        # until it is explicitly copied back.
        driver.device_memset(devary, 0, driver.device_memory_size(devary))
        self.assertTrue(all(ary == 123))
        devary.copy_to_host(ary)
        self.assertTrue(all(ary == 0))

    def test_host_alloc_mapped(self):
        ary = cuda.mapped_array(10, dtype=np.uint32)
        ary.fill(123)
        self.assertTrue(all(ary == 123))
        # Mapped memory: a device-side memset is directly observable from
        # the host view.
        driver.device_memset(ary, 0, driver.device_memory_size(ary))
        self.assertTrue(all(ary == 0))
        self.assertTrue(sum(ary != 0) == 0)

    def test_host_operators(self):
        # Check that NumPy arithmetic works on host-accessible arrays of
        # both kinds.
        for ary in [cuda.mapped_array(10, dtype=np.uint32),
                    cuda.pinned_array(10, dtype=np.uint32)]:
            ary[:] = range(10)
            self.assertTrue(sum(ary + 1) == 55)
            self.assertTrue(sum((ary + 1) * 2 - 1) == 100)
            self.assertTrue(sum(ary < 5) == 5)
            self.assertTrue(sum(ary <= 5) == 6)
            self.assertTrue(sum(ary > 6) == 3)
            self.assertTrue(sum(ary >= 6) == 4)
            self.assertTrue(sum(ary ** 2) == 285)
            self.assertTrue(sum(ary // 2) == 20)
            self.assertTrue(sum(ary / 2.0) == 22.5)
            self.assertTrue(sum(ary % 2) == 5)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,139 @@
import multiprocessing as mp
import os
from numba import cuda
from numba.cuda.cudadrv.driver import CudaAPIError, driver
from numba.cuda.cudadrv.error import CudaSupportError
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
# A mock of cuInit that always raises a CudaAPIError
def cuInit_raising(arg):
    """Stand-in for cuInit that unconditionally fails with an unknown error."""
    raise CudaAPIError(999, 'CUDA_ERROR_UNKNOWN')
# Test code to run in a child that patches driver.cuInit to a variant that
# always raises. We can't use mock.patch.object here because driver.cuInit is
# not assigned until we attempt to initialize - mock.patch.object cannot locate
# the non-existent original method, and so fails. Instead we patch
# driver.cuInit with our raising version prior to any attempt to initialize.
def cuInit_raising_test(result_queue):
    """Patch cuInit to fail, then report (success, exception message)."""
    driver.cuInit = cuInit_raising
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError as e:
        result_queue.put((True, e.msg))
    else:
        result_queue.put((False, None))
# Similar to cuInit_raising_test above, but for testing that the string
# returned by cuda_error() is as expected.
def initialization_error_test(result_queue):
    """Like cuInit_raising_test, but reports the cuda.cuda_error() string."""
    driver.cuInit = cuInit_raising
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError:
        result_queue.put((True, cuda.cuda_error()))
    else:
        result_queue.put((False, None))
# For testing the path where Driver.__init__() catches a CudaSupportError
def cuda_disabled_test(result_queue):
    """Report the CudaSupportError raised when CUDA is disabled by env var."""
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError as e:
        result_queue.put((True, e.msg))
    else:
        result_queue.put((False, None))
# Similar to cuda_disabled_test, but checks cuda.cuda_error() instead of the
# exception raised on initialization
def cuda_disabled_error_test(result_queue):
    """Like cuda_disabled_test, but reports the cuda.cuda_error() string."""
    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError:
        result_queue.put((True, cuda.cuda_error()))
    else:
        result_queue.put((False, None))
@skip_on_cudasim('CUDA Simulator does not initialize driver')
class TestInit(CUDATestCase):
    """Tests of driver initialization failure handling.

    Each failure scenario runs in a spawned child process so that the
    patched or disabled initialization cannot poison this process's CUDA
    state.
    """

    def _test_init_failure(self, target, expected):
        # Run the initialization failure test in a separate subprocess
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=target, args=(result_queue,))
        proc.start()
        proc.join(30)  # should complete within 30s
        success, msg = result_queue.get()

        # Ensure the child process raised an exception during initialization
        # before checking the message
        if not success:
            self.fail('CudaSupportError not raised')

        self.assertIn(expected, msg)

    def test_init_failure_raising(self):
        expected = 'Error at driver init: CUDA_ERROR_UNKNOWN (999)'
        self._test_init_failure(cuInit_raising_test, expected)

    def test_init_failure_error(self):
        expected = 'CUDA_ERROR_UNKNOWN (999)'
        self._test_init_failure(initialization_error_test, expected)

    def _test_cuda_disabled(self, target):
        # Uses _test_init_failure to launch the test in a separate subprocess
        # with CUDA disabled.
        cuda_disabled = os.environ.get('NUMBA_DISABLE_CUDA')
        os.environ['NUMBA_DISABLE_CUDA'] = "1"
        try:
            expected = 'CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1'
            self._test_init_failure(cuda_disabled_test, expected)
        finally:
            # Restore the original setting (or remove it if it was unset).
            if cuda_disabled is not None:
                os.environ['NUMBA_DISABLE_CUDA'] = cuda_disabled
            else:
                os.environ.pop('NUMBA_DISABLE_CUDA')

    def test_cuda_disabled_raising(self):
        self._test_cuda_disabled(cuda_disabled_test)

    def test_cuda_disabled_error(self):
        self._test_cuda_disabled(cuda_disabled_error_test)

    def test_init_success(self):
        # Here we assume that initialization is successful (because many bad
        # things will happen with the test suite if it is not) and check that
        # there is no error recorded.
        self.assertIsNone(cuda.cuda_error())
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,37 @@
from llvmlite import ir
from numba.cuda.cudadrv import nvvm
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('Inline PTX cannot be used in the simulator')
class TestCudaInlineAsm(ContextResettingTestCase):
    """Check that inline PTX assembly survives NVVM compilation to PTX."""

    def test_inline_rsqrt(self):
        # Build an LLVM module containing a kernel that computes an
        # approximate reciprocal square root via inline PTX.
        mod = ir.Module(__name__)
        mod.triple = 'nvptx64-nvidia-cuda'
        nvvm.add_ir_version(mod)
        fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
        fn = ir.Function(mod, fnty, 'cu_rsqrt')
        bldr = ir.IRBuilder(fn.append_basic_block('entry'))

        rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
        inlineasm = ir.InlineAsm(rsqrt_approx_fnty,
                                 'rsqrt.approx.f32 $0, $1;',
                                 '=f,f', side_effect=True)
        val = bldr.load(fn.args[0])
        res = bldr.call(inlineasm, [val])

        bldr.store(res, fn.args[0])
        bldr.ret_void()

        # generate ptx
        mod.data_layout = nvvm.NVVM().data_layout
        nvvm.set_cuda_kernel(fn)
        nvvmir = str(mod)
        ptx = nvvm.compile_ir(nvvmir)
        # The inline instruction should appear verbatim in the generated PTX.
        self.assertTrue('rsqrt.approx.f32' in str(ptx))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,12 @@
from numba import cuda
from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53
class TestIsFP16Supported(CUDATestCase):
    """Checks of the float16 support queries."""

    def test_is_fp16_supported(self):
        self.assertTrue(cuda.is_float16_supported())

    # Bug fix: skip_on_cudasim is a decorator factory taking a reason string.
    # Applying it bare (`@skip_on_cudasim`) passes the test function itself
    # as the reason, replacing the method with a skip decorator object
    # instead of a test, so the test never ran correctly. It must be called
    # with a reason.
    @skip_on_cudasim('Device-side support queries are not supported on '
                     'the simulator')
    @skip_unless_cc_53
    def test_device_supports_float16(self):
        self.assertTrue(cuda.get_current_device().supports_float16)

View File

@@ -0,0 +1,317 @@
import numpy as np
import warnings
from numba.cuda.testing import unittest
from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing)
from numba.cuda.testing import CUDATestCase, test_data_dir
from numba.cuda.cudadrv.driver import (CudaAPIError, Linker,
LinkerError)
from numba.cuda.cudadrv.error import NvrtcError
from numba.cuda import require_context
from numba.tests.support import ignore_internal_warnings
from numba import cuda, void, float64, int64, int32, typeof, float32
CONST1D = np.arange(10, dtype=np.float64)
def simple_const_mem(A):
    # Kernel body (compiled with cuda.jit by the tests below): copies the
    # module-level CONST1D through constant memory, adding 1.0 to each
    # element; one element per thread.
    C = cuda.const.array_like(CONST1D)
    i = cuda.grid(1)
    A[i] = C[i] + 1.0
def func_with_lots_of_registers(x, a, b, c, d, e, f):
    # NOTE: the repetitive locals and loop body here are deliberate - this
    # kernel exists to consume many registers so that the max_registers
    # tests can observe register usage being limited. Do not simplify it.
    a1 = 1.0
    a2 = 1.0
    a3 = 1.0
    a4 = 1.0
    a5 = 1.0
    b1 = 1.0
    b2 = 1.0
    b3 = 1.0
    b4 = 1.0
    b5 = 1.0
    c1 = 1.0
    c2 = 1.0
    c3 = 1.0
    c4 = 1.0
    c5 = 1.0
    d1 = 10
    d2 = 10
    d3 = 10
    d4 = 10
    d5 = 10
    for i in range(a):
        a1 += b
        a2 += c
        a3 += d
        a4 += e
        a5 += f
        b1 *= b
        b2 *= c
        b3 *= d
        b4 *= e
        b5 *= f
        c1 /= b
        c2 /= c
        c3 /= d
        c4 /= e
        c5 /= f
        d1 <<= b
        d2 <<= c
        d3 <<= d
        d4 <<= e
        d5 <<= f
    x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
    x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
    x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
    x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5
def simple_smem(ary, dty):
    # Kernel body: thread 0 fills a 100-element shared-memory array, then
    # every thread copies one element out after the barrier.
    sm = cuda.shared.array(100, dty)
    i = cuda.grid(1)
    if i == 0:
        for j in range(100):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
def coop_smem2d(ary):
    # Kernel body: each thread writes one element of a 10x20 shared-memory
    # array and reads it back after a barrier.
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]
def simple_maxthreads(ary):
    # Minimal kernel used to probe the maximum threads per block.
    i = cuda.grid(1)
    ary[i] = i
LMEM_SIZE = 1000
def simple_lmem(A, B, dty):
    # Kernel body: copy A to B through a per-thread local array of LMEM_SIZE
    # elements, forcing local memory usage for the local-memory size tests.
    C = cuda.local.array(LMEM_SIZE, dty)
    for i in range(C.shape[0]):
        C[i] = A[i]
    for i in range(C.shape[0]):
        B[i] = C[i]
@skip_on_cudasim('Linking unsupported in the simulator')
class TestLinker(CUDATestCase):
    """Tests of linking external PTX / CU files into CUDA kernels, and of
    resource-usage queries (registers, shared / local / constant memory)."""

    _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'}

    @require_context
    def test_linker_basic(self):
        '''Simply go through the constructor and destructor
        '''
        linker = Linker.new(cc=(5, 3))
        del linker

    def _test_linking(self, eager):
        global bar  # must be a global; otherwise it is recognized as a freevar
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.ptx')

        if eager:
            args = ['void(int32[:], int32[:])']
        else:
            args = []

        @cuda.jit(*args, link=[link])
        def foo(x, y):
            i = cuda.grid(1)
            x[i] += bar(y[i])

        A = np.array([123], dtype=np.int32)
        B = np.array([321], dtype=np.int32)

        foo[1, 1](A, B)

        # bar() in jitlink.ptx doubles its argument and foo adds the result.
        self.assertTrue(A[0] == 123 + 2 * 321)

    def test_linking_lazy_compile(self):
        self._test_linking(eager=False)

    def test_linking_eager_compile(self):
        self._test_linking(eager=True)

    def test_linking_cu(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.cu')

        @cuda.jit(link=[link])
        def kernel(r, x):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = bar(x[i])

        x = np.arange(10, dtype=np.int32)
        r = np.zeros_like(x)

        kernel[1, 32](r, x)

        # Matches the operation of bar() in jitlink.cu
        expected = x * 2

        np.testing.assert_array_equal(r, expected)

    def test_linking_cu_log_warning(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'warn.cu')

        with warnings.catch_warnings(record=True) as w:
            ignore_internal_warnings()

            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        self.assertEqual(len(w), 1, 'Expected warnings from NVRTC')
        # Check the warning refers to the log messages
        self.assertIn('NVRTC log messages', str(w[0].message))
        # Check the message pertaining to the unused variable is provided
        self.assertIn('declared but never referenced', str(w[0].message))

    def test_linking_cu_error(self):
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'error.cu')

        with self.assertRaises(NvrtcError) as e:
            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        msg = e.exception.args[0]
        # Check the error message refers to the NVRTC compile
        self.assertIn('NVRTC Compilation failure', msg)
        # Check the expected error in the CUDA source is reported
        self.assertIn('identifier "SYNTAX" is undefined', msg)
        # Check the filename is reported correctly
        self.assertIn('in the compilation of "error.cu"', msg)

    def test_linking_unknown_filetype_error(self):
        expected_err = "Don't know how to link file with extension .cuh"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['header.cuh'])
            def kernel():
                pass

    def test_linking_file_with_no_extension_error(self):
        expected_err = "Don't know how to link file with no extension"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['data'])
            def kernel():
                pass

    @skip_if_cuda_includes_missing
    def test_linking_cu_cuda_include(self):
        link = str(test_data_dir / 'cuda_include.cu')

        # An exception will be raised when linking this kernel due to the
        # compile failure if CUDA includes cannot be found by Nvrtc.
        @cuda.jit('void()', link=[link])
        def kernel():
            pass

    def test_try_to_link_nonexistent(self):
        with self.assertRaises(LinkerError) as e:
            @cuda.jit('void(int32[::1])', link=['nonexistent.a'])
            def f(x):
                x[0] = 0

        self.assertIn('nonexistent.a not found', e.exception.args)

    def test_set_registers_no_max(self):
        """Ensure that the jitted kernel used in the test_set_registers_* tests
        uses more than 57 registers - this ensures that test_set_registers_*
        are really checking that they reduced the number of registers used from
        something greater than the maximum."""
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertGreater(compiled.get_regs_per_thread(), 57)

    def test_set_registers_57(self):
        compiled = cuda.jit(max_registers=57)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 57)

    def test_set_registers_38(self):
        compiled = cuda.jit(max_registers=38)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_set_registers_eager(self):
        sig = void(float64[::1], int64, int64, int64, int64, int64, int64)
        compiled = cuda.jit(sig, max_registers=38)(func_with_lots_of_registers)
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_get_const_mem_size(self):
        sig = void(float64[::1])
        compiled = cuda.jit(sig)(simple_const_mem)
        const_mem_size = compiled.get_const_mem_size()
        self.assertGreaterEqual(const_mem_size, CONST1D.nbytes)

    def test_get_no_shared_memory(self):
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        shared_mem_size = compiled.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 0)

    def test_get_shared_mem_per_block(self):
        sig = void(int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_smem)
        shared_mem_size = compiled.get_shared_mem_per_block()
        # 100 int32 elements = 400 bytes of shared memory
        self.assertEqual(shared_mem_size, 400)

    def test_get_shared_mem_per_specialized(self):
        compiled = cuda.jit(simple_smem)
        compiled_specialized = compiled.specialize(
            np.zeros(100, dtype=np.int32), np.float64)
        shared_mem_size = compiled_specialized.get_shared_mem_per_block()
        # 100 float64 elements = 800 bytes of shared memory
        self.assertEqual(shared_mem_size, 800)

    def test_get_max_threads_per_block(self):
        compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
        max_threads = compiled.get_max_threads_per_block()
        self.assertGreater(max_threads, 0)

    def test_max_threads_exceeded(self):
        compiled = cuda.jit("void(int32[::1])")(simple_maxthreads)
        max_threads = compiled.get_max_threads_per_block()
        nelem = max_threads + 1
        ary = np.empty(nelem, dtype=np.int32)
        # NOTE(review): if the over-sized launch unexpectedly succeeds, no
        # assertion fires and the test silently passes.
        try:
            compiled[1, nelem](ary)
        except CudaAPIError as e:
            self.assertIn("cuLaunchKernel", e.msg)

    def test_get_local_mem_per_thread(self):
        sig = void(int32[::1], int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_lmem)
        local_mem_size = compiled.get_local_mem_per_thread()
        calc_size = np.dtype(np.int32).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)

    def test_get_local_mem_per_specialized(self):
        compiled = cuda.jit(simple_lmem)
        compiled_specialized = compiled.specialize(
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.float64)
        local_mem_size = compiled_specialized.get_local_mem_per_thread()
        calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,127 @@
import numpy as np
from ctypes import byref, c_size_t
from numba.cuda.cudadrv.driver import device_memset, driver, USE_NV_BINDING
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim, skip_on_arm
from numba.tests.support import linux_only
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
@linux_only
@skip_on_arm('Managed Alloc support is experimental/untested on ARM')
class TestManagedAlloc(ContextResettingTestCase):
    """Tests of CUDA managed (unified) memory allocations."""

    def get_total_gpu_memory(self):
        # We use a driver function to directly get the total GPU memory
        # because an EMM plugin may report something different (or not
        # implement get_memory_info at all).
        if USE_NV_BINDING:
            free, total = driver.cuMemGetInfo()
            return total
        else:
            free = c_size_t()
            total = c_size_t()
            driver.cuMemGetInfo(byref(free), byref(total))
            return total.value

    def skip_if_cc_major_lt(self, min_required, reason):
        """
        Skip the current test if the compute capability of the device is
        less than `min_required`.
        """
        ctx = cuda.current_context()
        cc_major = ctx.device.compute_capability[0]
        if cc_major < min_required:
            self.skipTest(reason)

    # CUDA Unified Memory comes in two flavors. For GPUs in the Kepler and
    # Maxwell generations, managed memory allocations work as opaque,
    # contiguous segments that can either be on the device or the host. For
    # GPUs in the Pascal or later generations, managed memory operates on a
    # per-page basis, so we can have arrays larger than GPU memory, where
    # only part of them is resident on the device at one time. To ensure
    # that this test works correctly on all supported GPUs, we'll select the
    # size of our memory such that we only oversubscribe the GPU memory if
    # we're on a Pascal or newer GPU (compute capability at least 6.0).

    def test_managed_alloc_driver_undersubscribe(self):
        msg = "Managed memory unsupported prior to CC 3.0"
        self.skip_if_cc_major_lt(3, msg)
        self._test_managed_alloc_driver(0.5)

    # This test is skipped by default because it is easy to hang the machine
    # for a very long time or get OOM killed if the GPU memory size is >50%
    # of the system memory size. Even if the system does have more than 2x
    # the RAM of the GPU, this test runs for a very long time (in comparison
    # to the rest of the tests in the suite).
    #
    # However, it is left in here for manual testing as required.
    @unittest.skip
    def test_managed_alloc_driver_oversubscribe(self):
        msg = "Oversubscription of managed memory unsupported prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_alloc_driver(2.0)

    def test_managed_alloc_driver_host_attach(self):
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        # Only test with a small array (0.01 * memory size) to keep the test
        # quick.
        self._test_managed_alloc_driver(0.01, attach_global=False)

    def _test_managed_alloc_driver(self, memory_factor, attach_global=True):
        # Verify that we can allocate and operate on managed
        # memory through the CUDA driver interface.
        total_mem_size = self.get_total_gpu_memory()
        n_bytes = int(memory_factor * total_mem_size)
        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(n_bytes, attach_global=attach_global)
        dtype = np.dtype(np.uint8)
        n_elems = n_bytes // dtype.itemsize
        ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem)
        magic = 0xab
        device_memset(mem, magic, n_bytes)
        ctx.synchronize()
        # Note that this assertion operates on the CPU, so this
        # test effectively drives both the CPU and the GPU on
        # managed memory.
        self.assertTrue(np.all(ary == magic))

    def _test_managed_array(self, attach_global=True):
        # Check the managed_array interface on both host and device.
        # Bug fix: attach_global was previously accepted but never forwarded
        # to the allocation, so the host-attach variant of this test
        # silently exercised the same configuration as the global-attach
        # one. Forward it to cuda.managed_array.
        ary = cuda.managed_array(100, dtype=np.double,
                                 attach_global=attach_global)
        ary.fill(123.456)
        self.assertTrue(all(ary == 123.456))

        @cuda.jit('void(double[:])')
        def kernel(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = 1.0

        kernel[10, 10](ary)
        cuda.current_context().synchronize()
        self.assertTrue(all(ary == 1.0))

    def test_managed_array_attach_global(self):
        self._test_managed_array()

    def test_managed_array_attach_host(self):
        # Bug fix: this test previously ran _test_managed_array() with the
        # default global attachment *before* checking the compute
        # capability, duplicating test_managed_array_attach_global and
        # partially defeating the skip. Check the CC first, then run only
        # the host-attached variant.
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_array(attach_global=False)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,54 @@
import multiprocessing as mp
import traceback
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck,
skip_if_mvc_libraries_unavailable)
from numba.tests.support import linux_only
def child_test():
    """Launch a trivial kernel with Minor Version Compatibility enabled.

    Intended to run in a spawned child process so that the config change
    cannot leak into the parent's CUDA state.
    """
    from numba import config, cuda

    # Change the MVC config after importing numba.cuda
    config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1

    @cuda.jit
    def f():
        pass

    f[1, 1]()
def child_test_wrapper(result_queue):
    """Run child_test and report (success, output-or-traceback) to the queue."""
    try:
        result = (True, child_test())
    # Catch anything raised so it can be propagated to the parent process
    except:  # noqa: E722
        result = (False, traceback.format_exc())
    result_queue.put(result)
@linux_only
@skip_under_cuda_memcheck('May hang CUDA memcheck')
@skip_on_cudasim('Simulator does not require or implement MVC')
@skip_if_mvc_libraries_unavailable
class TestMinorVersionCompatibility(CUDATestCase):
    """Launch an MVC-enabled kernel in a child process and check it succeeds."""

    def test_mvc(self):
        # Run test with Minor Version Compatibility enabled in a child
        # process
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
        proc.start()
        proc.join()
        success, output = result_queue.get()

        # Ensure the child process ran to completion before checking its
        # output
        if not success:
            self.fail(output)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,199 @@
import warnings
from llvmlite import ir
from numba.cuda.cudadrv import nvvm, runtime
from numba.cuda.testing import unittest
from numba.cuda.cudadrv.nvvm import LibDevice, NvvmError, NVVM
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestNvvmDriver(unittest.TestCase):
    """Tests of the NVVM driver wrapper: compiling IR to PTX, options,
    kernel metadata, and error reporting."""

    def get_nvvmir(self):
        # Fill the module-level nvvmir_generic template with the current
        # NVVM IR version and data layout.
        versions = NVVM().get_ir_version()
        data_layout = NVVM().data_layout
        return nvvmir_generic.format(data_layout=data_layout, v=versions)

    def test_nvvm_compile_simple(self):
        nvvmir = self.get_nvvmir()
        ptx = nvvm.compile_ir(nvvmir).decode('utf8')
        # Both functions from the template should survive into the PTX.
        self.assertTrue('simple' in ptx)
        self.assertTrue('ave' in ptx)

    def test_nvvm_compile_nullary_option(self):
        # Tests compilation with an option that doesn't take an argument
        # ("-gen-lto") - all other NVVM options are of the form
        # "-<name>=<value>"

        # -gen-lto is not available prior to CUDA 11.5
        if runtime.get_version() < (11, 5):
            self.skipTest("-gen-lto unavailable in this toolkit version")

        nvvmir = self.get_nvvmir()
        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch="compute_52")

        # Verify we correctly passed the option by checking if we got LTOIR
        # from NVVM (by looking for the expected magic number for LTOIR)
        self.assertEqual(ltoir[:4], b'\xed\x43\x4e\x7f')

    def test_nvvm_bad_option(self):
        # Ensure that unsupported / non-existent options are reported as such
        # to the user / caller
        msg = "-made-up-option=2 is an unsupported option"
        with self.assertRaisesRegex(NvvmError, msg):
            nvvm.compile_ir("", made_up_option=2)

    def test_nvvm_from_llvm(self):
        # Build a minimal kernel with llvmlite and compile it through NVVM.
        m = ir.Module("test_nvvm_from_llvm")
        m.triple = 'nvptx64-nvidia-cuda'
        nvvm.add_ir_version(m)
        fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
        kernel = ir.Function(m, fty, name='mycudakernel')
        bldr = ir.IRBuilder(kernel.append_basic_block('entry'))
        bldr.ret_void()
        nvvm.set_cuda_kernel(kernel)

        m.data_layout = NVVM().data_layout
        ptx = nvvm.compile_ir(str(m)).decode('utf8')
        self.assertTrue('mycudakernel' in ptx)
        self.assertTrue('.address_size 64' in ptx)

    def test_used_list(self):
        # Construct a module
        m = ir.Module("test_used_list")
        m.triple = 'nvptx64-nvidia-cuda'
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)

        # Add a function and mark it as a kernel
        fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
        kernel = ir.Function(m, fty, name='mycudakernel')
        bldr = ir.IRBuilder(kernel.append_basic_block('entry'))
        bldr.ret_void()
        nvvm.set_cuda_kernel(kernel)

        # Verify that the used list was correctly constructed
        used_lines = [line for line in str(m).splitlines()
                      if 'llvm.used' in line]
        msg = 'Expected exactly one @"llvm.used" array'
        self.assertEqual(len(used_lines), 1, msg)

        used_line = used_lines[0]
        # Kernel should be referenced in the used list
        self.assertIn("mycudakernel", used_line)
        # Check linkage of the used list
        self.assertIn("appending global", used_line)
        # Ensure used list is in the metadata section
        self.assertIn('section "llvm.metadata"', used_line)

    def test_nvvm_ir_verify_fail(self):
        m = ir.Module("test_bad_ir")
        m.triple = "unknown-unknown-unknown"
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)
        with self.assertRaisesRegex(NvvmError, 'Invalid target triple'):
            nvvm.compile_ir(str(m))

    def _test_nvvm_support(self, arch):
        compute_xx = 'compute_{0}{1}'.format(*arch)
        nvvmir = self.get_nvvmir()
        ptx = nvvm.compile_ir(nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0,
                              prec_div=0).decode('utf8')
        self.assertIn(".target sm_{0}{1}".format(*arch), ptx)
        self.assertIn('simple', ptx)
        self.assertIn('ave', ptx)

    def test_nvvm_support(self):
        """Test supported CC by NVVM
        """
        for arch in nvvm.get_supported_ccs():
            self._test_nvvm_support(arch=arch)

    def test_nvvm_warning(self):
        m = ir.Module("test_nvvm_warning")
        m.triple = 'nvptx64-nvidia-cuda'
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)

        fty = ir.FunctionType(ir.VoidType(), [])
        kernel = ir.Function(m, fty, name='inlinekernel')
        builder = ir.IRBuilder(kernel.append_basic_block('entry'))
        builder.ret_void()
        nvvm.set_cuda_kernel(kernel)

        # Add the noinline attribute to trigger NVVM to generate a warning
        kernel.attributes.add('noinline')

        with warnings.catch_warnings(record=True) as w:
            nvvm.compile_ir(str(m))

        self.assertEqual(len(w), 1)
        self.assertIn('overriding noinline attribute', str(w[0]))
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestArchOption(unittest.TestCase):
    """Tests of mapping compute capabilities to NVVM arch options."""

    def test_get_arch_option(self):
        # Test returning the nearest lowest arch.
        self.assertEqual(nvvm.get_arch_option(5, 3), 'compute_53')
        self.assertEqual(nvvm.get_arch_option(7, 5), 'compute_75')
        self.assertEqual(nvvm.get_arch_option(7, 7), 'compute_75')
        # Test known arch.
        supported_cc = nvvm.get_supported_ccs()
        for arch in supported_cc:
            self.assertEqual(nvvm.get_arch_option(*arch),
                             'compute_%d%d' % arch)
        # An arbitrarily high CC maps to the highest supported arch.
        self.assertEqual(nvvm.get_arch_option(1000, 0),
                         'compute_%d%d' % supported_cc[-1])
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestLibDevice(unittest.TestCase):
    """Sanity check for loading the libdevice bitcode."""

    def test_libdevice_load(self):
        # A valid LLVM bitcode file starts with the magic bytes BC\xc0\xde.
        header = LibDevice().bc[:4]
        self.assertEqual(header, b'BC\xc0\xde')
# Generic NVVM IR module used by the driver tests. It is a str.format
# template: {data_layout} and the NVVM IR version tuple {v[0]}..{v[3]} are
# substituted at test time (hence the doubled {{ }} braces in the IR body).
# It defines an averaging helper (@ave) and a kernel (@simple) registered
# via !nvvm.annotations and pinned by @llvm.used.
nvvmir_generic = '''\
target triple="nvptx64-nvidia-cuda"
target datalayout = "{data_layout}"
define i32 @ave(i32 %a, i32 %b) {{
entry:
%add = add nsw i32 %a, %b
%div = sdiv i32 %add, 2
ret i32 %div
}}
define void @simple(i32* %data) {{
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%mul = mul i32 %0, %1
%2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%add = add i32 %mul, %2
%call = call i32 @ave(i32 %add, i32 %add)
%idxprom = sext i32 %add to i64
%arrayidx = getelementptr inbounds i32, i32* %data, i64 %idxprom
store i32 %call, i32* %arrayidx, align 4
ret void
}}
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
!nvvmir.version = !{{!1}}
!1 = !{{i32 {v[0]}, i32 {v[1]}, i32 {v[2]}, i32 {v[3]}}}
!nvvm.annotations = !{{!2}}
!2 = !{{void (i32*)* @simple, !"kernel", i32 1}}
@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i32*)* @simple to i8*)], section "llvm.metadata"
''' # noqa: E501


# Allow running this test file directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,37 @@
import numpy as np
import platform
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
class TestPinned(ContextResettingTestCase):
    """Exercises host<->device copies with pinned and pageable host memory."""

    def _run_copies(self, arr):
        # Round-trip the array through the device on a dedicated stream and
        # check the contents survive unchanged.
        original = np.copy(arr)
        stream = cuda.stream()
        dev = cuda.to_device(arr, copy=False, stream=stream)
        dev.copy_to_device(arr, stream=stream)
        dev.copy_to_host(arr, stream=stream)
        stream.synchronize()
        self.assertTrue(np.allclose(arr, original))

    def test_pinned(self):
        # Use a smaller buffer on ARM platforms.
        if platform.machine().startswith(('arm', 'aarch64')):
            count = 262144    # 2MB
        else:
            count = 2097152   # 16MB
        host = np.arange(count)
        with cuda.pinned(host):
            self._run_copies(host)

    def test_unpinned(self):
        self._run_copies(np.arange(2 * 1024 * 1024))  # 16 MB


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,20 @@
import unittest
from numba.cuda.testing import ContextResettingTestCase
from numba import cuda
from numba.cuda.testing import skip_on_cudasim
@skip_on_cudasim('CUDA Profiler unsupported in the simulator')
class TestProfiler(ContextResettingTestCase):
    """Smoke test: device allocation inside cuda.profiling() must not fail."""

    def test_profiling(self):
        with cuda.profiling():
            first = cuda.device_array(10)
            del first

        with cuda.profiling():
            second = cuda.device_array(100)
            del second


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,149 @@
import multiprocessing as mp
import logging
import traceback
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import (skip_on_cudasim, skip_with_cuda_python,
skip_under_cuda_memcheck)
from numba.tests.support import linux_only
def child_test():
    """Launch a kernel from many host threads with per-thread default streams
    (PTDS) enabled, verify the results, and return the captured CUDA driver
    debug log so the parent process can inspect which driver API entry points
    were actually used.
    """
    from numba import cuda, int32, void
    from numba.core import config
    import io
    import numpy as np
    import threading

    # Enable PTDS before we make any CUDA driver calls. Enabling it first
    # ensures that PTDS APIs are used because the CUDA driver looks up API
    # functions on first use and memoizes them.
    config.CUDA_PER_THREAD_DEFAULT_STREAM = 1

    # Set up log capture for the Driver API so we can see what API calls were
    # used.
    logbuf = io.StringIO()
    handler = logging.StreamHandler(logbuf)
    cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver')
    cudadrv_logger.addHandler(handler)
    cudadrv_logger.setLevel(logging.DEBUG)

    # Set up data for our test, and copy over to the device
    N = 2 ** 16
    N_THREADS = 10
    N_ADDITIONS = 4096

    # Seed the RNG for repeatability
    np.random.seed(1)
    x = np.random.randint(low=0, high=1000, size=N, dtype=np.int32)
    r = np.zeros_like(x)

    # One input and output array for each thread
    xs = [cuda.to_device(x) for _ in range(N_THREADS)]
    rs = [cuda.to_device(r) for _ in range(N_THREADS)]

    # Compute the grid size and get the [per-thread] default stream
    n_threads = 256
    n_blocks = N // n_threads
    stream = cuda.default_stream()

    # A simple multiplication-by-addition kernel. What it does exactly is not
    # too important; only that we have a kernel that does something.
    @cuda.jit(void(int32[::1], int32[::1]))
    def f(r, x):
        i = cuda.grid(1)
        if i > len(r):
            return
        # Accumulate x into r
        for j in range(N_ADDITIONS):
            r[i] += x[i]

    # This function will be used to launch the kernel from each thread on its
    # own unique data.
    def kernel_thread(n):
        f[n_blocks, n_threads, stream](rs[n], xs[n])

    # Create threads
    threads = [threading.Thread(target=kernel_thread, args=(i,))
               for i in range(N_THREADS)]

    # Start all threads
    for thread in threads:
        thread.start()

    # Wait for all threads to finish, to ensure that we don't synchronize with
    # the device until all kernels are scheduled.
    for thread in threads:
        thread.join()

    # Synchronize with the device
    cuda.synchronize()

    # Check output is as expected
    expected = x * N_ADDITIONS
    for i in range(N_THREADS):
        np.testing.assert_equal(rs[i].copy_to_host(), expected)

    # Return the driver log output to the calling process for checking
    handler.flush()
    return logbuf.getvalue()
def child_test_wrapper(result_queue):
    """Run child_test() and report (success, output) through *result_queue*.

    On failure the formatted traceback is sent instead of the log output so
    the parent process can surface it in the test failure message.
    """
    # Catch anything raised so it can be propagated
    try:
        success, output = True, child_test()
    except:  # noqa: E722
        success, output = False, traceback.format_exc()
    result_queue.put((success, output))
# Run on Linux only until the reason for test hangs on Windows (Issue #8635,
# https://github.com/numba/numba/issues/8635) is diagnosed
@linux_only
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
@skip_on_cudasim('Streams not supported on the simulator')
class TestPTDS(CUDATestCase):
    """Verifies that enabling PTDS makes the driver use the _ptds/_ptsz
    variants of memcpy/launch API functions (checked via the driver log
    captured in a child process)."""

    @skip_with_cuda_python('Function names unchanged for PTDS with NV Binding')
    def test_ptds(self):
        # Run a test with PTDS enabled in a child process ('spawn' so that
        # the child initializes CUDA from scratch with PTDS configured).
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
        proc.start()
        proc.join()
        success, output = result_queue.get()

        # Ensure the child process ran to completion before checking its
        # output
        if not success:
            self.fail(output)

        # Functions with a per-thread default stream variant that we expect
        # to see in the output
        ptds_functions = ('cuMemcpyHtoD_v2_ptds', 'cuLaunchKernel_ptsz',
                          'cuMemcpyDtoH_v2_ptds')
        for fn in ptds_functions:
            with self.subTest(fn=fn, expected=True):
                self.assertIn(fn, output)

        # Non-PTDS versions of the functions that we should not see in the
        # output:
        legacy_functions = ('cuMemcpyHtoD_v2', 'cuLaunchKernel',
                            'cuMemcpyDtoH_v2')
        for fn in legacy_functions:
            with self.subTest(fn=fn, expected=False):
                # Ensure we only spot these function names appearing without a
                # _ptds or _ptsz suffix by checking including the end of the
                # line in the log
                fn_at_end = f'{fn}\n'
                self.assertNotIn(fn_at_end, output)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,36 @@
import threading
from numba import cuda
from numba.cuda.cudadrv.driver import driver
from numba.cuda.testing import unittest, ContextResettingTestCase
from queue import Queue
class TestResetDevice(ContextResettingTestCase):
    """Checks that devices can be selected and closed repeatedly."""

    def test_reset_device(self):
        # Do the work on a separate thread so that we don't affect the
        # current context in the main thread.
        errors = Queue()

        def worker(errors):
            try:
                device_ids = range(driver.get_device_count())
                for _ in range(2):
                    for device_id in device_ids:
                        cuda.select_device(device_id)
                        cuda.close()
            except Exception as e:
                errors.put(e)

        t = threading.Thread(target=worker, args=(errors,))
        t.start()
        t.join()

        raised = []
        while not errors.empty():
            raised.append(errors.get())
        self.assertEqual(raised, [])


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,85 @@
import multiprocessing
import os
from numba.core import config
from numba.cuda.cudadrv.runtime import runtime
from numba.cuda.testing import unittest, SerialMixin, skip_on_cudasim
from unittest.mock import patch
def set_visible_devices_and_check(q):
    """Import Numba, then restrict visibility to device 0, and report the
    number of GPUs seen (or -1 on any error) through the queue *q*.

    The environment variable is deliberately set *after* the import — that
    ordering is the point of the test that spawns this function.
    """
    try:
        from numba import cuda
        import os

        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        visible = cuda.gpus.lst
        q.put(len(visible))
    except:  # noqa: E722
        # Sentinel value for error executing test code
        q.put(-1)
# Runtime versions the tests treat as supported. Under the simulator there is
# no real CUDA runtime, so a single (-1, -1) placeholder version is used.
if config.ENABLE_CUDASIM:
    SUPPORTED_VERSIONS = (-1, -1),
else:
    SUPPORTED_VERSIONS = ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5),
                          (11, 6), (11, 7))
class TestRuntime(unittest.TestCase):
    """Checks runtime-version support detection against mocked versions."""

    def test_is_supported_version_true(self):
        for version in SUPPORTED_VERSIONS:
            with patch.object(runtime, 'get_version', return_value=version):
                self.assertTrue(runtime.is_supported_version())

    @skip_on_cudasim('The simulator always simulates a supported runtime')
    def test_is_supported_version_false(self):
        # Check with an old unsupported version and some potential future
        # versions
        unsupported = ((10, 2), (11, 8), (12, 0))
        for version in unsupported:
            with patch.object(runtime, 'get_version', return_value=version):
                self.assertFalse(runtime.is_supported_version())

    def test_supported_versions(self):
        self.assertEqual(SUPPORTED_VERSIONS, runtime.supported_versions)
class TestVisibleDevices(unittest.TestCase, SerialMixin):
    def test_visible_devices_set_after_import(self):
        # See Issue #6149. This test checks that we can set
        # CUDA_VISIBLE_DEVICES after importing Numba and have the value
        # reflected in the available list of GPUs. Prior to the fix for this
        # issue, Numba made a call to runtime.get_version() on import that
        # initialized the driver and froze the list of available devices
        # before CUDA_VISIBLE_DEVICES could be set by the user.

        # Avoid importing cuda at the top level so that
        # set_visible_devices_and_check gets to import it first in its process
        from numba import cuda

        if len(cuda.gpus.lst) in (0, 1):
            self.skipTest('This test requires multiple GPUs')

        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            msg = 'Cannot test when CUDA_VISIBLE_DEVICES already set'
            self.skipTest(msg)

        ctx = multiprocessing.get_context('spawn')
        q = ctx.Queue()
        p = ctx.Process(target=set_visible_devices_and_check, args=(q,))
        p.start()
        try:
            # Drain the queue before joining: per the multiprocessing docs,
            # joining a process before its queued items are consumed can
            # deadlock.
            visible_gpu_count = q.get()
        finally:
            p.join()

        # Make an obvious distinction between an error running the test code
        # and an incorrect number of GPUs in the list
        msg = 'Error running set_visible_devices_and_check'
        self.assertNotEqual(visible_gpu_count, -1, msg=msg)

        # The actual check that we see only one GPU
        self.assertEqual(visible_gpu_count, 1)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,41 @@
#
# Test does not work on some cards.
#
import threading
from queue import Queue
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
def newthread(exception_queue):
    """Select device 0, round-trip an array through it, then close the
    context; any exception raised is forwarded via *exception_queue*.
    """
    try:
        cuda.select_device(0)
        stream = cuda.stream()
        host_arr = np.arange(100)
        dev_arr = cuda.to_device(host_arr, stream=stream)
        stream.synchronize()
        del dev_arr
        del stream
        cuda.close()
    except Exception as e:
        exception_queue.put(e)
class TestSelectDevice(ContextResettingTestCase):
    """Repeatedly creates and tears down a context from fresh threads."""

    def test_select_device(self):
        exception_queue = Queue()
        # Run each iteration on its own thread so the context is always
        # created and destroyed from a non-main thread.
        for _ in range(10):
            worker = threading.Thread(target=newthread,
                                      args=(exception_queue,))
            worker.start()
            worker.join()

        raised = []
        while not exception_queue.empty():
            raised.append(exception_queue.get())
        self.assertEqual(raised, [])


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,122 @@
import asyncio
import functools
import threading
import numpy as np
from numba import cuda
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
def with_asyncio_loop(f):
    """Decorator that runs coroutine function *f* to completion on a fresh
    event loop (with debug mode enabled), closing the loop afterwards.
    """
    @functools.wraps(f)
    def runner(*args, **kwds):
        event_loop = asyncio.new_event_loop()
        event_loop.set_debug(True)
        try:
            result = event_loop.run_until_complete(f(*args, **kwds))
        finally:
            event_loop.close()
        return result
    return runner
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestCudaStream(CUDATestCase):
    """Tests for stream callbacks and asyncio integration of CUDA streams."""

    def test_add_callback(self):
        # The callback signals a threading.Event so the test can wait (with a
        # timeout) for the stream to invoke it.
        def callback(stream, status, event):
            event.set()

        stream = cuda.stream()
        callback_event = threading.Event()
        stream.add_callback(callback, callback_event)
        self.assertTrue(callback_event.wait(1.0))

    def test_add_callback_with_default_arg(self):
        callback_event = threading.Event()

        # When add_callback is given no user argument, the callback should
        # receive None.
        def callback(stream, status, arg):
            self.assertIsNone(arg)
            callback_event.set()

        stream = cuda.stream()
        stream.add_callback(callback)
        self.assertTrue(callback_event.wait(1.0))

    @with_asyncio_loop
    async def test_async_done(self):
        stream = cuda.stream()
        await stream.async_done()

    @with_asyncio_loop
    async def test_parallel_tasks(self):
        # Each task round-trips a value through the device on its own stream
        # and awaits that stream's completion.
        async def async_cuda_fn(value_in: float) -> float:
            stream = cuda.stream()
            h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
            h_src[:] = value_in
            d_ary = cuda.to_device(h_src, stream=stream)
            d_ary.copy_to_host(h_dst, stream=stream)
            done_result = await stream.async_done()
            self.assertEqual(done_result, stream)
            return h_dst.mean()

        values_in = [1, 2, 3, 4]
        tasks = [asyncio.create_task(async_cuda_fn(v)) for v in values_in]
        values_out = await asyncio.gather(*tasks)
        self.assertTrue(np.allclose(values_in, values_out))

    @with_asyncio_loop
    async def test_multiple_async_done(self):
        # Multiple waiters on the same stream must all complete.
        stream = cuda.stream()
        done_aws = [stream.async_done() for _ in range(4)]
        done = await asyncio.gather(*done_aws)
        for d in done:
            self.assertEqual(d, stream)

    @with_asyncio_loop
    async def test_multiple_async_done_multiple_streams(self):
        streams = [cuda.stream() for _ in range(4)]
        done_aws = [stream.async_done() for stream in streams]
        done = await asyncio.gather(*done_aws)
        # Ensure we got the four original streams in done
        self.assertSetEqual(set(done), set(streams))

    @with_asyncio_loop
    async def test_cancelled_future(self):
        # Cancelling one future must not prevent another waiter on the same
        # stream from completing.
        stream = cuda.stream()
        done1, done2 = stream.async_done(), stream.async_done()
        done1.cancel()
        await done2
        self.assertTrue(done1.cancelled())
        self.assertTrue(done2.done())
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestFailingStream(CUDATestCase):
    # This test can only be run in isolation because it corrupts the CUDA
    # context, which cannot be recovered from within the same process. It is
    # left here so that it can be run manually for debugging / testing purposes
    # - or may be re-enabled if in future there is infrastructure added for
    # running tests in a separate process (a subprocess cannot be used because
    # CUDA will have been initialized before the fork, so it cannot be used in
    # the child process).
    @unittest.skip
    @with_asyncio_loop
    async def test_failed_stream(self):
        # Launch a kernel that traps, then check the stream's async_done()
        # future surfaces the failure as an exception.
        ctx = cuda.current_context()
        module = ctx.create_module_ptx("""
.version 6.5
.target sm_30
.address_size 64
.visible .entry failing_kernel() { trap; }
""")
        failing_kernel = module.get_function("failing_kernel")
        stream = cuda.stream()
        failing_kernel.configure((1,), (1,), stream=stream).__call__()
        done = stream.async_done()
        with self.assertRaises(Exception):
            await done
        self.assertIsNotNone(done.exception())


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,8 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
    """Standard unittest load_tests hook: initialize the supported compute
    capabilities, then collect every test in this directory."""
    ensure_supported_ccs_initialized()
    suite_dir = os.path.dirname(__file__)
    return load_testsuite(loader, suite_dir)

Some files were not shown because too many files have changed in this diff Show More