Videre
This commit is contained in:
@@ -0,0 +1,24 @@
|
||||
from numba.cuda.testing import ensure_supported_ccs_initialized
|
||||
from numba.testing import unittest
|
||||
from numba.testing import load_testsuite
|
||||
from numba import cuda
|
||||
from os.path import dirname, join
|
||||
|
||||
|
||||
def load_tests(loader, tests, pattern):
|
||||
suite = unittest.TestSuite()
|
||||
this_dir = dirname(__file__)
|
||||
ensure_supported_ccs_initialized()
|
||||
suite.addTests(load_testsuite(loader, join(this_dir, 'nocuda')))
|
||||
if cuda.is_available():
|
||||
suite.addTests(load_testsuite(loader, join(this_dir, 'cudasim')))
|
||||
gpus = cuda.list_devices()
|
||||
if gpus and gpus[0].compute_capability >= (2, 0):
|
||||
suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv')))
|
||||
suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy')))
|
||||
suite.addTests(load_testsuite(loader, join(this_dir, 'doc_examples')))
|
||||
else:
|
||||
print("skipped CUDA tests because GPU CC < 2.0")
|
||||
else:
|
||||
print("skipped CUDA tests")
|
||||
return suite
|
||||
Binary file not shown.
@@ -0,0 +1,8 @@
|
||||
from numba.cuda.testing import ensure_supported_ccs_initialized
|
||||
from numba.testing import load_testsuite
|
||||
import os
|
||||
|
||||
|
||||
def load_tests(loader, tests, pattern):
|
||||
ensure_supported_ccs_initialized()
|
||||
return load_testsuite(loader, os.path.dirname(__file__))
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,145 @@
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
||||
|
||||
|
||||
class TestArrayAttr(CUDATestCase):
|
||||
|
||||
def test_contigous_2d(self):
|
||||
ary = np.arange(10)
|
||||
cary = ary.reshape(2, 5)
|
||||
fary = np.asfortranarray(cary)
|
||||
|
||||
dcary = cuda.to_device(cary)
|
||||
dfary = cuda.to_device(fary)
|
||||
self.assertTrue(dcary.is_c_contiguous())
|
||||
self.assertTrue(not dfary.is_c_contiguous())
|
||||
self.assertTrue(not dcary.is_f_contiguous())
|
||||
self.assertTrue(dfary.is_f_contiguous())
|
||||
|
||||
def test_contigous_3d(self):
|
||||
ary = np.arange(20)
|
||||
cary = ary.reshape(2, 5, 2)
|
||||
fary = np.asfortranarray(cary)
|
||||
|
||||
dcary = cuda.to_device(cary)
|
||||
dfary = cuda.to_device(fary)
|
||||
self.assertTrue(dcary.is_c_contiguous())
|
||||
self.assertTrue(not dfary.is_c_contiguous())
|
||||
self.assertTrue(not dcary.is_f_contiguous())
|
||||
self.assertTrue(dfary.is_f_contiguous())
|
||||
|
||||
def test_contigous_4d(self):
|
||||
ary = np.arange(60)
|
||||
cary = ary.reshape(2, 5, 2, 3)
|
||||
fary = np.asfortranarray(cary)
|
||||
|
||||
dcary = cuda.to_device(cary)
|
||||
dfary = cuda.to_device(fary)
|
||||
self.assertTrue(dcary.is_c_contiguous())
|
||||
self.assertTrue(not dfary.is_c_contiguous())
|
||||
self.assertTrue(not dcary.is_f_contiguous())
|
||||
self.assertTrue(dfary.is_f_contiguous())
|
||||
|
||||
def test_ravel_1d(self):
|
||||
ary = np.arange(60)
|
||||
dary = cuda.to_device(ary)
|
||||
for order in 'CFA':
|
||||
expect = ary.ravel(order=order)
|
||||
dflat = dary.ravel(order=order)
|
||||
flat = dflat.copy_to_host()
|
||||
self.assertTrue(dary is not dflat) # ravel returns new array
|
||||
self.assertEqual(flat.ndim, 1)
|
||||
self.assertPreciseEqual(expect, flat)
|
||||
|
||||
@skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
|
||||
def test_ravel_stride_1d(self):
|
||||
ary = np.arange(60)
|
||||
dary = cuda.to_device(ary)
|
||||
# No-copy stride device array
|
||||
darystride = dary[::2]
|
||||
dary_data = dary.__cuda_array_interface__['data'][0]
|
||||
ddarystride_data = darystride.__cuda_array_interface__['data'][0]
|
||||
self.assertEqual(dary_data, ddarystride_data)
|
||||
# Fail on ravel on non-contiguous array
|
||||
with self.assertRaises(NotImplementedError):
|
||||
darystride.ravel()
|
||||
|
||||
def test_ravel_c(self):
|
||||
ary = np.arange(60)
|
||||
reshaped = ary.reshape(2, 5, 2, 3)
|
||||
|
||||
expect = reshaped.ravel(order='C')
|
||||
dary = cuda.to_device(reshaped)
|
||||
dflat = dary.ravel()
|
||||
flat = dflat.copy_to_host()
|
||||
self.assertTrue(dary is not dflat)
|
||||
self.assertEqual(flat.ndim, 1)
|
||||
self.assertPreciseEqual(expect, flat)
|
||||
|
||||
# explicit order kwarg
|
||||
for order in 'CA':
|
||||
expect = reshaped.ravel(order=order)
|
||||
dary = cuda.to_device(reshaped)
|
||||
dflat = dary.ravel(order=order)
|
||||
flat = dflat.copy_to_host()
|
||||
self.assertTrue(dary is not dflat)
|
||||
self.assertEqual(flat.ndim, 1)
|
||||
self.assertPreciseEqual(expect, flat)
|
||||
|
||||
@skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
|
||||
def test_ravel_stride_c(self):
|
||||
ary = np.arange(60)
|
||||
reshaped = ary.reshape(2, 5, 2, 3)
|
||||
|
||||
dary = cuda.to_device(reshaped)
|
||||
darystride = dary[::2, ::2, ::2, ::2]
|
||||
dary_data = dary.__cuda_array_interface__['data'][0]
|
||||
ddarystride_data = darystride.__cuda_array_interface__['data'][0]
|
||||
self.assertEqual(dary_data, ddarystride_data)
|
||||
with self.assertRaises(NotImplementedError):
|
||||
darystride.ravel()
|
||||
|
||||
def test_ravel_f(self):
|
||||
ary = np.arange(60)
|
||||
reshaped = np.asfortranarray(ary.reshape(2, 5, 2, 3))
|
||||
for order in 'FA':
|
||||
expect = reshaped.ravel(order=order)
|
||||
dary = cuda.to_device(reshaped)
|
||||
dflat = dary.ravel(order=order)
|
||||
flat = dflat.copy_to_host()
|
||||
self.assertTrue(dary is not dflat)
|
||||
self.assertEqual(flat.ndim, 1)
|
||||
self.assertPreciseEqual(expect, flat)
|
||||
|
||||
@skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
|
||||
def test_ravel_stride_f(self):
|
||||
ary = np.arange(60)
|
||||
reshaped = np.asfortranarray(ary.reshape(2, 5, 2, 3))
|
||||
dary = cuda.to_device(reshaped)
|
||||
darystride = dary[::2, ::2, ::2, ::2]
|
||||
dary_data = dary.__cuda_array_interface__['data'][0]
|
||||
ddarystride_data = darystride.__cuda_array_interface__['data'][0]
|
||||
self.assertEqual(dary_data, ddarystride_data)
|
||||
with self.assertRaises(NotImplementedError):
|
||||
darystride.ravel()
|
||||
|
||||
def test_reshape_c(self):
|
||||
ary = np.arange(10)
|
||||
expect = ary.reshape(2, 5)
|
||||
dary = cuda.to_device(ary)
|
||||
dary_reshaped = dary.reshape(2, 5)
|
||||
got = dary_reshaped.copy_to_host()
|
||||
self.assertPreciseEqual(expect, got)
|
||||
|
||||
def test_reshape_f(self):
|
||||
ary = np.arange(10)
|
||||
expect = ary.reshape(2, 5, order='F')
|
||||
dary = cuda.to_device(ary)
|
||||
dary_reshaped = dary.reshape(2, 5, order='F')
|
||||
got = dary_reshaped.copy_to_host()
|
||||
self.assertPreciseEqual(expect, got)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,145 @@
|
||||
import numbers
|
||||
from ctypes import byref
|
||||
import weakref
|
||||
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
||||
from numba.cuda.cudadrv import driver
|
||||
|
||||
|
||||
class TestContextStack(CUDATestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Reset before testing
|
||||
cuda.close()
|
||||
|
||||
def test_gpus_current(self):
|
||||
self.assertIs(cuda.gpus.current, None)
|
||||
with cuda.gpus[0]:
|
||||
self.assertEqual(int(cuda.gpus.current.id), 0)
|
||||
|
||||
def test_gpus_len(self):
|
||||
self.assertGreater(len(cuda.gpus), 0)
|
||||
|
||||
def test_gpus_iter(self):
|
||||
gpulist = list(cuda.gpus)
|
||||
self.assertGreater(len(gpulist), 0)
|
||||
|
||||
|
||||
class TestContextAPI(CUDATestCase):
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
cuda.close()
|
||||
|
||||
def test_context_memory(self):
|
||||
try:
|
||||
mem = cuda.current_context().get_memory_info()
|
||||
except NotImplementedError:
|
||||
self.skipTest('EMM Plugin does not implement get_memory_info()')
|
||||
|
||||
self.assertIsInstance(mem.free, numbers.Number)
|
||||
self.assertEqual(mem.free, mem[0])
|
||||
|
||||
self.assertIsInstance(mem.total, numbers.Number)
|
||||
self.assertEqual(mem.total, mem[1])
|
||||
|
||||
self.assertLessEqual(mem.free, mem.total)
|
||||
|
||||
@unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
|
||||
@skip_on_cudasim('CUDA HW required')
|
||||
def test_forbidden_context_switch(self):
|
||||
# Cannot switch context inside a `cuda.require_context`
|
||||
@cuda.require_context
|
||||
def switch_gpu():
|
||||
with cuda.gpus[1]:
|
||||
pass
|
||||
|
||||
with cuda.gpus[0]:
|
||||
with self.assertRaises(RuntimeError) as raises:
|
||||
switch_gpu()
|
||||
|
||||
self.assertIn("Cannot switch CUDA-context.", str(raises.exception))
|
||||
|
||||
@unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
|
||||
def test_accepted_context_switch(self):
|
||||
def switch_gpu():
|
||||
with cuda.gpus[1]:
|
||||
return cuda.current_context().device.id
|
||||
|
||||
with cuda.gpus[0]:
|
||||
devid = switch_gpu()
|
||||
self.assertEqual(int(devid), 1)
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA HW required')
|
||||
class Test3rdPartyContext(CUDATestCase):
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
cuda.close()
|
||||
|
||||
def test_attached_primary(self, extra_work=lambda: None):
|
||||
# Emulate primary context creation by 3rd party
|
||||
the_driver = driver.driver
|
||||
if driver.USE_NV_BINDING:
|
||||
dev = driver.binding.CUdevice(0)
|
||||
hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
|
||||
else:
|
||||
dev = 0
|
||||
hctx = driver.drvapi.cu_context()
|
||||
the_driver.cuDevicePrimaryCtxRetain(byref(hctx), dev)
|
||||
try:
|
||||
ctx = driver.Context(weakref.proxy(self), hctx)
|
||||
ctx.push()
|
||||
# Check that the context from numba matches the created primary
|
||||
# context.
|
||||
my_ctx = cuda.current_context()
|
||||
if driver.USE_NV_BINDING:
|
||||
self.assertEqual(int(my_ctx.handle), int(ctx.handle))
|
||||
else:
|
||||
self.assertEqual(my_ctx.handle.value, ctx.handle.value)
|
||||
|
||||
extra_work()
|
||||
finally:
|
||||
ctx.pop()
|
||||
the_driver.cuDevicePrimaryCtxRelease(dev)
|
||||
|
||||
def test_attached_non_primary(self):
|
||||
# Emulate non-primary context creation by 3rd party
|
||||
the_driver = driver.driver
|
||||
if driver.USE_NV_BINDING:
|
||||
flags = 0
|
||||
dev = driver.binding.CUdevice(0)
|
||||
hctx = the_driver.cuCtxCreate(flags, dev)
|
||||
else:
|
||||
hctx = driver.drvapi.cu_context()
|
||||
the_driver.cuCtxCreate(byref(hctx), 0, 0)
|
||||
try:
|
||||
cuda.current_context()
|
||||
except RuntimeError as e:
|
||||
# Expecting an error about non-primary CUDA context
|
||||
self.assertIn("Numba cannot operate on non-primary CUDA context ",
|
||||
str(e))
|
||||
else:
|
||||
self.fail("No RuntimeError raised")
|
||||
finally:
|
||||
the_driver.cuCtxDestroy(hctx)
|
||||
|
||||
def test_cudajit_in_attached_primary_context(self):
|
||||
def do():
|
||||
from numba import cuda
|
||||
|
||||
@cuda.jit
|
||||
def foo(a):
|
||||
for i in range(a.size):
|
||||
a[i] = i
|
||||
|
||||
a = cuda.device_array(10)
|
||||
foo[1, 1](a)
|
||||
self.assertEqual(list(a.copy_to_host()), list(range(10)))
|
||||
|
||||
self.test_attached_primary(do)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,376 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
||||
from unittest.mock import patch
|
||||
|
||||
|
||||
class CudaArrayIndexing(CUDATestCase):
|
||||
def test_index_1d(self):
|
||||
arr = np.arange(10)
|
||||
darr = cuda.to_device(arr)
|
||||
x, = arr.shape
|
||||
for i in range(-x, x):
|
||||
self.assertEqual(arr[i], darr[i])
|
||||
with self.assertRaises(IndexError):
|
||||
darr[-x - 1]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[x]
|
||||
|
||||
def test_index_2d(self):
|
||||
arr = np.arange(3 * 4).reshape(3, 4)
|
||||
darr = cuda.to_device(arr)
|
||||
x, y = arr.shape
|
||||
for i in range(-x, x):
|
||||
for j in range(-y, y):
|
||||
self.assertEqual(arr[i, j], darr[i, j])
|
||||
with self.assertRaises(IndexError):
|
||||
darr[-x - 1, 0]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[x, 0]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[0, -y - 1]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[0, y]
|
||||
|
||||
def test_index_3d(self):
|
||||
arr = np.arange(3 * 4 * 5).reshape(3, 4, 5)
|
||||
darr = cuda.to_device(arr)
|
||||
x, y, z = arr.shape
|
||||
for i in range(-x, x):
|
||||
for j in range(-y, y):
|
||||
for k in range(-z, z):
|
||||
self.assertEqual(arr[i, j, k], darr[i, j, k])
|
||||
with self.assertRaises(IndexError):
|
||||
darr[-x - 1, 0, 0]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[x, 0, 0]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[0, -y - 1, 0]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[0, y, 0]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[0, 0, -z - 1]
|
||||
with self.assertRaises(IndexError):
|
||||
darr[0, 0, z]
|
||||
|
||||
|
||||
class CudaArrayStridedSlice(CUDATestCase):
|
||||
|
||||
def test_strided_index_1d(self):
|
||||
arr = np.arange(10)
|
||||
darr = cuda.to_device(arr)
|
||||
for i in range(arr.size):
|
||||
np.testing.assert_equal(arr[i::2], darr[i::2].copy_to_host())
|
||||
|
||||
def test_strided_index_2d(self):
|
||||
arr = np.arange(6 * 7).reshape(6, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
|
||||
for i in range(arr.shape[0]):
|
||||
for j in range(arr.shape[1]):
|
||||
np.testing.assert_equal(arr[i::2, j::2],
|
||||
darr[i::2, j::2].copy_to_host())
|
||||
|
||||
def test_strided_index_3d(self):
|
||||
arr = np.arange(6 * 7 * 8).reshape(6, 7, 8)
|
||||
darr = cuda.to_device(arr)
|
||||
|
||||
for i in range(arr.shape[0]):
|
||||
for j in range(arr.shape[1]):
|
||||
for k in range(arr.shape[2]):
|
||||
np.testing.assert_equal(
|
||||
arr[i::2, j::2, k::2],
|
||||
darr[i::2, j::2, k::2].copy_to_host())
|
||||
|
||||
|
||||
class CudaArraySlicing(CUDATestCase):
|
||||
def test_prefix_1d(self):
|
||||
arr = np.arange(5)
|
||||
darr = cuda.to_device(arr)
|
||||
for i in range(arr.size):
|
||||
expect = arr[i:]
|
||||
got = darr[i:].copy_to_host()
|
||||
self.assertTrue(np.all(expect == got))
|
||||
|
||||
def test_prefix_2d(self):
|
||||
arr = np.arange(3 ** 2).reshape(3, 3)
|
||||
darr = cuda.to_device(arr)
|
||||
for i in range(arr.shape[0]):
|
||||
for j in range(arr.shape[1]):
|
||||
expect = arr[i:, j:]
|
||||
sliced = darr[i:, j:]
|
||||
self.assertEqual(expect.shape, sliced.shape)
|
||||
self.assertEqual(expect.strides, sliced.strides)
|
||||
got = sliced.copy_to_host()
|
||||
self.assertTrue(np.all(expect == got))
|
||||
|
||||
def test_select_3d_first_two_dim(self):
|
||||
arr = np.arange(3 * 4 * 5).reshape(3, 4, 5)
|
||||
darr = cuda.to_device(arr)
|
||||
# Select first dimension
|
||||
for i in range(arr.shape[0]):
|
||||
expect = arr[i]
|
||||
sliced = darr[i]
|
||||
self.assertEqual(expect.shape, sliced.shape)
|
||||
self.assertEqual(expect.strides, sliced.strides)
|
||||
got = sliced.copy_to_host()
|
||||
self.assertTrue(np.all(expect == got))
|
||||
# Select second dimension
|
||||
for i in range(arr.shape[0]):
|
||||
for j in range(arr.shape[1]):
|
||||
expect = arr[i, j]
|
||||
sliced = darr[i, j]
|
||||
self.assertEqual(expect.shape, sliced.shape)
|
||||
self.assertEqual(expect.strides, sliced.strides)
|
||||
got = sliced.copy_to_host()
|
||||
self.assertTrue(np.all(expect == got))
|
||||
|
||||
def test_select_f(self):
|
||||
a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order='F')
|
||||
da = cuda.to_device(a)
|
||||
|
||||
for i in range(a.shape[0]):
|
||||
for j in range(a.shape[1]):
|
||||
self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(),
|
||||
a[i, j, :]))
|
||||
for j in range(a.shape[2]):
|
||||
self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(),
|
||||
a[i, :, j]))
|
||||
for i in range(a.shape[1]):
|
||||
for j in range(a.shape[2]):
|
||||
self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(),
|
||||
a[:, i, j]))
|
||||
|
||||
def test_select_c(self):
|
||||
a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order='C')
|
||||
da = cuda.to_device(a)
|
||||
|
||||
for i in range(a.shape[0]):
|
||||
for j in range(a.shape[1]):
|
||||
self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(),
|
||||
a[i, j, :]))
|
||||
for j in range(a.shape[2]):
|
||||
self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(),
|
||||
a[i, :, j]))
|
||||
for i in range(a.shape[1]):
|
||||
for j in range(a.shape[2]):
|
||||
self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(),
|
||||
a[:, i, j]))
|
||||
|
||||
def test_prefix_select(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7, order='F')
|
||||
|
||||
darr = cuda.to_device(arr)
|
||||
self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1]))
|
||||
|
||||
def test_negative_slicing_1d(self):
|
||||
arr = np.arange(10)
|
||||
darr = cuda.to_device(arr)
|
||||
for i, j in product(range(-10, 10), repeat=2):
|
||||
np.testing.assert_array_equal(arr[i:j],
|
||||
darr[i:j].copy_to_host())
|
||||
|
||||
def test_negative_slicing_2d(self):
|
||||
arr = np.arange(12).reshape(3, 4)
|
||||
darr = cuda.to_device(arr)
|
||||
for x, y, w, s in product(range(-4, 4), repeat=4):
|
||||
np.testing.assert_array_equal(arr[x:y, w:s],
|
||||
darr[x:y, w:s].copy_to_host())
|
||||
|
||||
def test_empty_slice_1d(self):
|
||||
arr = np.arange(5)
|
||||
darr = cuda.to_device(arr)
|
||||
for i in range(darr.shape[0]):
|
||||
np.testing.assert_array_equal(darr[i:i].copy_to_host(), arr[i:i])
|
||||
# empty slice of empty slice
|
||||
np.testing.assert_array_equal(darr[:0][:0].copy_to_host(), np.empty(0))
|
||||
# out-of-bound slice just produces empty slices
|
||||
np.testing.assert_array_equal(darr[:0][:1].copy_to_host(),
|
||||
arr[:0][:1])
|
||||
np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(),
|
||||
arr[:0][-1:])
|
||||
|
||||
def test_empty_slice_2d(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
np.testing.assert_array_equal(darr[:0].copy_to_host(), arr[:0])
|
||||
np.testing.assert_array_equal(darr[3, :0].copy_to_host(), arr[3, :0])
|
||||
# empty slice of empty slice
|
||||
np.testing.assert_array_equal(darr[:0][:0].copy_to_host(),
|
||||
np.empty((0, 7)))
|
||||
# out-of-bound slice just produces empty slices
|
||||
np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1])
|
||||
np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(),
|
||||
arr[:0][-1:])
|
||||
|
||||
|
||||
class CudaArraySetting(CUDATestCase):
|
||||
"""
|
||||
Most of the slicing logic is tested in the cases above, so these
|
||||
tests focus on the setting logic.
|
||||
"""
|
||||
|
||||
def test_scalar(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
arr[2, 2] = 500
|
||||
darr[2, 2] = 500
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_rank(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
arr[2] = 500
|
||||
darr[2] = 500
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_broadcast(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
arr[:, 2] = 500
|
||||
darr[:, 2] = 500
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_array_assign_column(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
_400 = np.full(shape=7, fill_value=400)
|
||||
arr[2] = _400
|
||||
darr[2] = _400
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_array_assign_row(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
_400 = np.full(shape=5, fill_value=400)
|
||||
arr[:, 2] = _400
|
||||
darr[:, 2] = _400
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_array_assign_subarray(self):
|
||||
arr = np.arange(5 * 6 * 7).reshape(5, 6, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
_400 = np.full(shape=(6, 7), fill_value=400)
|
||||
arr[2] = _400
|
||||
darr[2] = _400
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_array_assign_deep_subarray(self):
|
||||
arr = np.arange(5 * 6 * 7 * 8).reshape(5, 6, 7, 8)
|
||||
darr = cuda.to_device(arr)
|
||||
_400 = np.full(shape=(5, 6, 8), fill_value=400)
|
||||
arr[:, :, 2] = _400
|
||||
darr[:, :, 2] = _400
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_array_assign_all(self):
|
||||
arr = np.arange(5 * 7).reshape(5, 7)
|
||||
darr = cuda.to_device(arr)
|
||||
_400 = np.full(shape=(5, 7), fill_value=400)
|
||||
arr[:] = _400
|
||||
darr[:] = _400
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_strides(self):
|
||||
arr = np.ones(20)
|
||||
darr = cuda.to_device(arr)
|
||||
arr[::2] = 500
|
||||
darr[::2] = 500
|
||||
np.testing.assert_array_equal(darr.copy_to_host(), arr)
|
||||
|
||||
def test_incompatible_highdim(self):
|
||||
darr = cuda.to_device(np.arange(5 * 7))
|
||||
|
||||
with self.assertRaises(ValueError) as e:
|
||||
darr[:] = np.ones(shape=(1, 2, 3))
|
||||
|
||||
self.assertIn(
|
||||
member=str(e.exception),
|
||||
container=[
|
||||
"Can't assign 3-D array to 1-D self", # device
|
||||
"could not broadcast input array from shape (2,3) "
|
||||
"into shape (35,)", # simulator, NP >= 1.20
|
||||
])
|
||||
|
||||
def test_incompatible_shape(self):
|
||||
darr = cuda.to_device(np.arange(5))
|
||||
|
||||
with self.assertRaises(ValueError) as e:
|
||||
darr[:] = [1, 3]
|
||||
|
||||
self.assertIn(
|
||||
member=str(e.exception),
|
||||
container=[
|
||||
"Can't copy sequence with size 2 to array axis 0 with "
|
||||
"dimension 5", # device
|
||||
"could not broadcast input array from shape (2,) into "
|
||||
"shape (5,)", # simulator, NP >= 1.20
|
||||
])
|
||||
|
||||
@skip_on_cudasim('cudasim does not use streams and operates synchronously')
|
||||
def test_sync(self):
|
||||
# There should be a synchronization when no stream is supplied
|
||||
darr = cuda.to_device(np.arange(5))
|
||||
|
||||
with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
|
||||
return_value=None) as mock_sync:
|
||||
darr[0] = 10
|
||||
|
||||
mock_sync.assert_called_once()
|
||||
|
||||
@skip_on_cudasim('cudasim does not use streams and operates synchronously')
|
||||
def test_no_sync_default_stream(self):
|
||||
# There should not be a synchronization when the array has a default
|
||||
# stream, whether it is the default stream, the legacy default stream,
|
||||
# the per-thread default stream, or another stream.
|
||||
streams = (cuda.stream(), cuda.default_stream(),
|
||||
cuda.legacy_default_stream(),
|
||||
cuda.per_thread_default_stream())
|
||||
|
||||
for stream in streams:
|
||||
darr = cuda.to_device(np.arange(5), stream=stream)
|
||||
|
||||
with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
|
||||
return_value=None) as mock_sync:
|
||||
darr[0] = 10
|
||||
|
||||
mock_sync.assert_not_called()
|
||||
|
||||
@skip_on_cudasim('cudasim does not use streams and operates synchronously')
|
||||
def test_no_sync_supplied_stream(self):
|
||||
# There should not be a synchronization when a stream is supplied for
|
||||
# the setitem call, whether it is the default stream, the legacy default
|
||||
# stream, the per-thread default stream, or another stream.
|
||||
streams = (cuda.stream(), cuda.default_stream(),
|
||||
cuda.legacy_default_stream(),
|
||||
cuda.per_thread_default_stream())
|
||||
|
||||
for stream in streams:
|
||||
darr = cuda.to_device(np.arange(5))
|
||||
|
||||
with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
|
||||
return_value=None) as mock_sync:
|
||||
darr.setitem(0, 10, stream=stream)
|
||||
|
||||
mock_sync.assert_not_called()
|
||||
|
||||
@unittest.skip('Requires PR #6367')
|
||||
def test_issue_6505(self):
|
||||
# On Windows, the writes to ary_v would not be visible prior to the
|
||||
# assertion, due to the assignment being done with a kernel launch that
|
||||
# returns asynchronously - there should now be a sync after the kernel
|
||||
# launch to ensure that the writes are always visible.
|
||||
ary = cuda.mapped_array(2, dtype=np.int32)
|
||||
ary[:] = 0
|
||||
|
||||
ary_v = ary.view('u1')
|
||||
ary_v[1] = 1
|
||||
ary_v[5] = 1
|
||||
self.assertEqual(sum(ary), 512)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,21 @@
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
|
||||
|
||||
class TestCudaAutoContext(CUDATestCase):
|
||||
def test_auto_context(self):
|
||||
"""A problem was revealed by a customer that the use cuda.to_device
|
||||
does not create a CUDA context.
|
||||
This tests the problem
|
||||
"""
|
||||
A = np.arange(10, dtype=np.float32)
|
||||
newA = np.empty_like(A)
|
||||
dA = cuda.to_device(A)
|
||||
|
||||
dA.copy_to_host(newA)
|
||||
self.assertTrue(np.allclose(A, newA))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,179 @@
|
||||
import numpy as np
|
||||
import ctypes
|
||||
from numba.cuda.cudadrv.devicearray import (DeviceRecord, from_record_like,
|
||||
auto_device)
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
from numba.np import numpy_support
|
||||
from numba import cuda
|
||||
|
||||
N_CHARS = 5
|
||||
|
||||
recordtype = np.dtype(
|
||||
[
|
||||
('a', np.float64),
|
||||
('b', np.int32),
|
||||
('c', np.complex64),
|
||||
('d', (np.str_, N_CHARS))
|
||||
],
|
||||
align=True
|
||||
)
|
||||
|
||||
recordwitharray = np.dtype(
|
||||
[
|
||||
('g', np.int32),
|
||||
('h', np.float32, 2)
|
||||
],
|
||||
align=True
|
||||
)
|
||||
|
||||
recwithmat = np.dtype([('i', np.int32),
|
||||
('j', np.float32, (3, 3))])
|
||||
|
||||
recwithrecwithmat = np.dtype([('x', np.int32), ('y', recwithmat)])
|
||||
|
||||
|
||||
@skip_on_cudasim('Device Record API unsupported in the simulator')
|
||||
class TestCudaDeviceRecord(CUDATestCase):
|
||||
"""
|
||||
Tests the DeviceRecord class with np.void host types.
|
||||
"""
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self._create_data(np.zeros)
|
||||
|
||||
def _create_data(self, array_ctor):
|
||||
self.dtype = np.dtype([('a', np.int32), ('b', np.float32)], align=True)
|
||||
self.hostz = array_ctor(1, self.dtype)[0]
|
||||
self.hostnz = array_ctor(1, self.dtype)[0]
|
||||
self.hostnz['a'] = 10
|
||||
self.hostnz['b'] = 11.0
|
||||
|
||||
def _check_device_record(self, reference, rec):
|
||||
self.assertEqual(rec.shape, tuple())
|
||||
self.assertEqual(rec.strides, tuple())
|
||||
self.assertEqual(rec.dtype, reference.dtype)
|
||||
self.assertEqual(rec.alloc_size, reference.dtype.itemsize)
|
||||
self.assertIsNotNone(rec.gpu_data)
|
||||
self.assertNotEqual(rec.device_ctypes_pointer, ctypes.c_void_p(0))
|
||||
|
||||
numba_type = numpy_support.from_dtype(reference.dtype)
|
||||
self.assertEqual(rec._numba_type_, numba_type)
|
||||
|
||||
def test_device_record_interface(self):
|
||||
hostrec = self.hostz.copy()
|
||||
devrec = DeviceRecord(self.dtype)
|
||||
self._check_device_record(hostrec, devrec)
|
||||
|
||||
def test_device_record_copy(self):
|
||||
hostrec = self.hostz.copy()
|
||||
devrec = DeviceRecord(self.dtype)
|
||||
devrec.copy_to_device(hostrec)
|
||||
|
||||
# Copy back and check values are all zeros
|
||||
hostrec2 = self.hostnz.copy()
|
||||
devrec.copy_to_host(hostrec2)
|
||||
np.testing.assert_equal(self.hostz, hostrec2)
|
||||
|
||||
# Copy non-zero values to GPU and back and check values
|
||||
hostrec3 = self.hostnz.copy()
|
||||
devrec.copy_to_device(hostrec3)
|
||||
|
||||
hostrec4 = self.hostz.copy()
|
||||
devrec.copy_to_host(hostrec4)
|
||||
np.testing.assert_equal(hostrec4, self.hostnz)
|
||||
|
||||
def test_from_record_like(self):
|
||||
# Create record from host record
|
||||
hostrec = self.hostz.copy()
|
||||
devrec = from_record_like(hostrec)
|
||||
self._check_device_record(hostrec, devrec)
|
||||
|
||||
# Create record from device record and check for distinct data
|
||||
devrec2 = from_record_like(devrec)
|
||||
self._check_device_record(devrec, devrec2)
|
||||
self.assertNotEqual(devrec.gpu_data, devrec2.gpu_data)
|
||||
|
||||
def test_auto_device(self):
|
||||
# Create record from host record
|
||||
hostrec = self.hostnz.copy()
|
||||
devrec, new_gpu_obj = auto_device(hostrec)
|
||||
self._check_device_record(hostrec, devrec)
|
||||
self.assertTrue(new_gpu_obj)
|
||||
|
||||
# Copy data back and check it is equal to auto_device arg
|
||||
hostrec2 = self.hostz.copy()
|
||||
devrec.copy_to_host(hostrec2)
|
||||
np.testing.assert_equal(hostrec2, hostrec)
|
||||
|
||||
|
||||
class TestCudaDeviceRecordWithRecord(TestCudaDeviceRecord):
|
||||
"""
|
||||
Tests the DeviceRecord class with np.record host types
|
||||
"""
|
||||
def setUp(self):
|
||||
CUDATestCase.setUp(self)
|
||||
self._create_data(np.recarray)
|
||||
|
||||
|
||||
@skip_on_cudasim('Structured array attr access not supported in simulator')
|
||||
class TestRecordDtypeWithStructArrays(CUDATestCase):
|
||||
'''
|
||||
Test operation of device arrays on structured arrays.
|
||||
'''
|
||||
|
||||
def _createSampleArrays(self):
|
||||
self.sample1d = cuda.device_array(3, dtype=recordtype)
|
||||
self.samplerec1darr = cuda.device_array(1, dtype=recordwitharray)[0]
|
||||
self.samplerecmat = cuda.device_array(1,dtype=recwithmat)[0]
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self._createSampleArrays()
|
||||
|
||||
ary = self.sample1d
|
||||
for i in range(ary.size):
|
||||
x = i + 1
|
||||
ary[i]['a'] = x / 2
|
||||
ary[i]['b'] = x
|
||||
ary[i]['c'] = x * 1j
|
||||
ary[i]['d'] = str(x) * N_CHARS
|
||||
|
||||
def test_structured_array1(self):
|
||||
ary = self.sample1d
|
||||
for i in range(self.sample1d.size):
|
||||
x = i + 1
|
||||
self.assertEqual(ary[i]['a'], x / 2)
|
||||
self.assertEqual(ary[i]['b'], x)
|
||||
self.assertEqual(ary[i]['c'], x * 1j)
|
||||
self.assertEqual(ary[i]['d'], str(x) * N_CHARS)
|
||||
|
||||
def test_structured_array2(self):
|
||||
ary = self.samplerec1darr
|
||||
ary['g'] = 2
|
||||
ary['h'][0] = 3.0
|
||||
ary['h'][1] = 4.0
|
||||
self.assertEqual(ary['g'], 2)
|
||||
self.assertEqual(ary['h'][0], 3.0)
|
||||
self.assertEqual(ary['h'][1], 4.0)
|
||||
|
||||
def test_structured_array3(self):
|
||||
ary = self.samplerecmat
|
||||
mat = np.array([[5.0, 10.0, 15.0],
|
||||
[20.0, 25.0, 30.0],
|
||||
[35.0, 40.0, 45.0]],
|
||||
dtype=np.float32).reshape(3,3)
|
||||
ary['j'][:] = mat
|
||||
np.testing.assert_equal(ary['j'], mat)
|
||||
|
||||
def test_structured_array4(self):
|
||||
arr = np.zeros(1, dtype=recwithrecwithmat)
|
||||
d_arr = cuda.to_device(arr)
|
||||
d_arr[0]['y']['i'] = 1
|
||||
self.assertEqual(d_arr[0]['y']['i'], 1)
|
||||
d_arr[0]['y']['j'][0, 0] = 2.0
|
||||
self.assertEqual(d_arr[0]['y']['j'][0, 0], 2.0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,235 @@
|
||||
from ctypes import byref, c_int, c_void_p, sizeof
|
||||
|
||||
from numba.cuda.cudadrv.driver import (host_to_device, device_to_host, driver,
|
||||
launch_kernel)
|
||||
from numba.cuda.cudadrv import devices, drvapi, driver as _driver
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
|
||||
|
||||
# Hand-written PTX kernels used by TestCudaDriver below.  Both implement
# _Z10helloworldPi (the C++-mangled name of "helloworld(int*)"), which
# stores each thread's x-index into the output array at that index.
# ptx1 targets pre-Fermi devices (PTX ISA 1.4, sm_10).
ptx1 = '''
.version 1.4
.target sm_10, map_f64_to_f32

.entry _Z10helloworldPi (
.param .u64 __cudaparm__Z10helloworldPi_A)
{
.reg .u32 %r<3>;
.reg .u64 %rd<6>;
.loc 14 4 0
$LDWbegin__Z10helloworldPi:
.loc 14 6 0
cvt.s32.u16 %r1, %tid.x;
ld.param.u64 %rd1, [__cudaparm__Z10helloworldPi_A];
cvt.u64.u16 %rd2, %tid.x;
mul.lo.u64 %rd3, %rd2, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.s32 [%rd4+0], %r1;
.loc 14 7 0
exit;
$LDWend__Z10helloworldPi:
} // _Z10helloworldPi
'''

# Same kernel, targeting Fermi and later (PTX ISA 3.0, sm_20, 64-bit).
ptx2 = '''
.version 3.0
.target sm_20
.address_size 64

.file 1 "/tmp/tmpxft_000012c7_00000000-9_testcuda.cpp3.i"
.file 2 "testcuda.cu"

.entry _Z10helloworldPi(
.param .u64 _Z10helloworldPi_param_0
)
{
.reg .s32 %r<3>;
.reg .s64 %rl<5>;


ld.param.u64 %rl1, [_Z10helloworldPi_param_0];
cvta.to.global.u64 %rl2, %rl1;
.loc 2 6 1
mov.u32 %r1, %tid.x;
mul.wide.u32 %rl3, %r1, 4;
add.s64 %rl4, %rl2, %rl3;
st.global.u32 [%rl4], %r1;
.loc 2 7 2
ret;
}
'''
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestCudaDriver(CUDATestCase):
    # Exercises the low-level CUDA Driver API wrappers: PTX module loading,
    # kernel launch, stream creation and occupancy queries.

    def setUp(self):
        super().setUp()
        self.assertTrue(len(devices.gpus) > 0)
        self.context = devices.get_context()
        device = self.context.device
        ccmajor, _ = device.compute_capability
        # Choose the PTX source matching the device: sm_20+ uses the PTX 3.0
        # kernel, older devices the PTX 1.4 one (see ptx1/ptx2 above).
        if ccmajor >= 2:
            self.ptx = ptx2
        else:
            self.ptx = ptx1

    def tearDown(self):
        super().tearDown()
        del self.context

    def test_cuda_driver_basic(self):
        # Load the PTX, launch the kernel on the default stream, and check
        # that each output element equals its thread index.
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        memory = self.context.memalloc(sizeof(array))
        host_to_device(memory, array, sizeof(array))

        ptr = memory.device_ctypes_pointer
        stream = 0

        # The NVIDIA binding expects its own pointer/stream wrapper types.
        if _driver.USE_NV_BINDING:
            ptr = c_void_p(int(ptr))
            stream = _driver.binding.CUstream(stream)

        launch_kernel(function.handle,  # Kernel
                      1,   1, 1,        # gx, gy, gz
                      100, 1, 1,        # bx, by, bz
                      0,                # dynamic shared mem
                      stream,           # stream
                      [ptr])            # arguments

        device_to_host(array, memory, sizeof(array))
        for i, v in enumerate(array):
            self.assertEqual(i, v)

        module.unload()

    def test_cuda_driver_stream_operations(self):
        # Same round-trip as test_cuda_driver_basic, but queued on an
        # explicit stream that is synchronized on context-manager exit.
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        array = (c_int * 100)()

        stream = self.context.create_stream()

        with stream.auto_synchronize():
            memory = self.context.memalloc(sizeof(array))
            host_to_device(memory, array, sizeof(array), stream=stream)

            ptr = memory.device_ctypes_pointer
            if _driver.USE_NV_BINDING:
                ptr = c_void_p(int(ptr))

            launch_kernel(function.handle,  # Kernel
                          1,   1, 1,        # gx, gy, gz
                          100, 1, 1,        # bx, by, bz
                          0,                # dynamic shared mem
                          stream.handle,    # stream
                          [ptr])            # arguments

            device_to_host(array, memory, sizeof(array), stream=stream)

        for i, v in enumerate(array):
            self.assertEqual(i, v)

    def test_cuda_driver_default_stream(self):
        # Test properties of the default stream
        ds = self.context.get_default_stream()
        self.assertIn("Default CUDA stream", repr(ds))
        self.assertEqual(0, int(ds))
        # bool(stream) is the check that is done in memcpy to decide if async
        # version should be used. So the default (0) stream should be true-ish
        # even though 0 is usually false-ish in Python.
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_legacy_default_stream(self):
        # Test properties of the legacy default stream
        ds = self.context.get_legacy_default_stream()
        self.assertIn("Legacy default CUDA stream", repr(ds))
        self.assertEqual(1, int(ds))
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_per_thread_default_stream(self):
        # Test properties of the per-thread default stream
        ds = self.context.get_per_thread_default_stream()
        self.assertIn("Per-thread default CUDA stream", repr(ds))
        self.assertEqual(2, int(ds))
        self.assertTrue(ds)
        self.assertFalse(ds.external)

    def test_cuda_driver_stream(self):
        # Test properties of non-default streams
        s = self.context.create_stream()
        self.assertIn("CUDA stream", repr(s))
        self.assertNotIn("Default", repr(s))
        self.assertNotIn("External", repr(s))
        self.assertNotEqual(0, int(s))
        self.assertTrue(s)
        self.assertFalse(s.external)

    def test_cuda_driver_external_stream(self):
        # Test properties of a stream created from an external stream object.
        # We use the driver API directly to create a stream, to emulate an
        # external library creating a stream
        if _driver.USE_NV_BINDING:
            handle = driver.cuStreamCreate(0)
            ptr = int(handle)
        else:
            handle = drvapi.cu_stream()
            driver.cuStreamCreate(byref(handle), 0)
            ptr = handle.value
        s = self.context.create_external_stream(ptr)

        self.assertIn("External CUDA stream", repr(s))
        # Ensure neither "Default" nor "default"
        self.assertNotIn("efault", repr(s))
        self.assertEqual(ptr, int(s))
        self.assertTrue(s)
        self.assertTrue(s.external)

    def test_cuda_driver_occupancy(self):
        # Occupancy queries must return positive block/grid suggestions.
        module = self.context.create_module_ptx(self.ptx)
        function = module.get_function('_Z10helloworldPi')

        value = self.context.get_active_blocks_per_multiprocessor(function,
                                                                  128, 128)
        self.assertTrue(value > 0)

        # Identity block-size-to-dynamic-smem function for the query below.
        def b2d(bs):
            return bs

        grid, block = self.context.get_max_potential_block_size(function, b2d,
                                                                128, 128)
        self.assertTrue(grid > 0)
        self.assertTrue(block > 0)
|
||||
|
||||
|
||||
class TestDevice(CUDATestCase):
    def test_device_get_uuid(self):
        """Check that the device UUID has the expected textual form.

        A device UUID looks like::

            GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643

        Device UUIDs may not conform to the parts of the UUID specification
        (RFC 4122) pertaining to versions and variants, so those bits are not
        extracted or validated here - only the overall shape is checked.
        """
        def hex_field(width):
            # A run of `width` lowercase hex digits.
            return '[0-9a-f]{%d}' % width

        # Group widths of the GPU-8-4-4-4-12 form, joined by dashes.
        groups = [hex_field(width) for width in (8, 4, 4, 4, 12)]
        uuid_format = '^GPU-' + '-'.join(groups) + '$'

        dev = devices.get_context().device
        self.assertRegex(dev.uuid, uuid_format)


if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,22 @@
|
||||
from numba.cuda.testing import unittest
|
||||
from numba.cuda.testing import skip_on_cudasim, skip_unless_conda_cudatoolkit
|
||||
from numba.misc.findlib import find_lib
|
||||
|
||||
|
||||
@skip_on_cudasim('Library detection unsupported in the simulator')
@skip_unless_conda_cudatoolkit
class TestLibraryDetection(unittest.TestCase):
    def test_detect(self):
        """
        This test is solely present to ensure that shipped cudatoolkits have
        additional core libraries in locations that Numba scans by default.
        PyCulib (and potentially others) rely on Numba's library finding
        capacity to find and subsequently load these libraries.
        """
        # Each core library must be found in at least one location.
        for libname in ('nvvm',):
            self.assertNotEqual(find_lib(libname), [])


if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,193 @@
|
||||
import ctypes
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numba.cuda.cudadrv import driver, drvapi, devices
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Memory API unsupported in the simulator')
class TestCudaMemory(ContextResettingTestCase):
    """Tests for device memory allocation, views, pinning and finalizers."""

    def setUp(self):
        super().setUp()
        self.context = devices.get_context()

    def tearDown(self):
        del self.context
        # Zero-argument super() for consistency with setUp (Python 3 style).
        super().tearDown()

    def _template(self, obj):
        # Any device-memory-like object must be recognised by the driver
        # helpers and expose a pointer of the binding-appropriate class.
        self.assertTrue(driver.is_device_memory(obj))
        driver.require_device_memory(obj)
        if driver.USE_NV_BINDING:
            expected_class = driver.binding.CUdeviceptr
        else:
            expected_class = drvapi.cu_device_ptr
        # assertIsInstance gives a clearer failure message than
        # assertTrue(isinstance(...)).
        self.assertIsInstance(obj.device_ctypes_pointer, expected_class)

    def test_device_memory(self):
        devmem = self.context.memalloc(1024)
        self._template(devmem)

    def test_device_view(self):
        devmem = self.context.memalloc(1024)
        self._template(devmem.view(10))

    def test_host_alloc(self):
        devmem = self.context.memhostalloc(1024, mapped=True)
        self._template(devmem)

    def test_pinned_memory(self):
        ary = np.arange(10)
        devmem = self.context.mempin(ary, ary.ctypes.data,
                                     ary.size * ary.dtype.itemsize,
                                     mapped=True)
        self._template(devmem)

    def test_managed_memory(self):
        devmem = self.context.memallocmanaged(1024)
        self._template(devmem)

    def test_derived_pointer(self):
        # Use MemoryPointer.view to create derived pointer

        def handle_val(mem):
            if driver.USE_NV_BINDING:
                return int(mem.handle)
            else:
                return mem.handle.value

        def check(m, offset):
            # create view
            v1 = m.view(offset)
            self.assertEqual(handle_val(v1.owner), handle_val(m))
            self.assertEqual(m.refct, 2)
            self.assertEqual(handle_val(v1) - offset, handle_val(v1.owner))
            # create a view of the view; the owner is still the root pointer
            # and the root's refcount tracks both views.
            v2 = v1.view(offset)
            self.assertEqual(handle_val(v2.owner), handle_val(m))
            self.assertEqual(handle_val(v2.owner), handle_val(m))
            self.assertEqual(handle_val(v2) - offset * 2,
                             handle_val(v2.owner))
            self.assertEqual(m.refct, 3)
            del v2
            self.assertEqual(m.refct, 2)
            del v1
            self.assertEqual(m.refct, 1)

        m = self.context.memalloc(1024)
        check(m=m, offset=0)
        check(m=m, offset=1)

    def test_user_extension(self):
        # User can use MemoryPointer to wrap externally defined pointers.
        # This test checks if the finalizer is invoked at correct time
        fake_ptr = ctypes.c_void_p(0xdeadbeef)
        dtor_invoked = [0]

        def dtor():
            dtor_invoked[0] += 1

        # Ensure finalizer is called when pointer is deleted
        ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr,
                                   size=40, finalizer=dtor)
        self.assertEqual(dtor_invoked[0], 0)
        del ptr
        self.assertEqual(dtor_invoked[0], 1)

        # Ensure removing derived pointer doesn't call finalizer
        ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr,
                                   size=40, finalizer=dtor)
        owned = ptr.own()
        del owned
        self.assertEqual(dtor_invoked[0], 1)
        del ptr
        self.assertEqual(dtor_invoked[0], 2)
|
||||
|
||||
|
||||
class TestCudaMemoryFunctions(ContextResettingTestCase):
    """Tests for the raw memcpy / memset driver helpers."""

    def setUp(self):
        super().setUp()
        self.context = devices.get_context()

    def tearDown(self):
        del self.context
        # Zero-argument super() for consistency with setUp (Python 3 style).
        super().tearDown()

    def test_memcpy(self):
        # Host -> device -> host round-trip preserves the data.
        hstary = np.arange(100, dtype=np.uint32)
        hstary2 = np.arange(100, dtype=np.uint32)
        sz = hstary.size * hstary.dtype.itemsize
        devary = self.context.memalloc(sz)

        driver.host_to_device(devary, hstary, sz)
        driver.device_to_host(hstary2, devary, sz)

        # assert_array_equal reports the first mismatching element on
        # failure, unlike assertTrue(np.all(...)).
        np.testing.assert_array_equal(hstary, hstary2)

    def test_memset(self):
        # Setting every byte to 0xab must yield uint32 words of 0xabababab.
        dtype = np.dtype('uint32')
        n = 10
        sz = dtype.itemsize * 10
        devary = self.context.memalloc(sz)
        driver.device_memset(devary, 0xab, sz)

        hstary = np.empty(n, dtype=dtype)
        driver.device_to_host(hstary, devary, sz)

        hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32'))
        np.testing.assert_array_equal(hstary, hstary2)

    def test_d2d(self):
        # Device-to-device copy, validated through a host round-trip.
        hst = np.arange(100, dtype=np.uint32)
        hst2 = np.empty_like(hst)
        sz = hst.size * hst.dtype.itemsize
        dev1 = self.context.memalloc(sz)
        dev2 = self.context.memalloc(sz)
        driver.host_to_device(dev1, hst, sz)
        driver.device_to_device(dev2, dev1, sz)
        driver.device_to_host(hst2, dev2, sz)
        np.testing.assert_array_equal(hst, hst2)
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Memory API unsupported in the simulator')
class TestMVExtent(ContextResettingTestCase):
    """Tests for host memory extent and size computation."""

    def test_c_contiguous_array(self):
        ary = np.arange(100)
        arysz = ary.dtype.itemsize * ary.size
        # Only the start of the extent is checked; the end pointer is unused.
        s, _ = driver.host_memory_extents(ary)
        # assertEqual gives better diagnostics than assertTrue(a == b).
        self.assertEqual(ary.ctypes.data, s)
        self.assertEqual(arysz, driver.host_memory_size(ary))

    def test_f_contiguous_array(self):
        ary = np.asfortranarray(np.arange(100).reshape(2, 50))
        arysz = ary.dtype.itemsize * np.prod(ary.shape)
        s, _ = driver.host_memory_extents(ary)
        self.assertEqual(ary.ctypes.data, s)
        self.assertEqual(arysz, driver.host_memory_size(ary))

    def test_single_element_array(self):
        # A 0-d array still has a well-defined extent of one item.
        ary = np.asarray(np.uint32(1234))
        arysz = ary.dtype.itemsize
        s, _ = driver.host_memory_extents(ary)
        self.assertEqual(ary.ctypes.data, s)
        self.assertEqual(arysz, driver.host_memory_size(ary))

    def test_ctypes_struct(self):
        class mystruct(ctypes.Structure):
            _fields_ = [('x', ctypes.c_int), ('y', ctypes.c_int)]

        data = mystruct(x=123, y=432)
        sz = driver.host_memory_size(data)
        self.assertEqual(ctypes.sizeof(data), sz)

    def test_ctypes_double(self):
        data = ctypes.c_double(1.234)
        sz = driver.host_memory_size(data)
        self.assertEqual(ctypes.sizeof(data), sz)


if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,547 @@
|
||||
import itertools
|
||||
import numpy as np
|
||||
from numba.cuda.cudadrv import devicearray
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
|
||||
|
||||
class TestCudaNDArray(CUDATestCase):
    # Tests for DeviceNDArray behaviour: construction, host/device copies,
    # views, transposition, contiguity handling and Numba typing.

    def test_device_array_interface(self):
        dary = cuda.device_array(shape=100)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.empty(100)
        dary = cuda.to_device(ary)
        devicearray.verify_cuda_ndarray_interface(dary)

        # 0-d arrays must also expose a valid interface.
        ary = np.asarray(1.234)
        dary = cuda.to_device(ary)
        self.assertEqual(dary.ndim, 0)
        devicearray.verify_cuda_ndarray_interface(dary)

    def test_device_array_from_readonly(self):
        ary = np.arange(100, dtype=np.float32)
        # Make the array readonly
        ary.flags.writeable = False
        self.assertFalse(ary.flags.writeable)
        # Ensure that we can copy the readonly array
        dary = cuda.to_device(ary)
        retr = dary.copy_to_host()
        np.testing.assert_array_equal(retr, ary)

    def test_devicearray_dtype(self):
        dary = cuda.device_array(shape=(100,), dtype="f4")
        self.assertEqual(dary.dtype, np.dtype("f4"))

    def test_devicearray_no_copy(self):
        array = np.arange(100, dtype=np.float32)
        cuda.to_device(array, copy=False)

    def test_devicearray_shape(self):
        ary = np.arange(2 * 3 * 4).reshape(2, 3, 4)
        dary = cuda.to_device(ary)
        self.assertEqual(ary.shape, dary.shape)
        self.assertEqual(ary.shape[1:], dary.shape[1:])

    def test_devicearray(self):
        # Round-trip: data copied to the device survives clobbering the host.
        array = np.arange(100, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        array[:] = 0
        gpumem.copy_to_host(array)

        np.testing.assert_array_equal(array, original)

    def test_stream_bind(self):
        stream = cuda.stream()
        with stream.auto_synchronize():
            arr = cuda.device_array(
                (3, 3),
                dtype=np.float64,
                stream=stream)
            self.assertEqual(arr.bind(stream).stream, stream)
            self.assertEqual(arr.stream, stream)

    def test_len_1d(self):
        ary = np.empty((3,))
        dary = cuda.device_array(3)
        self.assertEqual(len(ary), len(dary))

    def test_len_2d(self):
        ary = np.empty((3, 5))
        dary = cuda.device_array((3, 5))
        self.assertEqual(len(ary), len(dary))

    def test_len_3d(self):
        ary = np.empty((3, 5, 7))
        dary = cuda.device_array((3, 5, 7))
        self.assertEqual(len(ary), len(dary))

    def test_devicearray_partition(self):
        # split() halves the device array; copying each half back must
        # reconstruct the original contents.
        N = 100
        array = np.arange(N, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        left, right = gpumem.split(N // 2)

        array[:] = 0

        self.assertTrue(np.all(array == 0))

        right.copy_to_host(array[N // 2:])
        left.copy_to_host(array[:N // 2])

        self.assertTrue(np.all(array == original))

    def test_devicearray_replace(self):
        # to_device(..., to=existing) overwrites the existing device array.
        N = 100
        array = np.arange(N, dtype=np.int32)
        original = array.copy()
        gpumem = cuda.to_device(array)
        cuda.to_device(array * 2, to=gpumem)
        gpumem.copy_to_host(array)
        np.testing.assert_array_equal(array, original * 2)

    @skip_on_cudasim('This works in the simulator')
    def test_devicearray_transpose_wrongdim(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4, 1))

        with self.assertRaises(NotImplementedError) as e:
            np.transpose(gpumem)

        self.assertEqual(
            "transposing a non-2D DeviceNDArray isn't supported",
            str(e.exception))

    def test_devicearray_transpose_identity(self):
        # any-shape identities should work
        original = np.array(np.arange(24)).reshape(3, 4, 2)
        array = np.transpose(cuda.to_device(original),
                             axes=(0, 1, 2)).copy_to_host()
        self.assertTrue(np.all(array == original))

    def test_devicearray_transpose_duplicatedaxis(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4))

        with self.assertRaises(ValueError) as e:
            np.transpose(gpumem, axes=(0, 0))

        # The error message differs between the GPU and the simulator.
        self.assertIn(
            str(e.exception),
            container=[
                'invalid axes list (0, 0)',  # GPU
                'repeated axis in transpose',  # sim
            ])

    def test_devicearray_transpose_wrongaxis(self):
        gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4))

        with self.assertRaises(ValueError) as e:
            np.transpose(gpumem, axes=(0, 2))

        # The error message differs between the GPU and the simulator.
        self.assertIn(
            str(e.exception),
            container=[
                'invalid axes list (0, 2)',  # GPU
                'invalid axis for this array',
                'axis 2 is out of bounds for array of dimension 2',  # sim
            ])

    def test_devicearray_view_ok(self):
        original = np.array(np.arange(12), dtype="i2").reshape(3, 4)
        array = cuda.to_device(original)
        for dtype in ("i4", "u4", "i8", "f8"):
            with self.subTest(dtype=dtype):
                np.testing.assert_array_equal(
                    array.view(dtype).copy_to_host(),
                    original.view(dtype)
                )

    def test_devicearray_view_ok_not_c_contig(self):
        # Same-itemsize views are permitted even on non-contiguous arrays.
        original = np.array(np.arange(32), dtype="i2").reshape(4, 8)
        array = cuda.to_device(original)[:, ::2]
        original = original[:, ::2]
        np.testing.assert_array_equal(
            array.view("u2").copy_to_host(),
            original.view("u2")
        )

    def test_devicearray_view_bad_not_c_contig(self):
        original = np.array(np.arange(32), dtype="i2").reshape(4, 8)
        array = cuda.to_device(original)[:, ::2]
        with self.assertRaises(ValueError) as e:
            array.view("i4")

        msg = str(e.exception)
        self.assertIn('To change to a dtype of a different size,', msg)

        # NumPy's wording changed in 1.23; accept either message form.
        contiguous_pre_np123 = 'the array must be C-contiguous' in msg
        contiguous_post_np123 = 'the last axis must be contiguous' in msg
        self.assertTrue(contiguous_pre_np123 or contiguous_post_np123,
                        'Expected message to mention contiguity')

    def test_devicearray_view_bad_itemsize(self):
        original = np.array(np.arange(12), dtype="i2").reshape(4, 3)
        array = cuda.to_device(original)
        with self.assertRaises(ValueError) as e:
            array.view("i4")
        self.assertEqual(
            "When changing to a larger dtype,"
            " its size must be a divisor of the total size in bytes"
            " of the last axis of the array.",
            str(e.exception))

    def test_devicearray_transpose_ok(self):
        original = np.array(np.arange(12)).reshape(3, 4)
        array = np.transpose(cuda.to_device(original)).copy_to_host()
        self.assertTrue(np.all(array == original.T))

    def test_devicearray_transpose_T(self):
        original = np.array(np.arange(12)).reshape(3, 4)
        array = cuda.to_device(original).T.copy_to_host()
        self.assertTrue(np.all(array == original.T))

    def test_devicearray_contiguous_slice(self):
        # memcpys are dumb ranges of bytes, so trying to
        # copy to a non-contiguous range shouldn't work!
        a = np.arange(25).reshape(5, 5, order='F')
        s = np.full(fill_value=5, shape=(5,))

        d = cuda.to_device(a)
        a[2] = s

        # d is in F-order (not C-order), so d[2] is not contiguous
        # (40-byte strides). This means we can't memcpy to it!
        with self.assertRaises(ValueError) as e:
            d[2].copy_to_device(s)
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))

        # if d[2].copy_to_device(s), then this would pass:
        # self.assertTrue((a == d.copy_to_host()).all())

    def _test_devicearray_contiguous_host_copy(self, a_c, a_f):
        """
        Checks host->device memcpys
        """
        self.assertTrue(a_c.flags.c_contiguous)
        self.assertTrue(a_f.flags.f_contiguous)

        # Every combination of C/F source order and C/F copy order must work.
        for original, copy in [
            (a_f, a_f),
            (a_f, a_c),
            (a_c, a_f),
            (a_c, a_c),
        ]:
            msg = '%s => %s' % (
                'C' if original.flags.c_contiguous else 'F',
                'C' if copy.flags.c_contiguous else 'F',
            )

            d = cuda.to_device(original)
            d.copy_to_device(copy)
            self.assertTrue(np.all(d.copy_to_host() == a_c), msg=msg)
            self.assertTrue(np.all(d.copy_to_host() == a_f), msg=msg)

    def test_devicearray_contiguous_copy_host_3d(self):
        a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5)
        a_f = np.array(a_c, order='F')
        self._test_devicearray_contiguous_host_copy(a_c, a_f)

    def test_devicearray_contiguous_copy_host_1d(self):
        a_c = np.arange(5)
        a_f = np.array(a_c, order='F')
        self._test_devicearray_contiguous_host_copy(a_c, a_f)

    def test_devicearray_contiguous_copy_device(self):
        # Device-to-device copies between arrays of differing order must be
        # rejected; matching order must succeed.
        a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5)
        a_f = np.array(a_c, order='F')
        self.assertTrue(a_c.flags.c_contiguous)
        self.assertTrue(a_f.flags.f_contiguous)

        d = cuda.to_device(a_c)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(a_f))
        self.assertEqual(
            "incompatible strides: {} vs. {}".format(a_c.strides, a_f.strides),
            str(e.exception))

        d.copy_to_device(cuda.to_device(a_c))
        self.assertTrue(np.all(d.copy_to_host() == a_c))

        d = cuda.to_device(a_f)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(a_c))
        self.assertEqual(
            "incompatible strides: {} vs. {}".format(a_f.strides, a_c.strides),
            str(e.exception))

        d.copy_to_device(cuda.to_device(a_f))
        self.assertTrue(np.all(d.copy_to_host() == a_f))

    def test_devicearray_broadcast_host_copy(self):
        # Broadcast (zero-stride) arrays must copy correctly both ways, in
        # every position of the inserted broadcast axis.
        broadsize = 4
        coreshape = (2, 3)
        coresize = np.prod(coreshape)
        core_c = np.arange(coresize).reshape(coreshape, order='C')
        core_f = np.arange(coresize).reshape(coreshape, order='F')
        for dim in range(len(coreshape)):
            newindex = (slice(None),) * dim + (np.newaxis,)
            broadshape = coreshape[:dim] + (broadsize,) + coreshape[dim:]
            broad_c = np.broadcast_to(core_c[newindex], broadshape)
            broad_f = np.broadcast_to(core_f[newindex], broadshape)
            dbroad_c = cuda.to_device(broad_c)
            dbroad_f = cuda.to_device(broad_f)
            np.testing.assert_array_equal(dbroad_c.copy_to_host(), broad_c)
            np.testing.assert_array_equal(dbroad_f.copy_to_host(), broad_f)
            # Also test copying across different core orderings
            dbroad_c.copy_to_device(broad_f)
            dbroad_f.copy_to_device(broad_c)
            np.testing.assert_array_equal(dbroad_c.copy_to_host(), broad_f)
            np.testing.assert_array_equal(dbroad_f.copy_to_host(), broad_c)

    def test_devicearray_contiguous_host_strided(self):
        # A strided host source is acceptable: it is gathered before copying.
        a_c = np.arange(10)
        d = cuda.to_device(a_c)
        arr = np.arange(20)[::2]
        d.copy_to_device(arr)
        np.testing.assert_array_equal(d.copy_to_host(), arr)

    def test_devicearray_contiguous_device_strided(self):
        # A strided *device* source cannot be gathered, so it must raise.
        d = cuda.to_device(np.arange(20))
        arr = np.arange(20)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(arr)[::2])
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))

    @skip_on_cudasim('DeviceNDArray class not present in simulator')
    def test_devicearray_relaxed_strides(self):
        # From the reproducer in Issue #6824.

        # Construct a device array that is contiguous even though
        # the strides for the first axis (800) are not equal to
        # the strides * size (10 * 8 = 80) for the previous axis,
        # because the first axis size is 1.
        arr = devicearray.DeviceNDArray((1, 10), (800, 8), np.float64)

        # Ensure we still believe the array to be contiguous because
        # strides checking is relaxed.
        self.assertTrue(arr.flags['C_CONTIGUOUS'])
        self.assertTrue(arr.flags['F_CONTIGUOUS'])

    def test_c_f_contiguity_matches_numpy(self):
        # From the reproducer in Issue #4943.

        shapes = ((1, 4), (4, 1))
        orders = ('C', 'F')

        for shape, order in itertools.product(shapes, orders):
            arr = np.ndarray(shape, order=order)
            d_arr = cuda.to_device(arr)
            self.assertEqual(arr.flags['C_CONTIGUOUS'],
                             d_arr.flags['C_CONTIGUOUS'])
            self.assertEqual(arr.flags['F_CONTIGUOUS'],
                             d_arr.flags['F_CONTIGUOUS'])

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_simple_c(self):
        # C-order 1D array
        a = np.zeros(10, order='C')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_simple_f(self):
        # F-order array that is also C layout.
        a = np.zeros(10, order='F')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_2d_c(self):
        # C-order 2D array
        a = np.zeros((2, 10), order='C')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_2d_f(self):
        # F-order array that can only be F layout
        a = np.zeros((2, 10), order='F')
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'F')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_noncontig_slice_c(self):
        # Non-contiguous slice of C-order array
        a = np.zeros((5, 5), order='C')
        d = cuda.to_device(a)[:,2]
        self.assertEqual(d._numba_type_.layout, 'A')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_noncontig_slice_f(self):
        # Non-contiguous slice of F-order array
        a = np.zeros((5, 5), order='F')
        d = cuda.to_device(a)[2,:]
        self.assertEqual(d._numba_type_.layout, 'A')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_contig_slice_c(self):
        # Contiguous slice of C-order array
        a = np.zeros((5, 5), order='C')
        d = cuda.to_device(a)[2,:]
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_contig_slice_f(self):
        # Contiguous slice of F-order array - is both C- and F-contiguous, so
        # types as 'C' layout
        a = np.zeros((5, 5), order='F')
        d = cuda.to_device(a)[:,2]
        self.assertEqual(d._numba_type_.layout, 'C')

    @skip_on_cudasim('Typing not done in the simulator')
    def test_devicearray_typing_order_broadcasted(self):
        # Broadcasted array, similar to that used for passing scalars to ufuncs
        a = np.broadcast_to(np.array([1]), (10,))
        d = cuda.to_device(a)
        self.assertEqual(d._numba_type_.layout, 'A')

    def test_bug6697(self):
        # np.asarray on a device array must preserve the dtype.
        ary = np.arange(10, dtype=np.int16)
        dary = cuda.to_device(ary)
        got = np.asarray(dary)
        self.assertEqual(got.dtype, dary.dtype)

    @skip_on_cudasim('DeviceNDArray class not present in simulator')
    def test_issue_8477(self):
        # Ensure that we can copy a zero-length device array to a zero-length
        # host array when the strides of the device and host arrays differ -
        # this should be possible because the strides are irrelevant when the
        # length is zero. For more info see
        # https://github.com/numba/numba/issues/8477.

        # Create a device array with shape (0,) and strides (8,)
        dev_array = devicearray.DeviceNDArray(shape=(0,), strides=(8,),
                                              dtype=np.int8)

        # Create a host array with shape (0,) and strides (0,)
        host_array = np.ndarray(shape=(0,), strides=(0,), dtype=np.int8)

        # Sanity check for this test - ensure our destination has the strides
        # we expect, because strides can be ignored in some cases by the
        # ndarray constructor - checking here ensures that we haven't failed to
        # account for unexpected behaviour across different versions of NumPy
        self.assertEqual(host_array.strides, (0,))

        # Ensure that the copy succeeds in both directions
        dev_array.copy_to_host(host_array)
        dev_array.copy_to_device(host_array)

        # Ensure that a device-to-device copy also succeeds when the strides
        # differ - one way of doing this is to copy the host array across and
        # use that for copies in both directions.
        dev_array_from_host = cuda.to_device(host_array)
        self.assertEqual(dev_array_from_host.shape, (0,))
        self.assertEqual(dev_array_from_host.strides, (0,))

        dev_array.copy_to_device(dev_array_from_host)
        dev_array_from_host.copy_to_device(dev_array)
|
||||
|
||||
|
||||
class TestRecarray(CUDATestCase):
    """Record arrays can be passed to a kernel and their fields read."""

    def test_recarray(self):
        # From issue #4111
        a = np.recarray((16,), dtype=[
            ("value1", np.int64),
            ("value2", np.float64),
        ])
        a.value1 = np.arange(a.size, dtype=np.int64)
        a.value2 = np.arange(a.size, dtype=np.float64) / 100

        expect1 = a.value1
        expect2 = a.value2

        # Kernel: each thread copies both fields of its record into two
        # separate output arrays.
        def test(x, out1, out2):
            i = cuda.grid(1)
            if i < x.size:
                out1[i] = x.value1[i]
                out2[i] = x.value2[i]

        got1 = np.zeros_like(expect1)
        got2 = np.zeros_like(expect2)
        # One block with one thread per record.
        cuda.jit(test)[1, a.size](a, got1, got2)

        np.testing.assert_array_equal(expect1, got1)
        np.testing.assert_array_equal(expect2, got2)
|
||||
|
||||
|
||||
class TestCoreContiguous(CUDATestCase):
    """devicearray.is_contiguous must agree with the C_CONTIGUOUS flag of
    devicearray.array_core for a range of layouts, transposes and views."""

    def _test_against_array_core(self, view):
        # Single shared assertion used by every test below.
        self.assertEqual(
            devicearray.is_contiguous(view),
            devicearray.array_core(view).flags['C_CONTIGUOUS']
        )

    def test_device_array_like_1d(self):
        d_a = cuda.device_array(10, order='C')
        self._test_against_array_core(d_a)

    def test_device_array_like_2d(self):
        d_a = cuda.device_array((10, 12), order='C')
        self._test_against_array_core(d_a)

    def test_device_array_like_2d_transpose(self):
        d_a = cuda.device_array((10, 12), order='C')
        self._test_against_array_core(d_a.T)

    def test_device_array_like_3d(self):
        d_a = cuda.device_array((10, 12, 14), order='C')
        self._test_against_array_core(d_a)

    def test_device_array_like_1d_f(self):
        d_a = cuda.device_array(10, order='F')
        self._test_against_array_core(d_a)

    def test_device_array_like_2d_f(self):
        d_a = cuda.device_array((10, 12), order='F')
        self._test_against_array_core(d_a)

    def test_device_array_like_2d_f_transpose(self):
        d_a = cuda.device_array((10, 12), order='F')
        self._test_against_array_core(d_a.T)

    def test_device_array_like_3d_f(self):
        d_a = cuda.device_array((10, 12, 14), order='F')
        self._test_against_array_core(d_a)

    def test_1d_view(self):
        # Strided (non-contiguous) host views are also covered.
        shape = 10
        view = np.zeros(shape)[::2]
        self._test_against_array_core(view)

    def test_1d_view_f(self):
        shape = 10
        view = np.zeros(shape, order='F')[::2]
        self._test_against_array_core(view)

    def test_2d_view(self):
        shape = (10, 12)
        view = np.zeros(shape)[::2, ::2]
        self._test_against_array_core(view)

    def test_2d_view_f(self):
        shape = (10, 12)
        view = np.zeros(shape, order='F')[::2, ::2]
        self._test_against_array_core(view)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,249 @@
|
||||
from contextlib import contextmanager
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import (unittest, skip_on_cudasim,
|
||||
skip_if_external_memmgr, CUDATestCase)
|
||||
from numba.tests.support import captured_stderr
|
||||
from numba.core import config
|
||||
|
||||
|
||||
@skip_on_cudasim('not supported on CUDASIM')
@skip_if_external_memmgr('Deallocation specific to Numba memory management')
class TestDeallocation(CUDATestCase):
    """Deferred deallocations must flush when either the pending-count or
    the pending-bytes threshold is exceeded."""

    def test_max_pending_count(self):
        # get deallocation manager and flush it
        deallocs = cuda.current_context().memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        # deallocate to maximum count
        for i in range(config.CUDA_DEALLOCS_COUNT):
            # The device array created here is dropped immediately, which
            # adds exactly one pending deallocation per iteration.
            cuda.to_device(np.arange(1))
            self.assertEqual(len(deallocs), i + 1)
        # one more to trigger .clear()
        cuda.to_device(np.arange(1))
        self.assertEqual(len(deallocs), 0)

    def test_max_pending_bytes(self):
        # get deallocation manager and flush it
        ctx = cuda.current_context()
        deallocs = ctx.memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        mi = ctx.get_memory_info()

        max_pending = 10**6  # 1MB
        old_ratio = config.CUDA_DEALLOCS_RATIO
        try:
            # change to a smaller ratio
            config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
            # due to round off error (floor is used in calculating
            # _max_pending_bytes) it can be off by 1.
            self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending,
                                   delta=1)

            # allocate half the max size
            # this will not trigger deallocation
            cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
            self.assertEqual(len(deallocs), 1)

            # allocate another remaining
            # this will not trigger deallocation
            cuda.to_device(np.ones(deallocs._max_pending_bytes -
                                   deallocs._size, dtype=np.int8))
            self.assertEqual(len(deallocs), 2)

            # another byte to trigger .clear()
            cuda.to_device(np.ones(1, dtype=np.int8))
            self.assertEqual(len(deallocs), 0)
        finally:
            # restore old ratio
            config.CUDA_DEALLOCS_RATIO = old_ratio
|
||||
|
||||
|
||||
@skip_on_cudasim("defer_cleanup has no effect in CUDASIM")
@skip_if_external_memmgr('Deallocation specific to Numba memory management')
class TestDeferCleanup(CUDATestCase):
    """cuda.defer_cleanup() must hold pending deallocations until the
    outermost deferral region exits - including nesting and exceptions."""

    def test_basic(self):
        harr = np.arange(5)
        darr1 = cuda.to_device(harr)
        deallocs = cuda.current_context().memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr1
            self.assertEqual(len(deallocs), 1)
            del darr2
            self.assertEqual(len(deallocs), 2)
            # Inside the deferral region clear() must be a no-op.
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)

        # Outside the region, clear() releases everything again.
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

    def test_nested(self):
        harr = np.arange(5)
        darr1 = cuda.to_device(harr)
        deallocs = cuda.current_context().memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        with cuda.defer_cleanup():
            with cuda.defer_cleanup():
                darr2 = cuda.to_device(harr)
                del darr1
                self.assertEqual(len(deallocs), 1)
                del darr2
                self.assertEqual(len(deallocs), 2)
                deallocs.clear()
                self.assertEqual(len(deallocs), 2)
            # Leaving the inner region must not release anything while the
            # outer region is still active.
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)

        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

    def test_exception(self):
        harr = np.arange(5)
        darr1 = cuda.to_device(harr)
        deallocs = cuda.current_context().memory_manager.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        class CustomError(Exception):
            pass

        with self.assertRaises(CustomError):
            with cuda.defer_cleanup():
                darr2 = cuda.to_device(harr)
                del darr2
                self.assertEqual(len(deallocs), 1)
                deallocs.clear()
                self.assertEqual(len(deallocs), 1)
                raise CustomError
        # Deferral must be undone even when the region exits via an
        # exception.
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        del darr1
        self.assertEqual(len(deallocs), 1)
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
|
||||
|
||||
|
||||
class TestDeferCleanupAvail(CUDATestCase):
    """Smoke test: the defer_cleanup context manager API is available."""

    def test_context_manager(self):
        # Entering and leaving the context must not raise; no further
        # assertions are made here.
        manager = cuda.defer_cleanup()
        with manager:
            pass
|
||||
|
||||
|
||||
@skip_on_cudasim('not supported on CUDASIM')
class TestDel(CUDATestCase):
    """
    Ensure resources are deleted properly without ignored exception.
    """
    @contextmanager
    def check_ignored_exception(self, ctx):
        # Finalizer failures surface on stderr rather than being raised,
        # so capture stderr around the deletion and assert it stays empty.
        with captured_stderr() as cap:
            yield
            ctx.deallocations.clear()
        self.assertFalse(cap.getvalue())

    def test_stream(self):
        ctx = cuda.current_context()
        stream = ctx.create_stream()
        with self.check_ignored_exception(ctx):
            del stream

    def test_event(self):
        ctx = cuda.current_context()
        event = ctx.create_event()
        with self.check_ignored_exception(ctx):
            del event

    def test_pinned_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memhostalloc(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_mapped_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memhostalloc(32, mapped=True)
        with self.check_ignored_exception(ctx):
            del mem

    def test_device_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memalloc(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_managed_memory(self):
        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(32)
        with self.check_ignored_exception(ctx):
            del mem

    def test_pinned_contextmanager(self):
        # Check that temporarily pinned memory is unregistered immediately,
        # such that it can be re-pinned at any time
        class PinnedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.pinned(arr):
                pass
            with cuda.pinned(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.pinned(arr):
                    pass
                with cuda.pinned(arr):
                    pass
            # Should also work when breaking out of the block due to an
            # exception
            try:
                with cuda.pinned(arr):
                    raise PinnedException
            except PinnedException:
                with cuda.pinned(arr):
                    pass

    def test_mapped_contextmanager(self):
        # Check that temporarily mapped memory is unregistered immediately,
        # such that it can be re-mapped at any time
        class MappedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.mapped(arr):
                pass
            with cuda.mapped(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.mapped(arr):
                    pass
                with cuda.mapped(arr):
                    pass
            # Should also work when breaking out of the block due to an
            # exception
            try:
                with cuda.mapped(arr):
                    raise MappedException
            except MappedException:
                with cuda.mapped(arr):
                    pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,81 @@
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import threading
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
|
||||
skip_under_cuda_memcheck)
|
||||
from numba.tests.support import captured_stdout
|
||||
|
||||
|
||||
class TestCudaDetect(CUDATestCase):
    """Exercise cuda.detect() and sanity-check its printed report."""

    def test_cuda_detect(self):
        # detect() prints its report to stdout; capture it for inspection.
        with captured_stdout() as stream:
            cuda.detect()
        report = stream.getvalue()
        # The report mentions how many CUDA devices were found.
        for fragment in ('Found', 'CUDA devices'):
            self.assertIn(fragment, report)
|
||||
|
||||
|
||||
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
class TestCUDAFindLibs(CUDATestCase):
    """Driver/library lookup behaviour, exercised by running Numba in a
    child process with a deliberately bad NUMBA_CUDA_DRIVER path."""

    def run_cmd(self, cmdline, env):
        """Run *cmdline* with environment *env*; return (stdout, stderr).

        The child is killed if it has not finished within 5 minutes; in
        that case communicate() still returns whatever output the killed
        process produced.
        """
        popen = subprocess.Popen(cmdline,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 env=env)

        # finish in 5 minutes or kill it
        timeout = threading.Timer(5 * 60., popen.kill)
        try:
            timeout.start()
            out, err = popen.communicate()
            # the process should exit with an error
            return out.decode(), err.decode()
        finally:
            timeout.cancel()
        # NOTE: a trailing ``return None, None`` used to follow this
        # try/finally; communicate() either returns (handled above) or
        # raises (propagated through the finally), so that line was
        # unreachable and has been removed.

    def run_test_in_separate_process(self, envvar, envvar_value):
        """Launch a trivial kernel in a child process with *envvar* set to
        *envvar_value*; return the child's (stdout, stderr)."""
        env_copy = os.environ.copy()
        env_copy[envvar] = str(envvar_value)
        code = """if 1:
            from numba import cuda
            @cuda.jit('(int64,)')
            def kernel(x):
                pass
            kernel(1,)
            """
        cmdline = [sys.executable, "-c", code]
        return self.run_cmd(cmdline, env_copy)

    @skip_on_cudasim('Simulator does not hit device library search code path')
    @unittest.skipIf(not sys.platform.startswith('linux'), "linux only")
    def test_cuda_find_lib_errors(self):
        """
        This tests that the find_libs works as expected in the case of an
        environment variable being used to set the path.
        """
        # one of these is likely to exist on linux, it's also unlikely that
        # someone has extracted the contents of libdevice into here!
        locs = ['lib', 'lib64']

        # BUG FIX: only record a candidate that actually exists.  The
        # previous loop assigned looking_for on every iteration, so when
        # neither location existed it was left pointing at a nonexistent
        # path and the guard below never skipped the test as intended.
        looking_for = None
        for loc in locs:
            candidate = os.path.join(os.path.sep, loc)
            if os.path.exists(candidate):
                looking_for = candidate
                break

        # This is the testing part, the test will only run if there's a valid
        # path in which to look
        if looking_for is not None:
            out, err = self.run_test_in_separate_process("NUMBA_CUDA_DRIVER",
                                                         looking_for)
            self.assertTrue(out is not None)
            self.assertTrue(err is not None)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,192 @@
|
||||
import ctypes
|
||||
import numpy as np
|
||||
import weakref
|
||||
|
||||
from numba import cuda
|
||||
from numba.core import config
|
||||
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
||||
from numba.tests.support import linux_only
|
||||
|
||||
# Only define the dummy plugins off-simulator: they subclass the real
# HostOnlyCUDAMemoryManager and use driver internals (AutoFreePointer).
if not config.ENABLE_CUDASIM:
    class DeviceOnlyEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        """
        Dummy EMM Plugin implementation for testing. It memorises which plugin
        API methods have been called so that the tests can check that Numba
        called into the plugin as expected.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

            # For tracking our dummy allocations
            self.allocations = {}
            self.count = 0

            # For tracking which methods have been called
            self.initialized = False
            self.memalloc_called = False
            self.reset_called = False
            self.get_memory_info_called = False
            self.get_ipc_handle_called = False

        def memalloc(self, size):
            # We maintain a list of allocations and keep track of them, so that
            # we can test that the finalizers of objects returned by memalloc
            # get called.

            # Numba should have initialized the memory manager when preparing
            # the context for use, prior to any memalloc call.
            if not self.initialized:
                raise RuntimeError("memalloc called before initialize")
            self.memalloc_called = True

            # Create an allocation and record it
            self.count += 1
            alloc_count = self.count
            self.allocations[alloc_count] = size

            # The finalizer deletes the record from our internal dict of
            # allocations.
            finalizer_allocs = self.allocations

            def finalizer():
                del finalizer_allocs[alloc_count]

            # We use an AutoFreePointer so that the finalizer will be run when
            # the reference count drops to zero.
            ctx = weakref.proxy(self.context)
            # The "device pointer" is just the allocation's sequence number.
            ptr = ctypes.c_void_p(alloc_count)
            return cuda.cudadrv.driver.AutoFreePointer(ctx, ptr, size,
                                                       finalizer=finalizer)

        def initialize(self):
            # No special initialization needed.
            self.initialized = True

        def reset(self):
            # We remove all allocations on reset, just as a real EMM Plugin
            # would do. Note that our finalizers in memalloc don't check
            # whether the allocations are still alive, so running them after
            # reset will detect any allocations that are floating around at
            # exit time; however, the atexit finalizer for weakref will only
            # print a traceback, not terminate the interpreter abnormally.
            self.reset_called = True

        def get_memory_info(self):
            # Return some dummy memory information
            self.get_memory_info_called = True
            return cuda.MemoryInfo(free=32, total=64)

        def get_ipc_handle(self, memory):
            # The dummy IPC handle is only a string, so it is important that
            # the tests don't try to do too much with it (e.g. open / close
            # it).
            self.get_ipc_handle_called = True
            return "Dummy IPC handle for alloc %s" % memory.device_pointer.value

        @property
        def interface_version(self):
            # The expected version for an EMM Plugin.
            return 1

    class BadVersionEMMPlugin(DeviceOnlyEMMPlugin):
        """A plugin that claims to implement a different interface version"""

        @property
        def interface_version(self):
            # Deliberately not 1, so Numba should reject this plugin.
            return 2
|
||||
|
||||
|
||||
@skip_on_cudasim('EMM Plugins not supported on CUDA simulator')
class TestDeviceOnlyEMMPlugin(CUDATestCase):
    """
    Tests that the API of an EMM Plugin that implements device allocations
    only is used correctly by Numba.
    """

    def setUp(self):
        super().setUp()
        # Always start afresh with a new context and memory manager
        cuda.close()
        cuda.set_memory_manager(DeviceOnlyEMMPlugin)

    def tearDown(self):
        super().tearDown()
        # Unset the memory manager for subsequent tests
        cuda.close()
        cuda.cudadrv.driver._memory_manager = None

    def test_memalloc(self):
        mgr = cuda.current_context().memory_manager

        # Allocate an array and check that memalloc was called with the correct
        # size.
        arr_1 = np.arange(10)
        d_arr_1 = cuda.device_array_like(arr_1)
        self.assertTrue(mgr.memalloc_called)
        self.assertEqual(mgr.count, 1)
        self.assertEqual(mgr.allocations[1], arr_1.nbytes)

        # Allocate again, with a different size, and check that it is also
        # correct.
        arr_2 = np.arange(5)
        d_arr_2 = cuda.device_array_like(arr_2)
        self.assertEqual(mgr.count, 2)
        self.assertEqual(mgr.allocations[2], arr_2.nbytes)

        # Remove the first array, and check that our finalizer was called for
        # the first array only.
        del d_arr_1
        self.assertNotIn(1, mgr.allocations)
        self.assertIn(2, mgr.allocations)

        # Remove the second array and check that its finalizer was also
        # called.
        del d_arr_2
        self.assertNotIn(2, mgr.allocations)

    def test_initialized_in_context(self):
        # If we have a CUDA context, it should already have initialized its
        # memory manager.
        self.assertTrue(cuda.current_context().memory_manager.initialized)

    def test_reset(self):
        ctx = cuda.current_context()
        ctx.reset()
        self.assertTrue(ctx.memory_manager.reset_called)

    def test_get_memory_info(self):
        ctx = cuda.current_context()
        meminfo = ctx.get_memory_info()
        self.assertTrue(ctx.memory_manager.get_memory_info_called)
        # The dummy values returned by DeviceOnlyEMMPlugin.get_memory_info.
        self.assertEqual(meminfo.free, 32)
        self.assertEqual(meminfo.total, 64)

    @linux_only
    def test_get_ipc_handle(self):
        # We don't attempt to close the IPC handle in this test because Numba
        # will be expecting a real IpcHandle object to have been returned from
        # get_ipc_handle, and it would cause problems to do so.
        arr = np.arange(2)
        d_arr = cuda.device_array_like(arr)
        ipch = d_arr.get_ipc_handle()
        ctx = cuda.current_context()
        self.assertTrue(ctx.memory_manager.get_ipc_handle_called)
        self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle)
|
||||
|
||||
|
||||
@skip_on_cudasim('EMM Plugins not supported on CUDA simulator')
class TestBadEMMPluginVersion(CUDATestCase):
    """Numba must reject EMM plugins with an incompatible version number."""

    def test_bad_plugin_version(self):
        # Installing a plugin whose interface_version is not 1 must fail
        # with a RuntimeError naming the required version.
        with self.assertRaises(RuntimeError) as cm:
            cuda.set_memory_manager(BadVersionEMMPlugin)
        self.assertIn('version 1 required', str(cm.exception))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,38 @@
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
|
||||
|
||||
class TestCudaEvent(CUDATestCase):
    """Exercise event record/wait/synchronize/elapsed_time entry points."""

    def _exercise_events(self, stream=None):
        # Bracket a host-to-device copy with a pair of events, optionally
        # on an explicit stream, then query the elapsed time.  The purpose
        # is purely to exercise each API call.
        count = 32
        dary = cuda.device_array(count, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        kwargs = {} if stream is None else {'stream': stream}
        evtstart.record(**kwargs)
        cuda.to_device(np.arange(count, dtype=np.double), to=dary, **kwargs)
        evtend.record(**kwargs)
        evtend.wait(**kwargs)
        evtend.synchronize()
        # Exercise the code path
        evtstart.elapsed_time(evtend)

    def test_event_elapsed(self):
        self._exercise_events()

    def test_event_elapsed_stream(self):
        self._exercise_events(stream=cuda.stream())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,65 @@
|
||||
import numpy as np
|
||||
from numba.cuda.cudadrv import driver
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
|
||||
|
||||
class TestHostAlloc(ContextResettingTestCase):
    """Tests for host allocations (pinned and mapped) via the driver API."""

    def test_host_alloc_driver(self):
        n = 32
        mem = cuda.current_context().memhostalloc(n, mapped=True)

        # View the mapped allocation as a host ndarray of bytes.
        dtype = np.dtype(np.uint8)
        ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype,
                         buffer=mem)

        # A device-side memset on mapped memory must be visible on the host.
        magic = 0xab
        driver.device_memset(mem, magic, n)

        self.assertTrue(np.all(ary == magic))

        ary.fill(n)

        # Host writes must round-trip through a device-to-host copy.
        recv = np.empty_like(ary)

        driver.device_to_host(recv, mem, ary.size)

        self.assertTrue(np.all(ary == recv))
        self.assertTrue(np.all(recv == n))

    def test_host_alloc_pinned(self):
        ary = cuda.pinned_array(10, dtype=np.uint32)
        ary.fill(123)
        self.assertTrue(all(ary == 123))
        devary = cuda.to_device(ary)
        driver.device_memset(devary, 0, driver.device_memory_size(devary))
        # The device copy was zeroed, but pinned host memory is separate
        # storage and must be unchanged until explicitly copied back.
        self.assertTrue(all(ary == 123))
        devary.copy_to_host(ary)
        self.assertTrue(all(ary == 0))

    def test_host_alloc_mapped(self):
        ary = cuda.mapped_array(10, dtype=np.uint32)
        ary.fill(123)
        self.assertTrue(all(ary == 123))
        # Mapped memory is shared, so the device memset is observed
        # directly on the host view.
        driver.device_memset(ary, 0, driver.device_memory_size(ary))
        self.assertTrue(all(ary == 0))
        self.assertTrue(sum(ary != 0) == 0)

    def test_host_operators(self):
        # Arithmetic and comparison operators must work on both mapped and
        # pinned host arrays; expected values are for ary == [0..9].
        for ary in [cuda.mapped_array(10, dtype=np.uint32),
                    cuda.pinned_array(10, dtype=np.uint32)]:
            ary[:] = range(10)
            self.assertTrue(sum(ary + 1) == 55)
            self.assertTrue(sum((ary + 1) * 2 - 1) == 100)
            self.assertTrue(sum(ary < 5) == 5)
            self.assertTrue(sum(ary <= 5) == 6)
            self.assertTrue(sum(ary > 6) == 3)
            self.assertTrue(sum(ary >= 6) == 4)
            self.assertTrue(sum(ary ** 2) == 285)
            self.assertTrue(sum(ary // 2) == 20)
            self.assertTrue(sum(ary / 2.0) == 22.5)
            self.assertTrue(sum(ary % 2) == 5)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,139 @@
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
|
||||
from numba import cuda
|
||||
from numba.cuda.cudadrv.driver import CudaAPIError, driver
|
||||
from numba.cuda.cudadrv.error import CudaSupportError
|
||||
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
|
||||
|
||||
|
||||
# A mock of cuInit that always raises a CudaAPIError
def cuInit_raising(arg):
    # Code 999 and this message surface in the error text that the tests
    # in TestInit assert on.
    raise CudaAPIError(999, 'CUDA_ERROR_UNKNOWN')
|
||||
|
||||
|
||||
# Test code to run in a child that patches driver.cuInit to a variant that
# always raises. We can't use mock.patch.object here because driver.cuInit is
# not assigned until we attempt to initialize - mock.patch.object cannot locate
# the non-existent original method, and so fails. Instead we patch
# driver.cuInit with our raising version prior to any attempt to initialize.
def cuInit_raising_test(result_queue):
    driver.cuInit = cuInit_raising

    success = False
    msg = None

    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError as e:
        success = True
        msg = e.msg

    # Report back to the parent: whether the expected error was raised,
    # and the message carried by the exception.
    result_queue.put((success, msg))
|
||||
|
||||
|
||||
# Similar to cuInit_raising_test above, but for testing that the string
# returned by cuda_error() is as expected.
def initialization_error_test(result_queue):
    driver.cuInit = cuInit_raising

    success = False
    msg = None

    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError:
        success = True

    # Unlike cuInit_raising_test, the message comes from cuda.cuda_error()
    # rather than the exception instance.
    msg = cuda.cuda_error()
    result_queue.put((success, msg))
|
||||
|
||||
|
||||
# For testing the path where Driver.__init__() catches a CudaSupportError
def cuda_disabled_test(result_queue):
    success = False
    msg = None

    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError as e:
        success = True
        msg = e.msg

    # Report (raised-as-expected, exception message) to the parent.
    result_queue.put((success, msg))
|
||||
|
||||
|
||||
# Similar to cuda_disabled_test, but checks cuda.cuda_error() instead of the
# exception raised on initialization
def cuda_disabled_error_test(result_queue):
    success = False
    msg = None

    try:
        # A CUDA operation that forces initialization of the device
        cuda.device_array(1)
    except CudaSupportError:
        success = True

    # The recorded initialization error, not the exception message.
    msg = cuda.cuda_error()
    result_queue.put((success, msg))
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Simulator does not initialize driver')
class TestInit(CUDATestCase):
    """Driver-initialization failure handling, checked via subprocesses so
    the parent process's driver state is not disturbed."""

    def _test_init_failure(self, target, expected):
        """Run *target* in a spawned child and assert that it reported a
        CudaSupportError whose message contains *expected*."""
        # Run the initialization failure test in a separate subprocess
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=target, args=(result_queue,))
        proc.start()
        proc.join(30)  # should complete within 30s
        success, msg = result_queue.get()

        # Ensure the child process raised an exception during initialization
        # before checking the message
        if not success:
            self.fail('CudaSupportError not raised')

        self.assertIn(expected, msg)

    def test_init_failure_raising(self):
        expected = 'Error at driver init: CUDA_ERROR_UNKNOWN (999)'
        self._test_init_failure(cuInit_raising_test, expected)

    def test_init_failure_error(self):
        expected = 'CUDA_ERROR_UNKNOWN (999)'
        self._test_init_failure(initialization_error_test, expected)

    def _test_cuda_disabled(self, target):
        # Uses _test_init_failure to launch the test in a separate subprocess
        # with CUDA disabled.
        cuda_disabled = os.environ.get('NUMBA_DISABLE_CUDA')
        os.environ['NUMBA_DISABLE_CUDA'] = "1"
        try:
            expected = 'CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1'
            # BUG FIX: run the *target* passed by the caller.  The previous
            # version hard-coded cuda_disabled_test here, so
            # test_cuda_disabled_error never actually exercised
            # cuda_disabled_error_test.
            self._test_init_failure(target, expected)
        finally:
            # Restore the pre-test value (or unset it if it was not set).
            if cuda_disabled is not None:
                os.environ['NUMBA_DISABLE_CUDA'] = cuda_disabled
            else:
                os.environ.pop('NUMBA_DISABLE_CUDA')

    def test_cuda_disabled_raising(self):
        self._test_cuda_disabled(cuda_disabled_test)

    def test_cuda_disabled_error(self):
        self._test_cuda_disabled(cuda_disabled_error_test)

    def test_init_success(self):
        # Here we assume that initialization is successful (because many bad
        # things will happen with the test suite if it is not) and check that
        # there is no error recorded.
        self.assertIsNone(cuda.cuda_error())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,37 @@
|
||||
from llvmlite import ir
|
||||
|
||||
from numba.cuda.cudadrv import nvvm
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
|
||||
|
||||
@skip_on_cudasim('Inline PTX cannot be used in the simulator')
class TestCudaInlineAsm(ContextResettingTestCase):
    """Check that llvmlite InlineAsm survives NVVM compilation into PTX."""

    def test_inline_rsqrt(self):
        # Build a minimal NVVM module with a single function that applies
        # the rsqrt.approx.f32 instruction to a float in place.
        mod = ir.Module(__name__)
        mod.triple = 'nvptx64-nvidia-cuda'
        nvvm.add_ir_version(mod)
        fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
        fn = ir.Function(mod, fnty, 'cu_rsqrt')
        bldr = ir.IRBuilder(fn.append_basic_block('entry'))

        rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
        inlineasm = ir.InlineAsm(rsqrt_approx_fnty,
                                 'rsqrt.approx.f32 $0, $1;',
                                 '=f,f', side_effect=True)
        val = bldr.load(fn.args[0])
        res = bldr.call(inlineasm, [val])

        bldr.store(res, fn.args[0])
        bldr.ret_void()

        # generate ptx
        mod.data_layout = nvvm.NVVM().data_layout
        nvvm.set_cuda_kernel(fn)
        nvvmir = str(mod)
        ptx = nvvm.compile_ir(nvvmir)
        # The inline instruction must appear verbatim in the generated PTX.
        self.assertTrue('rsqrt.approx.f32' in str(ptx))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,12 @@
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53
|
||||
|
||||
|
||||
class TestIsFP16Supported(CUDATestCase):
    """Checks for the float16-support query APIs."""

    def test_is_fp16_supported(self):
        self.assertTrue(cuda.is_float16_supported())

    # BUG FIX: skip_on_cudasim is a decorator *factory* (it is called with
    # a reason string everywhere else in this suite).  The original bare
    # ``@skip_on_cudasim`` applied the factory directly to the function,
    # leaving a decorator object in place of a runnable test.
    @skip_on_cudasim('Device attribute not available on cudasim')
    @skip_unless_cc_53
    def test_device_supports_float16(self):
        self.assertTrue(cuda.get_current_device().supports_float16)
|
||||
@@ -0,0 +1,317 @@
|
||||
import numpy as np
|
||||
import warnings
|
||||
from numba.cuda.testing import unittest
|
||||
from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing)
|
||||
from numba.cuda.testing import CUDATestCase, test_data_dir
|
||||
from numba.cuda.cudadrv.driver import (CudaAPIError, Linker,
|
||||
LinkerError)
|
||||
from numba.cuda.cudadrv.error import NvrtcError
|
||||
from numba.cuda import require_context
|
||||
from numba.tests.support import ignore_internal_warnings
|
||||
from numba import cuda, void, float64, int64, int32, typeof, float32
|
||||
|
||||
|
||||
# Host-side data that simple_const_mem materialises in constant memory.
CONST1D = np.arange(10, dtype=np.float64)


def simple_const_mem(A):
    # Kernel: each thread writes CONST1D[i] + 1 into A[i], reading the
    # data through a const.array_like view.
    C = cuda.const.array_like(CONST1D)
    i = cuda.grid(1)

    A[i] = C[i] + 1.0
|
||||
|
||||
|
||||
def func_with_lots_of_registers(x, a, b, c, d, e, f):
    """Kernel deliberately written to use many registers.

    Twenty independent accumulators are updated on every loop iteration,
    keeping many values live simultaneously and pushing up register usage.
    The test_set_registers_* tests rely on this function needing more than
    57 registers when compiled without a max_registers limit.

    NOTE: do not simplify or restructure this body - the register pressure
    depends on its exact shape.
    """
    # Floating-point accumulators (addition group).
    a1 = 1.0
    a2 = 1.0
    a3 = 1.0
    a4 = 1.0
    a5 = 1.0
    # Floating-point accumulators (multiplication group).
    b1 = 1.0
    b2 = 1.0
    b3 = 1.0
    b4 = 1.0
    b5 = 1.0
    # Floating-point accumulators (division group).
    c1 = 1.0
    c2 = 1.0
    c3 = 1.0
    c4 = 1.0
    c5 = 1.0
    # Integer accumulators (shift group).
    d1 = 10
    d2 = 10
    d3 = 10
    d4 = 10
    d5 = 10
    for i in range(a):
        a1 += b
        a2 += c
        a3 += d
        a4 += e
        a5 += f
        b1 *= b
        b2 *= c
        b3 *= d
        b4 *= e
        b5 *= f
        c1 /= b
        c2 /= c
        c3 /= d
        c4 /= e
        c5 /= f
        d1 <<= b
        d2 <<= c
        d3 <<= d
        d4 <<= e
        d5 <<= f
    # Fold every accumulator into the output so none is optimized away.
    x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
    x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
    x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
    x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5
|
||||
|
||||
|
||||
def simple_smem(ary, dty):
    """Kernel using a 100-element shared array of dtype ``dty``.

    Thread 0 fills the shared array, then after a barrier every thread copies
    one element out to ``ary``.  Used by the shared-memory-size introspection
    tests (100 elements x itemsize of ``dty`` bytes).
    """
    sm = cuda.shared.array(100, dty)
    i = cuda.grid(1)
    if i == 0:
        for j in range(100):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
|
||||
|
||||
|
||||
def coop_smem2d(ary):
    """Kernel where threads cooperate through a 2D float32 shared array.

    Each thread writes one shared element, synchronizes, then reads its own
    element back out to global memory.  Used by
    TestLinker.test_get_max_threads_per_block.
    """
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]
|
||||
|
||||
|
||||
def simple_maxthreads(ary):
    # Trivial kernel used to probe the maximum threads per block and to
    # check the failure mode when that maximum is exceeded at launch.
    i = cuda.grid(1)
    ary[i] = i
|
||||
|
||||
|
||||
LMEM_SIZE = 1000
|
||||
|
||||
|
||||
def simple_lmem(A, B, dty):
    """Kernel staging A into a LMEM_SIZE-element local array of dtype ``dty``
    before copying it out to B.

    Used by the local-memory-size introspection tests, which expect at least
    LMEM_SIZE x itemsize bytes of local memory per thread.
    """
    C = cuda.local.array(LMEM_SIZE, dty)
    for i in range(C.shape[0]):
        C[i] = A[i]
    for i in range(C.shape[0]):
        B[i] = C[i]
|
||||
|
||||
|
||||
@skip_on_cudasim('Linking unsupported in the simulator')
class TestLinker(CUDATestCase):
    """Tests of linking external code (PTX and CUDA C sources) into kernels,
    and of per-kernel resource introspection (registers, shared / local /
    constant memory, max threads per block)."""

    _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'}

    @require_context
    def test_linker_basic(self):
        '''Simply go through the constructor and destructor
        '''
        linker = Linker.new(cc=(5, 3))
        del linker

    def _test_linking(self, eager):
        # Link against the bar() device function from jitlink.ptx, with
        # either eager (signature supplied) or lazy compilation.
        global bar  # must be a global; otherwise it is recognized as a freevar
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.ptx')

        if eager:
            args = ['void(int32[:], int32[:])']
        else:
            args = []

        @cuda.jit(*args, link=[link])
        def foo(x, y):
            i = cuda.grid(1)
            x[i] += bar(y[i])

        A = np.array([123], dtype=np.int32)
        B = np.array([321], dtype=np.int32)

        foo[1, 1](A, B)

        # Matches the operation of bar() in jitlink.ptx (doubles its input).
        self.assertTrue(A[0] == 123 + 2 * 321)

    def test_linking_lazy_compile(self):
        self._test_linking(eager=False)

    def test_linking_eager_compile(self):
        self._test_linking(eager=True)

    def test_linking_cu(self):
        # Link a device function from CUDA C source, compiled through NVRTC.
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.cu')

        @cuda.jit(link=[link])
        def kernel(r, x):
            i = cuda.grid(1)

            if i < len(r):
                r[i] = bar(x[i])

        x = np.arange(10, dtype=np.int32)
        r = np.zeros_like(x)

        kernel[1, 32](r, x)

        # Matches the operation of bar() in jitlink.cu
        expected = x * 2
        np.testing.assert_array_equal(r, expected)

    def test_linking_cu_log_warning(self):
        # Compiling warn.cu should surface NVRTC's log output as a warning.
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'warn.cu')

        with warnings.catch_warnings(record=True) as w:
            ignore_internal_warnings()

            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        self.assertEqual(len(w), 1, 'Expected warnings from NVRTC')
        # Check the warning refers to the log messages
        self.assertIn('NVRTC log messages', str(w[0].message))
        # Check the message pertaining to the unused variable is provided
        self.assertIn('declared but never referenced', str(w[0].message))

    def test_linking_cu_error(self):
        # Compiling error.cu should raise with a helpful NVRTC error message.
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'error.cu')

        with self.assertRaises(NvrtcError) as e:
            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        msg = e.exception.args[0]
        # Check the error message refers to the NVRTC compile
        self.assertIn('NVRTC Compilation failure', msg)
        # Check the expected error in the CUDA source is reported
        self.assertIn('identifier "SYNTAX" is undefined', msg)
        # Check the filename is reported correctly
        self.assertIn('in the compilation of "error.cu"', msg)

    def test_linking_unknown_filetype_error(self):
        expected_err = "Don't know how to link file with extension .cuh"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['header.cuh'])
            def kernel():
                pass

    def test_linking_file_with_no_extension_error(self):
        expected_err = "Don't know how to link file with no extension"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['data'])
            def kernel():
                pass

    @skip_if_cuda_includes_missing
    def test_linking_cu_cuda_include(self):
        link = str(test_data_dir / 'cuda_include.cu')

        # An exception will be raised when linking this kernel due to the
        # compile failure if CUDA includes cannot be found by Nvrtc.
        @cuda.jit('void()', link=[link])
        def kernel():
            pass

    def test_try_to_link_nonexistent(self):
        # A missing link file should be reported through a LinkerError.
        with self.assertRaises(LinkerError) as e:
            @cuda.jit('void(int32[::1])', link=['nonexistent.a'])
            def f(x):
                x[0] = 0
        self.assertIn('nonexistent.a not found', e.exception.args)

    def test_set_registers_no_max(self):
        """Ensure that the jitted kernel used in the test_set_registers_* tests
        uses more than 57 registers - this ensures that test_set_registers_*
        are really checking that they reduced the number of registers used from
        something greater than the maximum."""
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertGreater(compiled.get_regs_per_thread(), 57)

    def test_set_registers_57(self):
        compiled = cuda.jit(max_registers=57)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 57)

    def test_set_registers_38(self):
        compiled = cuda.jit(max_registers=38)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_set_registers_eager(self):
        # Same as above, but with an eagerly-compiled (signature-given) kernel.
        sig = void(float64[::1], int64, int64, int64, int64, int64, int64)
        compiled = cuda.jit(sig, max_registers=38)(func_with_lots_of_registers)
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_get_const_mem_size(self):
        sig = void(float64[::1])
        compiled = cuda.jit(sig)(simple_const_mem)
        const_mem_size = compiled.get_const_mem_size()
        # At least CONST1D's size; the compiler may add constants of its own.
        self.assertGreaterEqual(const_mem_size, CONST1D.nbytes)

    def test_get_no_shared_memory(self):
        # A kernel that declares no shared arrays must report zero bytes.
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        shared_mem_size = compiled.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 0)

    def test_get_shared_mem_per_block(self):
        sig = void(int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_smem)
        shared_mem_size = compiled.get_shared_mem_per_block()
        # 100 int32 elements -> 400 bytes of shared memory.
        self.assertEqual(shared_mem_size, 400)

    def test_get_shared_mem_per_specialized(self):
        compiled = cuda.jit(simple_smem)
        compiled_specialized = compiled.specialize(
            np.zeros(100, dtype=np.int32), np.float64)
        shared_mem_size = compiled_specialized.get_shared_mem_per_block()
        # 100 float64 elements -> 800 bytes of shared memory.
        self.assertEqual(shared_mem_size, 800)

    def test_get_max_threads_per_block(self):
        compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
        max_threads = compiled.get_max_threads_per_block()
        self.assertGreater(max_threads, 0)

    def test_max_threads_exceeded(self):
        # Launching with one thread more than the kernel supports should fail
        # inside cuLaunchKernel.
        compiled = cuda.jit("void(int32[::1])")(simple_maxthreads)
        max_threads = compiled.get_max_threads_per_block()
        nelem = max_threads + 1
        ary = np.empty(nelem, dtype=np.int32)
        try:
            compiled[1, nelem](ary)
        except CudaAPIError as e:
            self.assertIn("cuLaunchKernel", e.msg)

    def test_get_local_mem_per_thread(self):
        sig = void(int32[::1], int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_lmem)
        local_mem_size = compiled.get_local_mem_per_thread()
        calc_size = np.dtype(np.int32).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)

    def test_get_local_mem_per_specialized(self):
        compiled = cuda.jit(simple_lmem)
        compiled_specialized = compiled.specialize(
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.float64)
        local_mem_size = compiled_specialized.get_local_mem_per_thread()
        calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,127 @@
|
||||
import numpy as np
|
||||
from ctypes import byref, c_size_t
|
||||
from numba.cuda.cudadrv.driver import device_memset, driver, USE_NV_BINDING
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
from numba.cuda.testing import skip_on_cudasim, skip_on_arm
|
||||
from numba.tests.support import linux_only
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
@linux_only
@skip_on_arm('Managed Alloc support is experimental/untested on ARM')
class TestManagedAlloc(ContextResettingTestCase):
    """Tests of managed (unified) memory, both through the driver interface
    directly and through the cuda.managed_array API."""

    def get_total_gpu_memory(self):
        # We use a driver function to directly get the total GPU memory because
        # an EMM plugin may report something different (or not implement
        # get_memory_info at all).
        if USE_NV_BINDING:
            free, total = driver.cuMemGetInfo()
            return total
        else:
            free = c_size_t()
            total = c_size_t()
            driver.cuMemGetInfo(byref(free), byref(total))
            return total.value

    def skip_if_cc_major_lt(self, min_required, reason):
        """
        Skip the current test if the compute capability of the device is
        less than `min_required`.
        """
        ctx = cuda.current_context()
        cc_major = ctx.device.compute_capability[0]
        if cc_major < min_required:
            self.skipTest(reason)

    # CUDA Unified Memory comes in two flavors. For GPUs in the Kepler and
    # Maxwell generations, managed memory allocations work as opaque,
    # contiguous segments that can either be on the device or the host. For
    # GPUs in the Pascal or later generations, managed memory operates on a
    # per-page basis, so we can have arrays larger than GPU memory, where only
    # part of them is resident on the device at one time. To ensure that this
    # test works correctly on all supported GPUs, we'll select the size of our
    # memory such that we only oversubscribe the GPU memory if we're on a
    # Pascal or newer GPU (compute capability at least 6.0).

    def test_managed_alloc_driver_undersubscribe(self):
        msg = "Managed memory unsupported prior to CC 3.0"
        self.skip_if_cc_major_lt(3, msg)
        self._test_managed_alloc_driver(0.5)

    # This test is skipped by default because it is easy to hang the machine
    # for a very long time or get OOM killed if the GPU memory size is >50% of
    # the system memory size. Even if the system does have more than 2x the RAM
    # of the GPU, this test runs for a very long time (in comparison to the
    # rest of the tests in the suite).
    #
    # However, it is left in here for manual testing as required.

    @unittest.skip
    def test_managed_alloc_driver_oversubscribe(self):
        msg = "Oversubscription of managed memory unsupported prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_alloc_driver(2.0)

    def test_managed_alloc_driver_host_attach(self):
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        # Only test with a small array (0.01 * memory size) to keep the test
        # quick.
        self._test_managed_alloc_driver(0.01, attach_global=False)

    def _test_managed_alloc_driver(self, memory_factor, attach_global=True):
        # Verify that we can allocate and operate on managed
        # memory through the CUDA driver interface.

        # Allocation size is a fraction of total GPU memory; see the comment
        # above test_managed_alloc_driver_undersubscribe for why.
        total_mem_size = self.get_total_gpu_memory()
        n_bytes = int(memory_factor * total_mem_size)

        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(n_bytes, attach_global=attach_global)

        dtype = np.dtype(np.uint8)
        n_elems = n_bytes // dtype.itemsize
        ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem)

        # Fill from the device side, then read back on the host below.
        magic = 0xab
        device_memset(mem, magic, n_bytes)
        ctx.synchronize()

        # Note that this assertion operates on the CPU, so this
        # test effectively drives both the CPU and the GPU on
        # managed memory.

        self.assertTrue(np.all(ary == magic))

    def _test_managed_array(self, attach_global=True):
        # Check the managed_array interface on both host and device.

        # Fill on the host...
        ary = cuda.managed_array(100, dtype=np.double)
        ary.fill(123.456)
        self.assertTrue(all(ary == 123.456))

        # ...then overwrite on the device and check on the host again.
        @cuda.jit('void(double[:])')
        def kernel(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = 1.0

        kernel[10, 10](ary)
        cuda.current_context().synchronize()

        self.assertTrue(all(ary == 1.0))

    def test_managed_array_attach_global(self):
        self._test_managed_array()

    def test_managed_array_attach_host(self):
        # NOTE(review): this first call duplicates test_managed_array_attach_global
        # and runs before the CC check - presumably a global-attach baseline
        # before the host-attach run; confirm.
        self._test_managed_array()
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_array(attach_global=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,54 @@
|
||||
import multiprocessing as mp
|
||||
import traceback
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck,
|
||||
skip_if_mvc_libraries_unavailable)
|
||||
from numba.tests.support import linux_only
|
||||
|
||||
|
||||
def child_test():
    """Child-process body: enable Minor Version Compatibility after
    numba.cuda has been imported, then compile and launch a trivial
    kernel to check the toolchain still works."""
    from numba import config, cuda

    # The MVC setting is deliberately flipped only after the import above,
    # to exercise late configuration.
    config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1

    @cuda.jit
    def noop_kernel():
        pass

    noop_kernel[1, 1]()
|
||||
|
||||
|
||||
def child_test_wrapper(result_queue):
    """Run child_test and report ``(success, output)`` via *result_queue*.

    Exceptions cannot propagate across the process boundary, so anything
    raised is caught and its formatted traceback returned as the output.
    """
    try:
        success, output = True, child_test()
    except:  # noqa: E722
        success, output = False, traceback.format_exc()

    result_queue.put((success, output))
|
||||
|
||||
|
||||
@linux_only
@skip_under_cuda_memcheck('May hang CUDA memcheck')
@skip_on_cudasim('Simulator does not require or implement MVC')
@skip_if_mvc_libraries_unavailable
class TestMinorVersionCompatibility(CUDATestCase):
    """Runs the MVC child test in a spawned subprocess and checks that it
    completed successfully."""

    def test_mvc(self):
        # Spawn (not fork) so the child starts with a fresh CUDA state.
        spawn_ctx = mp.get_context('spawn')
        results = spawn_ctx.Queue()
        child = spawn_ctx.Process(target=child_test_wrapper, args=(results,))
        child.start()
        child.join()
        success, output = results.get()

        # Only check the output once we know the child ran to completion.
        if not success:
            self.fail(output)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,199 @@
|
||||
import warnings
|
||||
|
||||
from llvmlite import ir
|
||||
from numba.cuda.cudadrv import nvvm, runtime
|
||||
from numba.cuda.testing import unittest
|
||||
from numba.cuda.cudadrv.nvvm import LibDevice, NvvmError, NVVM
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
|
||||
|
||||
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestNvvmDriver(unittest.TestCase):
    """Tests of the NVVM driver wrapper: IR compilation, compiler options,
    kernel annotation, and error/warning reporting."""

    def get_nvvmir(self):
        # Instantiate the generic IR template (defined at module level) with
        # the current NVVM's IR version and data layout.
        versions = NVVM().get_ir_version()
        data_layout = NVVM().data_layout
        return nvvmir_generic.format(data_layout=data_layout, v=versions)

    def test_nvvm_compile_simple(self):
        # Both functions from the IR template should appear in the PTX.
        nvvmir = self.get_nvvmir()
        ptx = nvvm.compile_ir(nvvmir).decode('utf8')
        self.assertTrue('simple' in ptx)
        self.assertTrue('ave' in ptx)

    def test_nvvm_compile_nullary_option(self):
        # Tests compilation with an option that doesn't take an argument
        # ("-gen-lto") - all other NVVM options are of the form
        # "-<name>=<value>"

        # -gen-lto is not available prior to CUDA 11.5
        if runtime.get_version() < (11, 5):
            self.skipTest("-gen-lto unavailable in this toolkit version")

        nvvmir = self.get_nvvmir()
        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch="compute_52")

        # Verify we correctly passed the option by checking if we got LTOIR
        # from NVVM (by looking for the expected magic number for LTOIR)
        self.assertEqual(ltoir[:4], b'\xed\x43\x4e\x7f')

    def test_nvvm_bad_option(self):
        # Ensure that unsupported / non-existent options are reported as such
        # to the user / caller
        msg = "-made-up-option=2 is an unsupported option"
        with self.assertRaisesRegex(NvvmError, msg):
            nvvm.compile_ir("", made_up_option=2)

    def test_nvvm_from_llvm(self):
        # Build a minimal kernel with llvmlite and compile it to PTX.
        m = ir.Module("test_nvvm_from_llvm")
        m.triple = 'nvptx64-nvidia-cuda'
        nvvm.add_ir_version(m)
        fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
        kernel = ir.Function(m, fty, name='mycudakernel')
        bldr = ir.IRBuilder(kernel.append_basic_block('entry'))
        bldr.ret_void()
        nvvm.set_cuda_kernel(kernel)

        m.data_layout = NVVM().data_layout
        ptx = nvvm.compile_ir(str(m)).decode('utf8')
        self.assertTrue('mycudakernel' in ptx)
        self.assertTrue('.address_size 64' in ptx)

    def test_used_list(self):
        # Construct a module
        m = ir.Module("test_used_list")
        m.triple = 'nvptx64-nvidia-cuda'
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)

        # Add a function and mark it as a kernel
        fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
        kernel = ir.Function(m, fty, name='mycudakernel')
        bldr = ir.IRBuilder(kernel.append_basic_block('entry'))
        bldr.ret_void()
        nvvm.set_cuda_kernel(kernel)

        # Verify that the used list was correctly constructed
        used_lines = [line for line in str(m).splitlines()
                      if 'llvm.used' in line]
        msg = 'Expected exactly one @"llvm.used" array'
        self.assertEqual(len(used_lines), 1, msg)

        used_line = used_lines[0]
        # Kernel should be referenced in the used list
        self.assertIn("mycudakernel", used_line)
        # Check linkage of the used list
        self.assertIn("appending global", used_line)
        # Ensure used list is in the metadata section
        self.assertIn('section "llvm.metadata"', used_line)

    def test_nvvm_ir_verify_fail(self):
        # An invalid triple must be rejected by NVVM's verifier.
        m = ir.Module("test_bad_ir")
        m.triple = "unknown-unknown-unknown"
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)
        with self.assertRaisesRegex(NvvmError, 'Invalid target triple'):
            nvvm.compile_ir(str(m))

    def _test_nvvm_support(self, arch):
        # Compile the generic IR for a specific compute capability and check
        # the PTX targets the matching SM.
        compute_xx = 'compute_{0}{1}'.format(*arch)
        nvvmir = self.get_nvvmir()
        ptx = nvvm.compile_ir(nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0,
                              prec_div=0).decode('utf8')
        self.assertIn(".target sm_{0}{1}".format(*arch), ptx)
        self.assertIn('simple', ptx)
        self.assertIn('ave', ptx)

    def test_nvvm_support(self):
        """Test supported CC by NVVM
        """
        for arch in nvvm.get_supported_ccs():
            self._test_nvvm_support(arch=arch)

    def test_nvvm_warning(self):
        m = ir.Module("test_nvvm_warning")
        m.triple = 'nvptx64-nvidia-cuda'
        m.data_layout = NVVM().data_layout
        nvvm.add_ir_version(m)

        fty = ir.FunctionType(ir.VoidType(), [])
        kernel = ir.Function(m, fty, name='inlinekernel')
        builder = ir.IRBuilder(kernel.append_basic_block('entry'))
        builder.ret_void()
        nvvm.set_cuda_kernel(kernel)

        # Add the noinline attribute to trigger NVVM to generate a warning
        kernel.attributes.add('noinline')

        with warnings.catch_warnings(record=True) as w:
            nvvm.compile_ir(str(m))

        self.assertEqual(len(w), 1)
        self.assertIn('overriding noinline attribute', str(w[0]))
|
||||
|
||||
|
||||
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestArchOption(unittest.TestCase):
    """Checks the mapping of compute capabilities to NVVM arch options."""

    def test_get_arch_option(self):
        # Unknown CCs map to the nearest lower supported arch.
        self.assertEqual(nvvm.get_arch_option(5, 3), 'compute_53')
        self.assertEqual(nvvm.get_arch_option(7, 5), 'compute_75')
        self.assertEqual(nvvm.get_arch_option(7, 7), 'compute_75')
        # Known CCs map to themselves.
        supported_cc = nvvm.get_supported_ccs()
        for major, minor in supported_cc:
            expected = 'compute_%d%d' % (major, minor)
            self.assertEqual(nvvm.get_arch_option(major, minor), expected)
        # Anything beyond the newest known CC clamps to the highest arch.
        highest = 'compute_%d%d' % supported_cc[-1]
        self.assertEqual(nvvm.get_arch_option(1000, 0), highest)
|
||||
|
||||
|
||||
@skip_on_cudasim('NVVM Driver unsupported in the simulator')
class TestLibDevice(unittest.TestCase):
    """Check that the libdevice library can be located and loaded."""

    def test_libdevice_load(self):
        # LLVM bitcode files begin with the magic bytes b'BC\xc0\xde'.
        bitcode_magic = b'BC\xc0\xde'
        libdevice = LibDevice()
        self.assertEqual(libdevice.bc[:4], bitcode_magic)
|
||||
|
||||
|
||||
nvvmir_generic = '''\
|
||||
target triple="nvptx64-nvidia-cuda"
|
||||
target datalayout = "{data_layout}"
|
||||
|
||||
define i32 @ave(i32 %a, i32 %b) {{
|
||||
entry:
|
||||
%add = add nsw i32 %a, %b
|
||||
%div = sdiv i32 %add, 2
|
||||
ret i32 %div
|
||||
}}
|
||||
|
||||
define void @simple(i32* %data) {{
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
%1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
%mul = mul i32 %0, %1
|
||||
%2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%add = add i32 %mul, %2
|
||||
%call = call i32 @ave(i32 %add, i32 %add)
|
||||
%idxprom = sext i32 %add to i64
|
||||
%arrayidx = getelementptr inbounds i32, i32* %data, i64 %idxprom
|
||||
store i32 %call, i32* %arrayidx, align 4
|
||||
ret void
|
||||
}}
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
|
||||
|
||||
!nvvmir.version = !{{!1}}
|
||||
!1 = !{{i32 {v[0]}, i32 {v[1]}, i32 {v[2]}, i32 {v[3]}}}
|
||||
|
||||
!nvvm.annotations = !{{!2}}
|
||||
!2 = !{{void (i32*)* @simple, !"kernel", i32 1}}
|
||||
|
||||
@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i32*)* @simple to i8*)], section "llvm.metadata"
|
||||
''' # noqa: E501
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,37 @@
|
||||
import numpy as np
|
||||
import platform
|
||||
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
|
||||
|
||||
class TestPinned(ContextResettingTestCase):
    """Round-trip host/device copy tests with pinned and unpinned memory."""

    def _run_copies(self, A):
        # Snapshot the input, round-trip it through the device on a stream,
        # then verify nothing changed.
        reference = np.copy(A)

        stream = cuda.stream()
        dev = cuda.to_device(A, copy=False, stream=stream)
        dev.copy_to_device(A, stream=stream)
        dev.copy_to_host(A, stream=stream)
        stream.synchronize()

        self.assertTrue(np.allclose(A, reference))

    def test_pinned(self):
        # Use a smaller buffer on ARM platforms than elsewhere.
        machine = platform.machine()
        if machine.startswith(('arm', 'aarch64')):
            count = 262144  # 2MB
        else:
            count = 2097152  # 16MB
        data = np.arange(count)
        with cuda.pinned(data):
            self._run_copies(data)

    def test_unpinned(self):
        data = np.arange(2 * 1024 * 1024)  # 16 MB
        self._run_copies(data)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,20 @@
|
||||
import unittest
|
||||
from numba.cuda.testing import ContextResettingTestCase
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import skip_on_cudasim
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Profiler unsupported in the simulator')
class TestProfiler(ContextResettingTestCase):
    """Exercises the profiling context manager."""

    def test_profiling(self):
        # Enter the profiling context twice (with allocations of 10 and then
        # 100 elements) to check it can be re-entered after exit.
        for size in (10, 100):
            with cuda.profiling():
                arr = cuda.device_array(size)
                del arr
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,149 @@
|
||||
import multiprocessing as mp
|
||||
import logging
|
||||
import traceback
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba.cuda.testing import (skip_on_cudasim, skip_with_cuda_python,
|
||||
skip_under_cuda_memcheck)
|
||||
from numba.tests.support import linux_only
|
||||
|
||||
|
||||
def child_test():
    """Child-process body for the PTDS test.

    Enables per-thread default streams (PTDS), launches the same kernel from
    several host threads on per-thread data, verifies the computed results,
    and returns the captured driver API log so the parent process can check
    which driver entry points (_ptds/_ptsz variants) were used.
    """
    from numba import cuda, int32, void
    from numba.core import config
    import io
    import numpy as np
    import threading

    # Enable PTDS before we make any CUDA driver calls. Enabling it first
    # ensures that PTDS APIs are used because the CUDA driver looks up API
    # functions on first use and memoizes them.
    config.CUDA_PER_THREAD_DEFAULT_STREAM = 1

    # Set up log capture for the Driver API so we can see what API calls were
    # used.
    logbuf = io.StringIO()
    handler = logging.StreamHandler(logbuf)
    cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver')
    cudadrv_logger.addHandler(handler)
    cudadrv_logger.setLevel(logging.DEBUG)

    # Set up data for our test, and copy over to the device
    N = 2 ** 16
    N_THREADS = 10
    N_ADDITIONS = 4096

    # Seed the RNG for repeatability
    np.random.seed(1)
    x = np.random.randint(low=0, high=1000, size=N, dtype=np.int32)
    r = np.zeros_like(x)

    # One input and output array for each thread
    xs = [cuda.to_device(x) for _ in range(N_THREADS)]
    rs = [cuda.to_device(r) for _ in range(N_THREADS)]

    # Compute the grid size and get the [per-thread] default stream
    n_threads = 256
    n_blocks = N // n_threads
    stream = cuda.default_stream()

    # A simple multiplication-by-addition kernel. What it does exactly is not
    # too important; only that we have a kernel that does something.
    @cuda.jit(void(int32[::1], int32[::1]))
    def f(r, x):
        i = cuda.grid(1)

        # Bounds check. Fixed from `i > len(r)`, which would have allowed
        # thread index len(r) through the guard and out of bounds (latent
        # only, because the launch configuration covers exactly N threads).
        if i >= len(r):
            return

        # Accumulate x into r
        for j in range(N_ADDITIONS):
            r[i] += x[i]

    # This function will be used to launch the kernel from each thread on its
    # own unique data.
    def kernel_thread(n):
        f[n_blocks, n_threads, stream](rs[n], xs[n])

    # Create threads
    threads = [threading.Thread(target=kernel_thread, args=(i,))
               for i in range(N_THREADS)]

    # Start all threads
    for thread in threads:
        thread.start()

    # Wait for all threads to finish, to ensure that we don't synchronize with
    # the device until all kernels are scheduled.
    for thread in threads:
        thread.join()

    # Synchronize with the device
    cuda.synchronize()

    # Check output is as expected
    expected = x * N_ADDITIONS
    for i in range(N_THREADS):
        np.testing.assert_equal(rs[i].copy_to_host(), expected)

    # Return the driver log output to the calling process for checking
    handler.flush()
    return logbuf.getvalue()
|
||||
|
||||
|
||||
def child_test_wrapper(result_queue):
    """Run child_test and report ``(success, output)`` via *result_queue*.

    Exceptions cannot propagate across the process boundary, so anything
    raised is caught and its formatted traceback returned as the output.
    """
    try:
        success, output = True, child_test()
    except:  # noqa: E722
        success, output = False, traceback.format_exc()

    result_queue.put((success, output))
|
||||
|
||||
|
||||
# Run on Linux only until the reason for test hangs on Windows (Issue #8635,
|
||||
# https://github.com/numba/numba/issues/8635) is diagnosed
|
||||
@linux_only
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
@skip_on_cudasim('Streams not supported on the simulator')
class TestPTDS(CUDATestCase):
    """Checks that enabling per-thread default streams (PTDS) makes the
    driver use the _ptds/_ptsz API variants, by inspecting the driver log
    captured by child_test in a spawned subprocess."""

    @skip_with_cuda_python('Function names unchanged for PTDS with NV Binding')
    def test_ptds(self):
        # Run a test with PTDS enabled in a child process
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
        proc.start()
        proc.join()
        success, output = result_queue.get()

        # Ensure the child process ran to completion before checking its output
        if not success:
            self.fail(output)

        # Functions with a per-thread default stream variant that we expect to
        # see in the output
        ptds_functions = ('cuMemcpyHtoD_v2_ptds', 'cuLaunchKernel_ptsz',
                          'cuMemcpyDtoH_v2_ptds')

        for fn in ptds_functions:
            with self.subTest(fn=fn, expected=True):
                self.assertIn(fn, output)

        # Non-PTDS versions of the functions that we should not see in the
        # output:
        legacy_functions = ('cuMemcpyHtoD_v2', 'cuLaunchKernel',
                            'cuMemcpyDtoH_v2')

        for fn in legacy_functions:
            with self.subTest(fn=fn, expected=False):
                # Ensure we only spot these function names appearing without a
                # _ptds or _ptsz suffix by checking including the end of the
                # line in the log
                fn_at_end = f'{fn}\n'
                self.assertNotIn(fn_at_end, output)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,36 @@
|
||||
import threading
|
||||
from numba import cuda
|
||||
from numba.cuda.cudadrv.driver import driver
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
from queue import Queue
|
||||
|
||||
|
||||
class TestResetDevice(ContextResettingTestCase):
    """Checks that devices can be repeatedly selected and closed."""

    def test_reset_device(self):

        def worker(errors):
            # Select and close every device, twice over; report any failure
            # through the queue since exceptions don't cross threads.
            try:
                n_devices = driver.get_device_count()
                for _ in range(2):
                    for device_id in range(n_devices):
                        cuda.select_device(device_id)
                        cuda.close()
            except Exception as e:
                errors.put(e)

        # Do test on a separate thread so that we don't affect
        # the current context in the main thread.
        error_queue = Queue()
        thread = threading.Thread(target=worker, args=(error_queue,))
        thread.start()
        thread.join()

        failures = []
        while not error_queue.empty():
            failures.append(error_queue.get())
        self.assertEqual(failures, [])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,85 @@
|
||||
import multiprocessing
|
||||
import os
|
||||
from numba.core import config
|
||||
from numba.cuda.cudadrv.runtime import runtime
|
||||
from numba.cuda.testing import unittest, SerialMixin, skip_on_cudasim
|
||||
from unittest.mock import patch
|
||||
|
||||
|
||||
def set_visible_devices_and_check(q):
    """Child-process target: import Numba, then restrict visibility to
    GPU 0 and report how many GPUs Numba can see via queue *q*.

    Puts -1 on the queue on any failure so the parent can distinguish a
    broken test driver from a wrong device count.
    """
    try:
        from numba import cuda
        import os

        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        q.put(len(cuda.gpus.lst))
    except:  # noqa: E722
        # Sentinel value for error executing test code
        q.put(-1)
|
||||
|
||||
|
||||
# Under the CUDA simulator there is no real runtime, so a single sentinel
# version tuple is used; otherwise these are the CUDA runtime versions that
# are expected to be supported. NOTE(review): this tuple is asserted equal
# to runtime.supported_versions below, so the two must be kept in sync.
if config.ENABLE_CUDASIM:
    SUPPORTED_VERSIONS = (-1, -1),
else:
    SUPPORTED_VERSIONS = ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5),
                          (11, 6), (11, 7))
|
||||
|
||||
|
||||
class TestRuntime(unittest.TestCase):
    """Checks for the runtime version support logic."""

    def test_is_supported_version_true(self):
        # Every version in SUPPORTED_VERSIONS must be reported as supported.
        for version in SUPPORTED_VERSIONS:
            with patch.object(runtime, 'get_version', return_value=version):
                self.assertTrue(runtime.is_supported_version())

    @skip_on_cudasim('The simulator always simulates a supported runtime')
    def test_is_supported_version_false(self):
        # An old unsupported version and some potential future versions
        # must all be reported as unsupported.
        unsupported = ((10, 2), (11, 8), (12, 0))
        for version in unsupported:
            with patch.object(runtime, 'get_version', return_value=version):
                self.assertFalse(runtime.is_supported_version())

    def test_supported_versions(self):
        # The module-level tuple must agree with the runtime's own list.
        self.assertEqual(SUPPORTED_VERSIONS, runtime.supported_versions)
|
||||
|
||||
|
||||
class TestVisibleDevices(unittest.TestCase, SerialMixin):
    def test_visible_devices_set_after_import(self):
        """Setting CUDA_VISIBLE_DEVICES after importing Numba must still
        limit the visible GPU list (see Issue #6149).

        Prior to the fix for that issue, Numba called
        runtime.get_version() on import, which initialized the driver and
        froze the device list before CUDA_VISIBLE_DEVICES could be set by
        the user.
        """
        # Import cuda here rather than at the top level so that the
        # spawned child (set_visible_devices_and_check) gets to import it
        # first in its own process.
        from numba import cuda

        if len(cuda.gpus.lst) in (0, 1):
            self.skipTest('This test requires multiple GPUs')

        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            self.skipTest('Cannot test when CUDA_VISIBLE_DEVICES already set')

        mp_context = multiprocessing.get_context('spawn')
        result_queue = mp_context.Queue()
        child = mp_context.Process(target=set_visible_devices_and_check,
                                   args=(result_queue,))
        child.start()
        try:
            visible_gpu_count = result_queue.get()
        finally:
            child.join()

        # -1 is the child's sentinel for "error running the test code";
        # make an obvious distinction between that and an incorrect
        # number of GPUs in the list.
        self.assertNotEqual(visible_gpu_count, -1,
                            msg='Error running set_visible_devices_and_check')

        # The actual check that we see only one GPU.
        self.assertEqual(visible_gpu_count, 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,41 @@
|
||||
#
|
||||
# Test does not work on some cards.
|
||||
#
|
||||
import threading
|
||||
from queue import Queue
|
||||
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, ContextResettingTestCase
|
||||
|
||||
|
||||
def newthread(exception_queue):
    """Select device 0, round-trip an array through the device on a fresh
    stream, then close the context.

    Any exception raised is forwarded through *exception_queue* so the
    calling test can detect failures that occur on the worker thread.
    """
    try:
        cuda.select_device(0)
        stream = cuda.stream()
        host_array = np.arange(100)
        device_array = cuda.to_device(host_array, stream=stream)
        stream.synchronize()
        # Drop the device references before closing the context.
        del device_array
        del stream
        cuda.close()
    except Exception as e:
        exception_queue.put(e)
|
||||
|
||||
|
||||
class TestSelectDevice(ContextResettingTestCase):
    def test_select_device(self):
        """Run the select/transfer/close cycle on ten fresh worker
        threads and fail if any of them raised an exception.

        Each iteration uses a new thread so that select_device() and
        close() are exercised from a clean thread state every time.
        """
        exception_queue = Queue()
        # The loop index is unused; `_` makes that explicit (was `i`).
        for _ in range(10):
            t = threading.Thread(target=newthread, args=(exception_queue,))
            t.start()
            t.join()

        exceptions = []
        while not exception_queue.empty():
            exceptions.append(exception_queue.get())
        self.assertEqual(exceptions, [])
|
||||
|
||||
|
||||
# Allow this test module to be run directly from the command line.
if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,122 @@
|
||||
import asyncio
|
||||
import functools
|
||||
import threading
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
||||
|
||||
|
||||
def with_asyncio_loop(f):
    """Decorator that runs coroutine function *f* to completion on a
    fresh, debug-enabled asyncio event loop and returns its result.
    """
    def _run_to_completion(coro):
        # A new loop per invocation, always closed afterwards, so tests
        # do not leak event loops into one another.
        event_loop = asyncio.new_event_loop()
        event_loop.set_debug(True)
        try:
            return event_loop.run_until_complete(coro)
        finally:
            event_loop.close()

    @functools.wraps(f)
    def runner(*args, **kwds):
        return _run_to_completion(f(*args, **kwds))

    return runner
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestCudaStream(CUDATestCase):
    def test_add_callback(self):
        # The callback should fire and set the event once the (empty)
        # stream's enqueued work has completed.
        fired = threading.Event()

        def on_done(stream, status, event):
            event.set()

        stream = cuda.stream()
        stream.add_callback(on_done, fired)
        self.assertTrue(fired.wait(1.0))

    def test_add_callback_with_default_arg(self):
        fired = threading.Event()

        def on_done(stream, status, arg):
            # With no user data supplied, the callback argument defaults
            # to None.
            self.assertIsNone(arg)
            fired.set()

        stream = cuda.stream()
        stream.add_callback(on_done)
        self.assertTrue(fired.wait(1.0))

    @with_asyncio_loop
    async def test_async_done(self):
        # Awaiting async_done on an idle stream should complete.
        stream = cuda.stream()
        await stream.async_done()

    @with_asyncio_loop
    async def test_parallel_tasks(self):
        async def roundtrip(value_in: float) -> float:
            # Copy a value to the device and back on a private stream,
            # awaiting stream completion in between.
            stream = cuda.stream()
            src, dst = cuda.pinned_array(8), cuda.pinned_array(8)
            src[:] = value_in
            on_device = cuda.to_device(src, stream=stream)
            on_device.copy_to_host(dst, stream=stream)
            # async_done resolves to the stream itself.
            done_result = await stream.async_done()
            self.assertEqual(done_result, stream)
            return dst.mean()

        inputs = [1, 2, 3, 4]
        tasks = [asyncio.create_task(roundtrip(v)) for v in inputs]
        outputs = await asyncio.gather(*tasks)
        self.assertTrue(np.allclose(inputs, outputs))

    @with_asyncio_loop
    async def test_multiple_async_done(self):
        # Several awaitables on the same stream should all resolve to it.
        stream = cuda.stream()
        results = await asyncio.gather(*(stream.async_done()
                                         for _ in range(4)))
        for result in results:
            self.assertEqual(result, stream)

    @with_asyncio_loop
    async def test_multiple_async_done_multiple_streams(self):
        streams = [cuda.stream() for _ in range(4)]
        results = await asyncio.gather(*(s.async_done() for s in streams))

        # Every one of the four original streams should appear in the
        # results.
        self.assertSetEqual(set(results), set(streams))

    @with_asyncio_loop
    async def test_cancelled_future(self):
        # Cancelling one future for a stream must not affect another
        # future on the same stream.
        stream = cuda.stream()
        first, second = stream.async_done(), stream.async_done()
        first.cancel()
        await second
        self.assertTrue(first.cancelled())
        self.assertTrue(second.done())
|
||||
|
||||
|
||||
@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
class TestFailingStream(CUDATestCase):
    # This test can only be run in isolation because it corrupts the CUDA
    # context, which cannot be recovered from within the same process. It is
    # left here so that it can be run manually for debugging / testing purposes
    # - or may be re-enabled if in future there is infrastructure added for
    # running tests in a separate process (a subprocess cannot be used because
    # CUDA will have been initialized before the fork, so it cannot be used in
    # the child process).
    @unittest.skip
    @with_asyncio_loop
    async def test_failed_stream(self):
        # Build a kernel that traps immediately, so work enqueued on the
        # stream is guaranteed to fail.
        ctx = cuda.current_context()
        module = ctx.create_module_ptx("""
.version 6.5
.target sm_30
.address_size 64
.visible .entry failing_kernel() { trap; }
""")
        failing_kernel = module.get_function("failing_kernel")

        stream = cuda.stream()
        # Launch the trapping kernel on the stream; the async_done future
        # for the stream should then resolve with an error.
        failing_kernel.configure((1,), (1,), stream=stream).__call__()
        done = stream.async_done()
        with self.assertRaises(Exception):
            await done
        # The future records the failure as its exception.
        self.assertIsNotNone(done.exception())
|
||||
|
||||
|
||||
# Allow this test module to be run directly from the command line.
if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,8 @@
|
||||
from numba.cuda.testing import ensure_supported_ccs_initialized
|
||||
from numba.testing import load_testsuite
|
||||
import os
|
||||
|
||||
|
||||
def load_tests(loader, tests, pattern):
    """unittest ``load_tests`` protocol hook: collect every test in this
    directory after ensuring the supported compute capabilities have
    been initialized."""
    ensure_supported_ccs_initialized()
    here = os.path.dirname(__file__)
    return load_testsuite(loader, here)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user