This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,8 @@
from numba.cuda.testing import ensure_supported_ccs_initialized
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
    """unittest protocol hook: collect this package's test suite.

    Initializes the supported compute-capability state before discovery
    (see numba.cuda.testing), then loads all tests in this directory.
    """
    ensure_supported_ccs_initialized()
    return load_testsuite(loader, os.path.dirname(__file__))

View File

@@ -0,0 +1,234 @@
from numba import cuda
from numba.cuda.testing import CUDATestCase
import numpy as np
import sys
class UseCase:
    """
    Provide a way to call a kernel as if it were a function.

    This allows the CUDA cache tests to closely match the CPU cache tests,
    and also to support calling cache use cases as njitted functions. The
    class wraps a function that takes an array for the return value and
    arguments, and provides an interface that accepts arguments, launches
    the kernel appropriately, and returns the stored return value.

    The return type is inferred from the type of the first argument, unless
    it is explicitly overridden by the ``retty`` kwarg.

    Subclasses must provide ``_call(self, ret, *args)`` to actually invoke
    the wrapped function.
    """

    def __init__(self, func, retty=None):
        self._func = func
        self._retty = retty

    def __call__(self, *args):
        # Box every argument so the callee always sees arrays.
        boxed = [np.asarray(a) for a in args]
        # 0-d return slot: explicit dtype when given, else mirror arg 0.
        out = (np.ndarray((), dtype=self._retty) if self._retty
               else np.zeros_like(boxed[0]))
        self._call(out, *boxed)
        return out[()]

    @property
    def func(self):
        """The wrapped function, unmodified."""
        return self._func
class CUDAUseCase(UseCase):
    # Launches the wrapped kernel with a [1, 1] launch configuration
    # (one block, one thread) — use cases operate on 0-d arrays.
    def _call(self, ret, *args):
        self._func[1, 1](ret, *args)
# Cached / uncached variants of the same addition. Z is a module-level
# constant defined *below*; this is fine because the kernels are only
# compiled lazily at first launch, after the module has fully executed.
@cuda.jit(cache=True)
def add_usecase_kernel(r, x, y):
    r[()] = x[()] + y[()] + Z


@cuda.jit(cache=False)
def add_nocache_usecase_kernel(r, x, y):
    r[()] = x[()] + y[()] + Z


add_usecase = CUDAUseCase(add_usecase_kernel)
add_nocache_usecase = CUDAUseCase(add_nocache_usecase_kernel)

Z = 1
# Inner / outer cached / uncached cases
# A cached device function called from both a cached and an uncached kernel.
@cuda.jit(cache=True)
def inner(x, y):
    return x + y + Z


@cuda.jit(cache=True)
def outer_kernel(r, x, y):
    r[()] = inner(-y[()], x[()])


@cuda.jit(cache=False)
def outer_uncached_kernel(r, x, y):
    r[()] = inner(-y[()], x[()])


outer = CUDAUseCase(outer_kernel)
outer_uncached = CUDAUseCase(outer_uncached_kernel)
# Exercise returning a record instance. This used to hardcode the dtype
# pointer's value in the bitcode.
packed_record_type = np.dtype([('a', np.int8), ('b', np.float64)])
aligned_record_type = np.dtype([('a', np.int8), ('b', np.float64)], align=True)

# Two-element record arrays: packed_arr[i] == (i + 1, i + 42.5).
packed_arr = np.empty(2, dtype=packed_record_type)
for i in range(packed_arr.size):
    packed_arr[i]['a'] = i + 1
    packed_arr[i]['b'] = i + 42.5

# Same data, converted to the aligned layout.
aligned_arr = np.array(packed_arr, dtype=aligned_record_type)


@cuda.jit(cache=True)
def record_return(r, ary, i):
    # Copies one record out of the array into the 0-d return slot.
    r[()] = ary[i]


# Same kernel, wrapped with each record layout as the return type.
record_return_packed = CUDAUseCase(record_return, retty=packed_record_type)
record_return_aligned = CUDAUseCase(record_return, retty=aligned_record_type)
# Closure test cases: each use case captures a different value of x, so
# the cache must disambiguate otherwise-identical function code.
def make_closure(x):
    @cuda.jit(cache=True)
    def closure(r, y):
        r[()] = x + y[()]

    return CUDAUseCase(closure)


closure1 = make_closure(3)
closure2 = make_closure(5)
closure3 = make_closure(7)
closure4 = make_closure(9)
# Ambiguous / renamed functions
@cuda.jit(cache=True)
def ambiguous_function(r, x):
r[()] = x[()] + 2
renamed_function1 = CUDAUseCase(ambiguous_function)
@cuda.jit(cache=True)
def ambiguous_function(r, x):
r[()] = x[()] + 6
renamed_function2 = CUDAUseCase(ambiguous_function)
# Deliberately declares a large number of local arrays so the generated IR
# is very large — do not "simplify" this kernel; its verbosity is the point.
@cuda.jit(cache=True)
def many_locals():
    aa = cuda.local.array((1, 1), np.float64)
    ab = cuda.local.array((1, 1), np.float64)
    ac = cuda.local.array((1, 1), np.float64)
    ad = cuda.local.array((1, 1), np.float64)
    ae = cuda.local.array((1, 1), np.float64)
    af = cuda.local.array((1, 1), np.float64)
    ag = cuda.local.array((1, 1), np.float64)
    ah = cuda.local.array((1, 1), np.float64)
    ai = cuda.local.array((1, 1), np.float64)
    aj = cuda.local.array((1, 1), np.float64)
    ak = cuda.local.array((1, 1), np.float64)
    al = cuda.local.array((1, 1), np.float64)
    am = cuda.local.array((1, 1), np.float64)
    an = cuda.local.array((1, 1), np.float64)
    ao = cuda.local.array((1, 1), np.float64)
    ap = cuda.local.array((1, 1), np.float64)
    ar = cuda.local.array((1, 1), np.float64)
    at = cuda.local.array((1, 1), np.float64)
    au = cuda.local.array((1, 1), np.float64)
    av = cuda.local.array((1, 1), np.float64)
    aw = cuda.local.array((1, 1), np.float64)
    ax = cuda.local.array((1, 1), np.float64)
    ay = cuda.local.array((1, 1), np.float64)
    az = cuda.local.array((1, 1), np.float64)
    # Zero every array so none of the locals can be optimized away.
    aa[:] = 0
    ab[:] = 0
    ac[:] = 0
    ad[:] = 0
    ae[:] = 0
    af[:] = 0
    ag[:] = 0
    ah[:] = 0
    ai[:] = 0
    aj[:] = 0
    ak[:] = 0
    al[:] = 0
    am[:] = 0
    an[:] = 0
    ao[:] = 0
    ap[:] = 0
    ar[:] = 0
    at[:] = 0
    au[:] = 0
    av[:] = 0
    aw[:] = 0
    ax[:] = 0
    ay[:] = 0
    az[:] = 0
# Simple use case for multiprocessing test
@cuda.jit(cache=True)
def simple_usecase_kernel(r, x):
    r[()] = x[()]


simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)


# Usecase with cooperative groups
@cuda.jit(cache=True)
def cg_usecase_kernel(r, x):
    # Only the grid sync matters here; r and x are intentionally unused.
    grid = cuda.cg.this_grid()
    grid.sync()


cg_usecase = CUDAUseCase(cg_usecase_kernel)
class _TestModule(CUDATestCase):
    """
    Tests for functionality of this module's functions.
    Note this does not define any "test_*" method, instead check_module()
    should be called by hand.
    """

    def check_module(self, mod):
        self.assertPreciseEqual(mod.add_usecase(2, 3), 6)  # 2 + 3 + Z(=1)
        self.assertPreciseEqual(mod.outer_uncached(3, 2), 2)  # inner(-2, 3)
        self.assertPreciseEqual(mod.outer(3, 2), 2)

        packed_rec = mod.record_return_packed(mod.packed_arr, 1)
        self.assertPreciseEqual(tuple(packed_rec), (2, 43.5))
        aligned_rec = mod.record_return_aligned(mod.aligned_arr, 1)
        self.assertPreciseEqual(tuple(aligned_rec), (2, 43.5))

        mod.simple_usecase_caller(2)


def self_test():
    # Run the checks against this very module (used from a subprocess).
    mod = sys.modules[__name__]
    _TestModule().check_module(mod)

View File

@@ -0,0 +1,41 @@
import sys
from numba import cuda, njit
from numba.cuda.testing import CUDATestCase
from numba.cuda.tests.cudapy.cache_usecases import CUDAUseCase, UseCase
class CPUUseCase(UseCase):
    # CPU variant: call the njitted function directly, no launch config.
    def _call(self, ret, *args):
        self._func(ret, *args)


# Using the same function as a cached CPU and CUDA-jitted function
def target_shared_assign(r, x):
    r[()] = x[()]


# The same Python function compiled for both targets, each with caching.
assign_cuda_kernel = cuda.jit(cache=True)(target_shared_assign)
assign_cuda = CUDAUseCase(assign_cuda_kernel)
assign_cpu_jitted = njit(cache=True)(target_shared_assign)
assign_cpu = CPUUseCase(assign_cpu_jitted)
class _TestModule(CUDATestCase):
    """
    Tests for functionality of this module's functions.
    Note this does not define any "test_*" method, instead check_module()
    should be called by hand.
    """

    def check_module(self, mod):
        # Both targets round-trip int and float scalars unchanged.
        self.assertPreciseEqual(mod.assign_cpu(5), 5)
        self.assertPreciseEqual(mod.assign_cpu(5.5), 5.5)
        self.assertPreciseEqual(mod.assign_cuda(5), 5)
        self.assertPreciseEqual(mod.assign_cuda(5.5), 5.5)


def self_test():
    # Run the checks against this very module (used from a subprocess).
    mod = sys.modules[__name__]
    _TestModule().check_module(mod)

View File

@@ -0,0 +1,58 @@
from numba import types
from numba.core import config
class TestStruct:
    """Plain two-field record used to exercise CUDA extension typing."""

    def __init__(self, x, y):
        self.x, self.y = x, y
class TestStructModelType(types.Type):
    # Numba type corresponding to TestStruct instances.
    def __init__(self):
        super().__init__(name="TestStructModelType")
# Singleton instance used by the typing and lowering registrations below.
test_struct_model_type = TestStructModelType()

# The extension registrations below use real compiler machinery, which is
# unavailable (and unneeded) under the CUDA simulator.
if not config.ENABLE_CUDASIM:
    from numba import int32
    from numba.core.extending import (
        models,
        register_model,
        make_attribute_wrapper,
        typeof_impl,
        type_callable
    )
    from numba.cuda.cudaimpl import lower
    from numba.core import cgutils

    # typeof(TestStruct instance) -> TestStructModelType
    @typeof_impl.register(TestStruct)
    def typeof_teststruct(val, c):
        return test_struct_model_type

    # Data model: two int32 members mirroring TestStruct's fields.
    @register_model(TestStructModelType)
    class TestStructModel(models.StructModel):
        def __init__(self, dmm, fe_type):
            members = [("x", int32), ("y", int32)]
            super().__init__(dmm, fe_type, members)

    # Expose the members as attributes in jitted code.
    make_attribute_wrapper(TestStructModelType, 'x', 'x')
    make_attribute_wrapper(TestStructModelType, 'y', 'y')

    # Typing for the TestStruct(x, y) constructor call; only integer
    # arguments are accepted (typer returns None otherwise).
    @type_callable(TestStruct)
    def type_test_struct(context):
        def typer(x, y):
            if isinstance(x, types.Integer) and isinstance(y, types.Integer):
                return test_struct_model_type
        return typer

    # Lowering: build the struct value from the two integer arguments.
    @lower(TestStruct, types.Integer, types.Integer)
    def lower_test_type_ctor(context, builder, sig, args):
        obj = cgutils.create_struct_proxy(
            test_struct_model_type
        )(context, builder)
        obj.x = args[0]
        obj.y = args[1]
        return obj._getvalue()

View File

@@ -0,0 +1,100 @@
"""
Usecases of recursive functions in the CUDA target, many derived from
numba/tests/recursion_usecases.py.
Some functions are compiled at import time, hence a separate module.
"""
from numba import cuda
@cuda.jit("i8(i8)", device=True)
def fib1(n):
if n < 2:
return n
# Note the second call does not use a named argument, unlike the CPU target
# usecase
return fib1(n - 1) + fib1(n - 2)
def make_fib2():
    # Same as fib1, but defined inside a factory so the recursive
    # reference is a closure variable rather than a global.
    @cuda.jit("i8(i8)", device=True)
    def fib2(n):
        if n < 2:
            return n
        return fib2(n - 1) + fib2(n - 2)

    return fib2


fib2 = make_fib2()
# NOTE(review): this is a kernel (no device=True) that returns a value,
# and its recursion can change the inferred type — presumably this
# exercises an error/typing path rather than a runnable kernel; confirm.
@cuda.jit
def type_change_self(x, y):
    if x > 1 and y > 0:
        return x + type_change_self(x - y, y)
    else:
        return y
# Implicit signature
@cuda.jit(device=True)
def fib3(n):
    if n < 2:
        return n
    return fib3(n - 1) + fib3(n - 2)


# Run-away self recursion
@cuda.jit(device=True)
def runaway_self(x):
    # No base case: the recursion never terminates.
    return runaway_self(x)
@cuda.jit(device=True)
def raise_self(x):
    # Positive inputs count down recursively and raise on reaching 1;
    # non-positive inputs return 1 immediately.
    if x == 1:
        raise ValueError("raise_self")
    elif x > 0:
        return raise_self(x - 1)
    else:
        return 1


# debug=True, opt=False so the exception propagates with full information.
@cuda.jit(debug=True, opt=False)
def raise_self_kernel(x):
    raise_self(x)
def make_optional_return_case(jit=lambda x: x):
    """Build a recursive use case whose helper may return None.

    ``jit`` defaults to the identity so the pure-Python behaviour can be
    checked directly; pass a jit decorator to compile the pair.
    """
    @jit
    def foo(x):
        # None for small inputs, x - 1 otherwise.
        if x > 5:
            return x - 1
        return None

    @jit
    def bar(x):
        partial = foo(x)
        # Pass through both the None case and small results unchanged.
        if partial is None or partial < 8:
            return partial
        return x * bar(partial)

    return bar
def make_growing_tuple_case(jit=lambda x: x):
    """Build the issue #4387 use case: recursion yielding nested tuples.

    Each level of recursion grows the result tuple by one nesting level,
    e.g. ``make_list(2) == (2, (1, None))``.
    """
    # From issue #4387
    @jit
    def make_list(n):
        return None if n <= 0 else (n, make_list(n - 1))

    return make_list

View File

@@ -0,0 +1,42 @@
import numpy as np
from numba import from_dtype, cuda
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
import unittest
class TestAlignment(CUDATestCase):
    def test_record_alignment(self):
        """An aligned record array can be read and written by a kernel."""
        rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')], align=True)
        rec = from_dtype(rec_dtype)

        @cuda.jit((rec[:],))
        def foo(a):
            i = cuda.grid(1)
            a[i].a = a[i].b

        # Three records with a == 0 and distinct b values.
        a_recarray = np.recarray(3, dtype=rec_dtype)
        for i in range(a_recarray.size):
            a_rec = a_recarray[i]
            a_rec.a = 0
            a_rec.b = (i + 1) * 123

        foo[1, 3](a_recarray)
        # The kernel copied b into a for every record.
        self.assertTrue(np.all(a_recarray.a == a_recarray.b))

    @skip_on_cudasim('Simulator does not check alignment')
    def test_record_alignment_error(self):
        """Compiling with an unaligned record dtype raises an error."""
        rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')])
        rec = from_dtype(rec_dtype)
        with self.assertRaises(Exception) as raises:
            @cuda.jit((rec[:],))
            def foo(a):
                i = cuda.grid(1)
                a[i].a = a[i].b
        self.assertTrue('type float64 is not aligned' in str(raises.exception))


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,260 @@
import numpy as np
from numba.cuda.testing import unittest, CUDATestCase
from numba.cuda.testing import skip_on_cudasim, skip_unless_cudasim
from numba import config, cuda
# mapped_array_like is omitted when running under the simulator.
if config.ENABLE_CUDASIM:
    ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.pinned_array_like)
else:
    ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.mapped_array_like,
                            cuda.pinned_array_like)
class TestCudaArray(CUDATestCase):
    """Tests of device array allocation and the *_array_like functions."""

    def test_gpu_array_zero_length(self):
        """Zero-length arrays round-trip host -> device -> host."""
        x = np.arange(0)
        dx = cuda.to_device(x)
        hx = dx.copy_to_host()
        self.assertEqual(x.shape, dx.shape)
        self.assertEqual(x.size, dx.size)
        self.assertEqual(x.shape, hx.shape)
        self.assertEqual(x.size, hx.size)

    def test_null_shape(self):
        """0-d (shape ()) allocations keep their null shape."""
        null_shape = ()
        shape1 = cuda.device_array(()).shape
        shape2 = cuda.device_array_like(np.ndarray(())).shape
        self.assertEqual(shape1, null_shape)
        self.assertEqual(shape2, null_shape)

    def test_gpu_array_strided(self):
        """A misaligned strided view can still be passed to a kernel."""
        @cuda.jit('void(double[:])')
        def kernel(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = i

        x = np.arange(10, dtype=np.double)
        # Reinterpret as bytes, then carve a double array offset by 4
        # bytes — deliberately not 8-byte aligned.
        y = np.ndarray(shape=10 * 8, buffer=x, dtype=np.byte)
        z = np.ndarray(9, buffer=y[4:-4], dtype=np.double)
        kernel[10, 10](z)
        self.assertTrue(np.allclose(z, list(range(9))))

    def test_gpu_array_interleaved(self):
        """Non-contiguous (interleaved) arrays are rejected by auto_device."""
        @cuda.jit('void(double[:], double[:])')
        def copykernel(x, y):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = i
                y[i] = i

        x = np.arange(10, dtype=np.double)
        y = x[:-1:2]
        # z = x[1::2]
        # n = y.size
        try:
            cuda.devicearray.auto_device(y)
        except ValueError:
            pass
        else:
            raise AssertionError("Should raise exception complaining the "
                                 "contiguous-ness of the array.")
        # Should we handle this use case?
        # assert z.size == y.size
        # copykernel[1, n](y, x)
        # print(y, z)
        # assert np.all(y == z)
        # assert np.all(y == list(range(n)))

    def test_auto_device_const(self):
        """Scalar constants are promoted to 0-d device arrays."""
        d, _ = cuda.devicearray.auto_device(2)
        self.assertTrue(np.all(d.copy_to_host() == np.array(2)))

    def _test_array_like_same(self, like_func, array):
        """
        Tests of *_array_like where shape, strides, dtype, and flags should
        all be equal.
        """
        array_like = like_func(array)
        self.assertEqual(array.shape, array_like.shape)
        self.assertEqual(array.strides, array_like.strides)
        self.assertEqual(array.dtype, array_like.dtype)
        self.assertEqual(array.flags['C_CONTIGUOUS'],
                         array_like.flags['C_CONTIGUOUS'])
        self.assertEqual(array.flags['F_CONTIGUOUS'],
                         array_like.flags['F_CONTIGUOUS'])

    def test_array_like_1d(self):
        d_a = cuda.device_array(10, order='C')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_2d(self):
        d_a = cuda.device_array((10, 12), order='C')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_2d_transpose(self):
        d_a = cuda.device_array((10, 12), order='C')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_3d(self):
        d_a = cuda.device_array((10, 12, 14), order='C')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_1d_f(self):
        d_a = cuda.device_array(10, order='F')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_2d_f(self):
        d_a = cuda.device_array((10, 12), order='F')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_2d_f_transpose(self):
        d_a = cuda.device_array((10, 12), order='F')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_3d_f(self):
        d_a = cuda.device_array((10, 12, 14), order='F')
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def _test_array_like_view(self, like_func, view, d_view):
        """
        Tests of device_array_like where the original array is a view - the
        strides should not be equal because a contiguous array is expected.
        """
        nb_like = like_func(d_view)
        self.assertEqual(d_view.shape, nb_like.shape)
        self.assertEqual(d_view.dtype, nb_like.dtype)

        # Use NumPy as a reference for the expected strides
        np_like = np.zeros_like(view)
        self.assertEqual(nb_like.strides, np_like.strides)
        self.assertEqual(nb_like.flags['C_CONTIGUOUS'],
                         np_like.flags['C_CONTIGUOUS'])
        self.assertEqual(nb_like.flags['F_CONTIGUOUS'],
                         np_like.flags['F_CONTIGUOUS'])

    def test_array_like_1d_view(self):
        shape = 10
        view = np.zeros(shape)[::2]
        d_view = cuda.device_array(shape)[::2]
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_view(like_func, view, d_view)

    def test_array_like_1d_view_f(self):
        shape = 10
        view = np.zeros(shape, order='F')[::2]
        d_view = cuda.device_array(shape, order='F')[::2]
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_view(like_func, view, d_view)

    def test_array_like_2d_view(self):
        shape = (10, 12)
        view = np.zeros(shape)[::2, ::2]
        d_view = cuda.device_array(shape)[::2, ::2]
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_view(like_func, view, d_view)

    def test_array_like_2d_view_f(self):
        shape = (10, 12)
        view = np.zeros(shape, order='F')[::2, ::2]
        d_view = cuda.device_array(shape, order='F')[::2, ::2]
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_view(like_func, view, d_view)

    @skip_on_cudasim('Numba and NumPy stride semantics differ for transpose')
    def test_array_like_2d_view_transpose_device(self):
        shape = (10, 12)
        d_view = cuda.device_array(shape)[::2, ::2].T
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                # This is a special case (see issue #4974) because creating the
                # transpose creates a new contiguous allocation with different
                # strides. In this case, rather than comparing against NumPy,
                # we can only compare against expected values.
                like = like_func(d_view)
                self.assertEqual(d_view.shape, like.shape)
                self.assertEqual(d_view.dtype, like.dtype)
                self.assertEqual((40, 8), like.strides)
                self.assertTrue(like.flags['C_CONTIGUOUS'])
                self.assertFalse(like.flags['F_CONTIGUOUS'])

    @skip_unless_cudasim('Numba and NumPy stride semantics differ for '
                         'transpose')
    def test_array_like_2d_view_transpose_simulator(self):
        shape = (10, 12)
        view = np.zeros(shape)[::2, ::2].T
        d_view = cuda.device_array(shape)[::2, ::2].T
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                # On the simulator, the transpose has different strides to on a
                # CUDA device (See issue #4974). Here we can compare strides
                # against NumPy as a reference.
                np_like = np.zeros_like(view)
                nb_like = like_func(d_view)
                self.assertEqual(d_view.shape, nb_like.shape)
                self.assertEqual(d_view.dtype, nb_like.dtype)
                self.assertEqual(np_like.strides, nb_like.strides)
                self.assertEqual(np_like.flags['C_CONTIGUOUS'],
                                 nb_like.flags['C_CONTIGUOUS'])
                self.assertEqual(np_like.flags['F_CONTIGUOUS'],
                                 nb_like.flags['F_CONTIGUOUS'])

    def test_array_like_2d_view_f_transpose(self):
        shape = (10, 12)
        view = np.zeros(shape, order='F')[::2, ::2].T
        d_view = cuda.device_array(shape, order='F')[::2, ::2].T
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_view(like_func, view, d_view)

    @skip_on_cudasim('Kernel overloads not created in the simulator')
    def test_issue_4628(self):
        # CUDA Device arrays were reported as always being typed with 'A' order
        # so launching the kernel with a host array and then a device array
        # resulted in two overloads being compiled - one for 'C' order from
        # the host array, and one for 'A' order from the device array. With the
        # resolution of this issue, the order of the device array is also 'C',
        # so after the kernel launches there should only be one overload of
        # the function.
        @cuda.jit
        def func(A, out):
            i = cuda.grid(1)
            out[i] = A[i] * 2

        n = 128
        a = np.ones((n,))
        d_a = cuda.to_device(a)
        result = np.zeros((n,))

        func[1, 128](a, result)
        func[1, 128](d_a, result)
        self.assertEqual(1, len(func.overloads))


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,224 @@
import numpy as np
from collections import namedtuple
from numba import cuda
from numba.core.errors import TypingError
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
class TestCudaArrayArg(CUDATestCase):
    """Tests of passing arrays, tuples and namedtuples as kernel arguments."""

    def test_array_ary(self):
        """Arrays can be passed through to an inlined device function."""
        @cuda.jit('double(double[:],int64)', device=True, inline=True)
        def device_function(a, c):
            return a[c]

        @cuda.jit('void(double[:],double[:])')
        def kernel(x, y):
            i = cuda.grid(1)
            y[i] = device_function(x, i)

        x = np.arange(10, dtype=np.double)
        y = np.zeros_like(x)
        kernel[10, 1](x, y)
        self.assertTrue(np.all(x == y))

    def test_unituple(self):
        """A homogeneous tuple argument is unpacked element-wise."""
        @cuda.jit
        def f(r, x):
            r[0] = x[0]
            r[1] = x[1]
            r[2] = x[2]

        x = (1, 2, 3)
        r = np.zeros(len(x), dtype=np.int64)
        f[1, 1](r, x)

        for i in range(len(x)):
            self.assertEqual(r[i], x[i])

    def test_tuple(self):
        """A heterogeneous (int/float) tuple argument works."""
        @cuda.jit
        def f(r1, r2, x):
            r1[0] = x[0]
            r1[1] = x[1]
            r1[2] = x[2]
            r2[0] = x[3]
            r2[1] = x[4]
            r2[2] = x[5]

        x = (1, 2, 3, 4.5, 5.5, 6.5)
        r1 = np.zeros(len(x) // 2, dtype=np.int64)
        r2 = np.zeros(len(x) // 2, dtype=np.float64)
        f[1, 1](r1, r2, x)

        for i in range(len(r1)):
            self.assertEqual(r1[i], x[i])
        for i in range(len(r2)):
            self.assertEqual(r2[i], x[i + len(r1)])

    def test_namedunituple(self):
        """Homogeneous namedtuple fields are accessible by name."""
        @cuda.jit
        def f(r, x):
            r[0] = x.x
            r[1] = x.y

        Point = namedtuple('Point', ('x', 'y'))
        x = Point(1, 2)
        r = np.zeros(len(x), dtype=np.int64)
        f[1, 1](r, x)

        self.assertEqual(r[0], x.x)
        self.assertEqual(r[1], x.y)

    def test_namedtuple(self):
        """Heterogeneous namedtuple fields are accessible by name."""
        @cuda.jit
        def f(r1, r2, x):
            r1[0] = x.x
            r1[1] = x.y
            r2[0] = x.r

        Point = namedtuple('Point', ('x', 'y', 'r'))
        x = Point(1, 2, 2.236)
        r1 = np.zeros(2, dtype=np.int64)
        r2 = np.zeros(1, dtype=np.float64)
        f[1, 1](r1, r2, x)

        self.assertEqual(r1[0], x.x)
        self.assertEqual(r1[1], x.y)
        self.assertEqual(r2[0], x.r)

    def test_empty_tuple(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)

        x = tuple()
        r = np.ones(1, dtype=np.int64)
        f[1, 1](r, x)

        self.assertEqual(r[0], 0)

    def test_tuple_of_empty_tuples(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)
            r[1] = len(x[0])

        x = ((), (), ())
        r = np.ones(2, dtype=np.int64)
        f[1, 1](r, x)

        self.assertEqual(r[0], 3)
        self.assertEqual(r[1], 0)

    def test_tuple_of_tuples(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)
            r[1] = len(x[0])
            r[2] = len(x[1])
            r[3] = len(x[2])
            r[4] = x[1][0]
            r[5] = x[1][1]
            r[6] = x[2][0]
            r[7] = x[2][1]
            r[8] = x[2][2]

        x = ((), (5, 6), (8, 9, 10))
        r = np.ones(9, dtype=np.int64)
        f[1, 1](r, x)

        self.assertEqual(r[0], 3)
        self.assertEqual(r[1], 0)
        self.assertEqual(r[2], 2)
        self.assertEqual(r[3], 3)
        self.assertEqual(r[4], 5)
        self.assertEqual(r[5], 6)
        self.assertEqual(r[6], 8)
        self.assertEqual(r[7], 9)
        self.assertEqual(r[8], 10)

    def test_tuple_of_tuples_and_scalars(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)
            r[1] = len(x[0])
            r[2] = x[0][0]
            r[3] = x[0][1]
            r[4] = x[0][2]
            r[5] = x[1]

        x = ((6, 5, 4), 7)
        # NOTE(review): only r[0:6] are written/checked; size 9 appears to
        # be a copy-paste from test_tuple_of_tuples.
        r = np.ones(9, dtype=np.int64)
        f[1, 1](r, x)

        self.assertEqual(r[0], 2)
        self.assertEqual(r[1], 3)
        self.assertEqual(r[2], 6)
        self.assertEqual(r[3], 5)
        self.assertEqual(r[4], 4)
        self.assertEqual(r[5], 7)

    def test_tuple_of_arrays(self):
        """Arrays inside a tuple argument remain writable."""
        @cuda.jit
        def f(x):
            i = cuda.grid(1)
            if i < len(x[0]):
                x[0][i] = x[1][i] + x[2][i]

        N = 10
        x0 = np.zeros(N)
        x1 = np.ones_like(x0)
        x2 = x1 * 3
        x = (x0, x1, x2)
        f[1, N](x)

        np.testing.assert_equal(x0, x1 + x2)

    def test_tuple_of_array_scalar_tuple(self):
        """A mixed (array, scalar, tuple) argument is unpacked correctly."""
        @cuda.jit
        def f(r, x):
            r[0] = x[0][0]
            r[1] = x[0][1]
            r[2] = x[1]
            r[3] = x[2][0]
            r[4] = x[2][1]

        z = np.arange(2, dtype=np.int64)
        x = (2 * z, 10, (4, 3))
        r = np.zeros(5, dtype=np.int64)
        f[1, 1](r, x)

        self.assertEqual(r[0], 0)
        self.assertEqual(r[1], 2)
        self.assertEqual(r[2], 10)
        self.assertEqual(r[3], 4)
        self.assertEqual(r[4], 3)
class TestDatetimeIssues(CUDATestCase):
    # See also numba.tests.test_npdatetime.TestDatetimeIssues.

    @skip_on_cudasim("Typing not used on cudasim")
    def test_10y_issue_9585(self):
        """A datetime64 dtype with a multiplied unit is rejected in typing."""
        @cuda.jit
        def f(x):
            return x + 1

        arr = np.array('2010', dtype='datetime64[10Y]')
        with self.assertRaises(TypingError) as e:
            f[1, 1](arr)

        # Note that the CUDA target doesn't report which argument caused the
        # exception, so we can't check for it here as we do with the CPU
        # target.
        message = e.exception.args[0]
        unsupported_type = "Unsupported array dtype: datetime64[10Y]"
        self.assertIn(unsupported_type, message)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,35 @@
import numpy as np
from numba import cuda
from numba.cuda.testing import CUDATestCase
import unittest
def reinterpret_array_type(byte_arr, start, stop, output):
    """Reinterpret ``byte_arr[start:stop]`` as int32, writing to output[0].

    Tested with just one thread.
    """
    window = byte_arr[start:stop]
    output[0] = window.view(np.int32)[0]
class TestCudaArrayMethods(CUDATestCase):
    def test_reinterpret_array_type(self):
        """
        Reinterpret byte array as int32 in the GPU.
        """
        pyfunc = reinterpret_array_type
        kernel = cuda.jit(pyfunc)

        byte_arr = np.arange(256, dtype=np.uint8)
        itemsize = np.dtype(np.int32).itemsize
        for start in range(0, 256, itemsize):
            stop = start + itemsize
            # NumPy on the host is the reference for the device result.
            expect = byte_arr[start:stop].view(np.int32)[0]

            output = np.zeros(1, dtype=np.int32)
            kernel[1, 1](byte_arr, start, stop, output)

            got = output[0]
            self.assertEqual(expect, got)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,120 @@
import numpy as np
import math
from numba import cuda, double, void
from numba.cuda.testing import unittest, CUDATestCase
RISKFREE = 0.02
VOLATILITY = 0.30

# Coefficients of the polynomial part of the cumulative-normal
# approximation used by cnd() below.
A1 = 0.31938153
A2 = -0.356563782
A3 = 1.781477937
A4 = -1.821255978
A5 = 1.330274429
RSQRT2PI = 0.39894228040143267793994605993438


def cnd(d):
    """Vectorised cumulative normal distribution approximation of *d*."""
    k = 1.0 / (1.0 + 0.2316419 * np.abs(d))
    pdf = RSQRT2PI * np.exp(-0.5 * d * d)
    poly = k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5))))
    tail = pdf * poly
    # The polynomial approximates the lower tail; mirror for positive d.
    return np.where(d > 0, 1.0 - tail, tail)


def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears,
                  Riskfree, Volatility):
    """Price European call/put options, writing into the result arrays."""
    root_t = np.sqrt(optionYears)
    d1 = ((np.log(stockPrice / optionStrike)
           + (Riskfree + 0.5 * Volatility * Volatility) * optionYears)
          / (Volatility * root_t))
    d2 = d1 - Volatility * root_t
    n_d1 = cnd(d1)
    n_d2 = cnd(d2)
    discount = np.exp(- Riskfree * optionYears)
    callResult[:] = (stockPrice * n_d1 - optionStrike * discount * n_d2)
    putResult[:] = (optionStrike * discount * (1.0 - n_d2)
                    - stockPrice * (1.0 - n_d1))


def randfloat(rand_var, low, high):
    """Affinely map uniform samples in [0, 1] onto [low, high]."""
    return low * (1.0 - rand_var) + high * rand_var
class TestBlackScholes(CUDATestCase):
    def test_blackscholes(self):
        """CUDA Black-Scholes agrees with the NumPy reference pricing."""
        OPT_N = 400
        iterations = 2

        # Random but realistic option parameters.
        stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
        optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
        optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)

        callResultNumpy = np.zeros(OPT_N)
        putResultNumpy = -np.ones(OPT_N)
        callResultNumba = np.zeros(OPT_N)
        putResultNumba = -np.ones(OPT_N)

        # numpy
        for i in range(iterations):
            black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                          optionStrike, optionYears, RISKFREE, VOLATILITY)

        # Device-side equivalent of cnd(), scalar form.
        @cuda.jit(double(double), device=True, inline=True)
        def cnd_cuda(d):
            K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
            ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) *
                       (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
            if d > 0:
                ret_val = 1.0 - ret_val
            return ret_val

        @cuda.jit(void(double[:], double[:], double[:], double[:], double[:],
                       double, double))
        def black_scholes_cuda(callResult, putResult, S, X, T, R, V):
            i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
            if i >= S.shape[0]:
                return
            sqrtT = math.sqrt(T[i])
            d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i])
                  / (V * sqrtT))
            d2 = d1 - V * sqrtT
            cndd1 = cnd_cuda(d1)
            cndd2 = cnd_cuda(d2)
            expRT = math.exp((-1. * R) * T[i])
            callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
            putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1))

        # numba
        blockdim = 512, 1
        griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
        stream = cuda.stream()
        d_callResult = cuda.to_device(callResultNumba, stream)
        d_putResult = cuda.to_device(putResultNumba, stream)
        d_stockPrice = cuda.to_device(stockPrice, stream)
        d_optionStrike = cuda.to_device(optionStrike, stream)
        d_optionYears = cuda.to_device(optionYears, stream)
        for i in range(iterations):
            black_scholes_cuda[griddim, blockdim, stream](
                d_callResult, d_putResult, d_stockPrice, d_optionStrike,
                d_optionYears, RISKFREE, VOLATILITY)
            d_callResult.copy_to_host(callResultNumba, stream)
            d_putResult.copy_to_host(putResultNumba, stream)
            stream.synchronize()

        # Relative L1 norm and max absolute error against the reference.
        delta = np.abs(callResultNumpy - callResultNumba)
        L1norm = delta.sum() / np.abs(callResultNumpy).sum()
        max_abs_err = delta.max()
        self.assertTrue(L1norm < 1e-13)
        self.assertTrue(max_abs_err < 1e-13)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,24 @@
import numpy as np
from numba.cuda.testing import unittest, CUDATestCase
from numba import cuda
def boolean_func(A, vertical):
    """Write 123 to ``A[0]`` if *vertical* is truthy, else 321.

    Fixes the misspelt parameter name ``vertial`` -> ``vertical``. The
    function is only ever invoked positionally (as a compiled kernel),
    so the rename is backward-compatible for callers.
    """
    if vertical:
        A[0] = 123
    else:
        A[0] = 321
class TestCudaBoolean(CUDATestCase):
    def test_boolean(self):
        """A bool_ kernel argument selects the branch taken on device."""
        func = cuda.jit('void(float64[:], bool_)')(boolean_func)
        A = np.array([0], dtype='float64')
        func[1, 1](A, True)
        self.assertTrue(A[0] == 123)
        func[1, 1](A, False)
        self.assertTrue(A[0] == 321)


if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,545 @@
import multiprocessing
import os
import shutil
import subprocess
import sys
import unittest
import warnings
from numba import cuda
from numba.core.errors import NumbaWarning
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
skip_unless_cc_60, skip_if_cudadevrt_missing,
skip_if_mvc_enabled, test_data_dir)
from numba.tests.support import SerialMixin
from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
skip_bad_access)
@skip_on_cudasim('Simulator does not implement caching')
class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
here = os.path.dirname(__file__)
usecases_file = os.path.join(here, "cache_usecases.py")
modname = "cuda_caching_test_fodder"
def setUp(self):
DispatcherCacheUsecasesTest.setUp(self)
CUDATestCase.setUp(self)
def tearDown(self):
CUDATestCase.tearDown(self)
DispatcherCacheUsecasesTest.tearDown(self)
def test_caching(self):
self.check_pycache(0)
mod = self.import_module()
self.check_pycache(0)
f = mod.add_usecase
self.assertPreciseEqual(f(2, 3), 6)
self.check_pycache(2) # 1 index, 1 data
self.assertPreciseEqual(f(2.5, 3), 6.5)
self.check_pycache(3) # 1 index, 2 data
self.check_hits(f.func, 0, 2)
f = mod.record_return_aligned
rec = f(mod.aligned_arr, 1)
self.assertPreciseEqual(tuple(rec), (2, 43.5))
f = mod.record_return_packed
rec = f(mod.packed_arr, 1)
self.assertPreciseEqual(tuple(rec), (2, 43.5))
self.check_pycache(6) # 2 index, 4 data
self.check_hits(f.func, 0, 2)
# Check the code runs ok from another process
self.run_in_separate_process()
def test_no_caching(self):
mod = self.import_module()
f = mod.add_nocache_usecase
self.assertPreciseEqual(f(2, 3), 6)
self.check_pycache(0)
def test_many_locals(self):
# Declaring many local arrays creates a very large LLVM IR, which
# cannot be pickled due to the level of recursion it requires to
# pickle. This test ensures that kernels with many locals (and
# therefore large IR) can be cached. See Issue #8373:
# https://github.com/numba/numba/issues/8373
self.check_pycache(0)
mod = self.import_module()
f = mod.many_locals
f[1, 1]()
self.check_pycache(2) # 1 index, 1 data
def test_closure(self):
mod = self.import_module()
with warnings.catch_warnings():
warnings.simplefilter('error', NumbaWarning)
f = mod.closure1
self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6
f = mod.closure2
self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8
f = mod.closure3
self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10
f = mod.closure4
self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12
self.check_pycache(5) # 1 nbi, 4 nbc
def test_cache_reuse(self):
mod = self.import_module()
mod.add_usecase(2, 3)
mod.add_usecase(2.5, 3.5)
mod.outer_uncached(2, 3)
mod.outer(2, 3)
mod.record_return_packed(mod.packed_arr, 0)
mod.record_return_aligned(mod.aligned_arr, 1)
mod.simple_usecase_caller(2)
mtimes = self.get_cache_mtimes()
# Two signatures compiled
self.check_hits(mod.add_usecase.func, 0, 2)
mod2 = self.import_module()
self.assertIsNot(mod, mod2)
f = mod2.add_usecase
f(2, 3)
self.check_hits(f.func, 1, 0)
f(2.5, 3.5)
self.check_hits(f.func, 2, 0)
# The files haven't changed
self.assertEqual(self.get_cache_mtimes(), mtimes)
self.run_in_separate_process()
self.assertEqual(self.get_cache_mtimes(), mtimes)
def test_cache_invalidate(self):
mod = self.import_module()
f = mod.add_usecase
self.assertPreciseEqual(f(2, 3), 6)
# This should change the functions' results
with open(self.modfile, "a") as f:
f.write("\nZ = 10\n")
mod = self.import_module()
f = mod.add_usecase
self.assertPreciseEqual(f(2, 3), 15)
def test_recompile(self):
# Explicit call to recompile() should overwrite the cache
mod = self.import_module()
f = mod.add_usecase
self.assertPreciseEqual(f(2, 3), 6)
mod = self.import_module()
f = mod.add_usecase
mod.Z = 10
self.assertPreciseEqual(f(2, 3), 6)
f.func.recompile()
self.assertPreciseEqual(f(2, 3), 15)
# Freshly recompiled version is re-used from other imports
mod = self.import_module()
f = mod.add_usecase
self.assertPreciseEqual(f(2, 3), 15)
def test_same_names(self):
# Function with the same names should still disambiguate
mod = self.import_module()
f = mod.renamed_function1
self.assertPreciseEqual(f(2), 4)
f = mod.renamed_function2
self.assertPreciseEqual(f(2), 8)
@skip_unless_cc_60
@skip_if_cudadevrt_missing
@skip_if_mvc_enabled('CG not supported with MVC')
def test_cache_cg(self):
# Functions using cooperative groups should be cacheable. See Issue
# #8888: https://github.com/numba/numba/issues/8888
self.check_pycache(0)
mod = self.import_module()
self.check_pycache(0)
mod.cg_usecase(0)
self.check_pycache(2) # 1 index, 1 data
# Check the code runs ok from another process
self.run_in_separate_process()
@skip_unless_cc_60
@skip_if_cudadevrt_missing
@skip_if_mvc_enabled('CG not supported with MVC')
def test_cache_cg_clean_run(self):
# See Issue #9432: https://github.com/numba/numba/issues/9432
# If a cached function using CG sync was the first thing to compile,
# the compile would fail.
self.check_pycache(0)
# This logic is modelled on run_in_separate_process(), but executes the
# CG usecase directly in the subprocess.
code = """if 1:
import sys
sys.path.insert(0, %(tempdir)r)
mod = __import__(%(modname)r)
mod.cg_usecase(0)
""" % dict(tempdir=self.tempdir, modname=self.modname)
popen = subprocess.Popen([sys.executable, "-c", code],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out, err = popen.communicate(timeout=60)
if popen.returncode != 0:
raise AssertionError(
"process failed with code %s: \n"
"stdout follows\n%s\n"
"stderr follows\n%s\n"
% (popen.returncode, out.decode(), err.decode()),
)
    def _test_pycache_fallback(self):
        """
        With a disabled __pycache__, test there is a working fallback
        (e.g. on the user-wide cache dir)
        """
        mod = self.import_module()
        f = mod.add_usecase
        # Remove this function's cache files at the end, to avoid accumulation
        # across test calls.
        self.addCleanup(shutil.rmtree, f.func.stats.cache_path,
                        ignore_errors=True)

        self.assertPreciseEqual(f(2, 3), 6)
        # It's a cache miss since the file was copied to a new temp location
        self.check_hits(f.func, 0, 1)

        # Test re-use: a second import must load from the fallback location.
        mod2 = self.import_module()
        f = mod2.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_hits(f.func, 1, 0)

        # The __pycache__ is empty (otherwise the test's preconditions
        # wouldn't be met)
        self.check_pycache(0)
@skip_bad_access
@unittest.skipIf(os.name == "nt",
"cannot easily make a directory read-only on Windows")
def test_non_creatable_pycache(self):
# Make it impossible to create the __pycache__ directory
old_perms = os.stat(self.tempdir).st_mode
os.chmod(self.tempdir, 0o500)
self.addCleanup(os.chmod, self.tempdir, old_perms)
self._test_pycache_fallback()
@skip_bad_access
@unittest.skipIf(os.name == "nt",
"cannot easily make a directory read-only on Windows")
def test_non_writable_pycache(self):
# Make it impossible to write to the __pycache__ directory
pycache = os.path.join(self.tempdir, '__pycache__')
os.mkdir(pycache)
old_perms = os.stat(pycache).st_mode
os.chmod(pycache, 0o500)
self.addCleanup(os.chmod, pycache, old_perms)
self._test_pycache_fallback()
def test_cannot_cache_linking_libraries(self):
link = str(test_data_dir / 'jitlink.ptx')
msg = 'Cannot pickle CUDACodeLibrary with linking files'
with self.assertRaisesRegex(RuntimeError, msg):
@cuda.jit('void()', cache=True, link=[link])
def f():
pass
@skip_on_cudasim('Simulator does not implement caching')
class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    """Tests that a function jitted for both the CPU and CUDA targets keeps
    a separate cache entry per target, and that both caches are reused."""
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_with_cpu_usecases.py")
    modname = "cuda_and_cpu_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cpu_and_cuda_targets(self):
        # The same function jitted for CPU and CUDA targets should maintain
        # separate caches for each target.
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        f_cpu = mod.assign_cpu
        f_cuda = mod.assign_cuda
        # Each call with a new (target, signature) pair adds one data file.
        self.assertPreciseEqual(f_cpu(5), 5)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f_cuda(5), 5)
        self.check_pycache(3)  # 1 index, 2 data

        self.check_hits(f_cpu.func, 0, 1)
        self.check_hits(f_cuda.func, 0, 1)

        self.assertPreciseEqual(f_cpu(5.5), 5.5)
        self.check_pycache(4)  # 1 index, 3 data
        self.assertPreciseEqual(f_cuda(5.5), 5.5)
        self.check_pycache(5)  # 1 index, 4 data

        self.check_hits(f_cpu.func, 0, 2)
        self.check_hits(f_cuda.func, 0, 2)

    def test_cpu_and_cuda_reuse(self):
        # Existing cache files for the CPU and CUDA targets are reused.
        mod = self.import_module()
        mod.assign_cpu(5)
        mod.assign_cpu(5.5)
        mod.assign_cuda(5)
        mod.assign_cuda(5.5)

        # Snapshot modification times to prove cache files are not rewritten.
        mtimes = self.get_cache_mtimes()

        # Two signatures compiled
        self.check_hits(mod.assign_cpu.func, 0, 2)
        self.check_hits(mod.assign_cuda.func, 0, 2)

        # A fresh import must hit the caches for both targets.
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f_cpu = mod2.assign_cpu
        f_cuda = mod2.assign_cuda

        f_cpu(2)
        self.check_hits(f_cpu.func, 1, 0)
        f_cpu(2.5)
        self.check_hits(f_cpu.func, 2, 0)

        f_cuda(2)
        self.check_hits(f_cuda.func, 1, 0)
        f_cuda(2.5)
        self.check_hits(f_cuda.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)
def get_different_cc_gpus():
    """Return a tuple of two GPUs with distinct Compute Capabilities, or
    None if no such pair exists on this system."""
    reference_gpu = cuda.gpus[0]
    with reference_gpu:
        reference_cc = cuda.current_context().device.compute_capability

    # Scan the remaining GPUs for the first one whose CC differs.
    for candidate in cuda.gpus[1:]:
        with candidate:
            candidate_cc = cuda.current_context().device.compute_capability
        if candidate_cc != reference_cc:
            return (reference_gpu, candidate)

    return None
@skip_on_cudasim('Simulator does not implement caching')
class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
    """Tests of caching when more than one GPU Compute Capability is
    available; requires two GPUs with different CCs to run."""
    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_multi_cc_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cache(self):
        # Check cache behaviour across GPUs with different CCs; skips unless
        # two distinct CCs are present.
        gpus = get_different_cc_gpus()
        if not gpus:
            self.skipTest('Need two different CCs for multi-CC cache test')

        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        # Step 1. Populate the cache with the first GPU
        with gpus[0]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(2)  # 1 index, 1 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(3)  # 1 index, 2 data
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # 2 index, 4 data
            self.check_hits(f.func, 0, 2)

        # Step 2. Run with the second GPU - under present behaviour this
        # doesn't further populate the cache.
        with gpus[1]:
            f = mod.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(6)  # cache unchanged
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

            f = mod.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(6)  # cache unchanged
            self.check_hits(f.func, 0, 2)

        # Step 3. Run in a separate module with the second GPU - this populates
        # the cache for the second CC.
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)

        with gpus[1]:
            f = mod2.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.check_pycache(7)  # 2 index, 5 data
            self.assertPreciseEqual(f(2.5, 3), 6.5)
            self.check_pycache(8)  # 2 index, 6 data
            self.check_hits(f.func, 0, 2)

            f = mod2.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod2.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
            self.check_pycache(10)  # 2 index, 8 data
            self.check_hits(f.func, 0, 2)

        # The following steps check that we can use the NVVM IR loaded from the
        # cache to generate PTX for a different compute capability to the
        # cached cubin's CC. To check this, we create another module that loads
        # the cached version containing a cubin for GPU 1. There will be no
        # cubin for GPU 0, so when we try to use it the PTX must be generated.
        mod3 = self.import_module()
        self.assertIsNot(mod, mod3)

        # Step 4. Run with GPU 1 and get a cache hit, loading the cache created
        # during Step 3.
        with gpus[1]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

        # Step 5. Run with GPU 0 using the module from Step 4, to force PTX
        # generation from cached NVVM IR.
        with gpus[0]:
            f = mod3.add_usecase
            self.assertPreciseEqual(f(2, 3), 6)
            self.assertPreciseEqual(f(2.5, 3), 6.5)

            f = mod3.record_return_aligned
            rec = f(mod.aligned_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))

            f = mod3.record_return_packed
            rec = f(mod.packed_arr, 1)
            self.assertPreciseEqual(tuple(rec), (2, 43.5))
def child_initializer():
    """Initializer run in each worker of a multiprocessing pool.

    Silences occupancy and implicit-copy warnings so child-process output
    stays clean.
    """
    from numba.core import config
    config.CUDA_WARN_ON_IMPLICIT_COPY = 0
    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
@skip_on_cudasim('Simulator does not implement caching')
class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):
    """Tests that the cache is safe to use from several processes at once."""

    # Nested multiprocessing.Pool raises AssertionError:
    # "daemonic processes are not allowed to have children"
    _numba_parallel_test_ = False

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_mp_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_multiprocessing(self):
        # Check caching works from multiple processes at once (#2028)
        mod = self.import_module()
        # Calling a pure Python caller of the JIT-compiled function is
        # necessary to reproduce the issue.
        f = mod.simple_usecase_caller
        n = 3
        # multiprocessing.get_context() exists on all supported Python
        # versions (3.4+), so the historical AttributeError fallback to the
        # bare module is no longer needed.
        ctx = multiprocessing.get_context('spawn')
        pool = ctx.Pool(n, child_initializer)
        try:
            res = sum(pool.imap(f, range(n)))
        finally:
            pool.close()
            # Wait for workers to exit so no child processes are leaked.
            pool.join()
        self.assertEqual(res, n * (n - 1) // 2)
@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
class TestCUDACodeLibrary(CUDATestCase):
    # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
    # explicitly check

    def test_cannot_serialize_unfinalized(self):
        """An unfinalized CUDACodeLibrary must refuse to be pickled."""
        # The CUDA codegen fails to import under the simulator, so we cannot
        # import it at the top level
        from numba.cuda.codegen import CUDACodeLibrary
        # Usually a CodeLibrary requires a real CodeGen, but since we don't
        # interact with it, anything will do
        codegen = object()
        name = 'library'
        cl = CUDACodeLibrary(codegen, name)
        with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
            cl._reduce_states()

View File

@@ -0,0 +1,257 @@
import numpy as np
from numba.cuda import compile_ptx
from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8
from numba import cuda
from numba.core import types
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
skip_unless_cc_53)
from numba.types import float16, float32
import itertools
import unittest
def native_cast(x):
    """Convert *x* with the builtin float() cast."""
    converted = float(x)
    return converted
def to_int8(x):
    """Cast *x* to a signed 8-bit integer."""
    result = np.int8(x)
    return result
def to_int16(x):
    """Cast *x* to a signed 16-bit integer."""
    result = np.int16(x)
    return result
def to_int32(x):
    """Cast *x* to a signed 32-bit integer."""
    result = np.int32(x)
    return result
def to_int64(x):
    """Cast *x* to a signed 64-bit integer."""
    result = np.int64(x)
    return result
def to_uint8(x):
    """Cast *x* to an unsigned 8-bit integer."""
    result = np.uint8(x)
    return result
def to_uint16(x):
    """Cast *x* to an unsigned 16-bit integer."""
    result = np.uint16(x)
    return result
def to_uint32(x):
    # Uses the Numba type as a cast, unlike the NumPy constructors in
    # to_uint8/to_uint16 above. NOTE(review): this form is presumably only
    # usable in compiled code (it is exercised solely through compile_ptx in
    # test_float16_to_uint_ptx, never called on the host) -- confirm before
    # unifying with np.uint32.
    return types.uint32(x)
def to_uint64(x):
    # Uses the Numba type as a cast; see the note on to_uint32 -- presumably
    # only usable in compiled code (exercised via compile_ptx only).
    return types.uint64(x)
def to_float16(x):
    # When division and operators on float16 types are supported, this should
    # be changed to match the implementation in to_float32.
    # Multiplying by 0.5 stands in for the division by 2 used in the other
    # to_float* helpers.
    return (np.float16(x) * np.float16(0.5))
def to_float32(x):
    """Cast *x* to float32 and halve it (exercises float32 arithmetic)."""
    halved = np.float32(x) / np.float32(2)
    return halved
def to_float64(x):
    """Cast *x* to float64 and halve it (exercises float64 arithmetic)."""
    halved = np.float64(x) / np.float64(2)
    return halved
def to_complex64(x):
    """Cast *x* to a 64-bit (single-precision) complex number."""
    result = np.complex64(x)
    return result
def to_complex128(x):
    """Cast *x* to a 128-bit (double-precision) complex number."""
    result = np.complex128(x)
    return result
# Since multiplication of float16 is not supported via the operator * on
# float16s yet, and the host does not implement cuda.fp16.*, we need two
# versions of the following functions:
#
# - The device version uses cuda.fp16.hmul
# - The host version uses the * operator
def cuda_int_literal_to_float16(x):
    """Device version: multiply a float16 by an int literal via hmul."""
    # Note that we need to use `2` and not `np.float16(2)` to ensure that this
    # types as a literal int and not a const float16.
    return cuda.fp16.hmul(np.float16(x), 2)
def reference_int_literal_to_float16(x):
    """Host-side reference: float16 multiplication using the * operator."""
    doubled = np.float16(x) * np.float16(2)
    return doubled
def cuda_float_literal_to_float16(x):
    """Device version: multiply a float16 by a float literal via hmul."""
    # Note that `2.5` types as a const float64 and not a literal float, but
    # this case is provided in case that changes in future.
    return cuda.fp16.hmul(np.float16(x), 2.5)
def reference_float_literal_to_float16(x):
    """Host-side reference: float16 multiplication using the * operator."""
    scaled = np.float16(x) * np.float16(2.5)
    return scaled
class TestCasting(CUDATestCase):
    """Tests of scalar casts on the CUDA target, comparing device results
    against host references and checking the PTX cast instructions emitted."""

    def _create_wrapped(self, pyfunc, intype, outtype):
        """Wrap *pyfunc* as a device function callable from the host.

        The returned callable takes a scalar, stores it in a 1-element array
        of dtype *intype*, launches a kernel that applies *pyfunc*, and
        returns the scalar result read from an array of dtype *outtype*.
        """
        wrapped_func = cuda.jit(device=True)(pyfunc)

        @cuda.jit
        def cuda_wrapper_fn(arg, res):
            res[0] = wrapped_func(arg[0])

        def wrapper_fn(arg):
            argarray = np.zeros(1, dtype=intype)
            argarray[0] = arg
            resarray = np.zeros(1, dtype=outtype)
            cuda_wrapper_fn[1, 1](argarray, resarray)
            return resarray[0]

        return wrapper_fn

    @skip_unless_cc_53
    def test_float_to_int(self):
        """Device float -> signed int casts match the host conversions."""
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        totys = (np.int8, np.int16, np.int32, np.int64)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    self.assertEqual(cfunc(12.3), pyfunc(12.3))
                    self.assertEqual(cfunc(12.3), int(12.3))
                    self.assertEqual(cfunc(-12.3), pyfunc(-12.3))
                    self.assertEqual(cfunc(-12.3), int(-12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_int_ptx(self):
        """float16 -> signed int casts emit cvt.rni.sN.f16 in PTX."""
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.rni.s{size}.f16", ptx)

    @skip_unless_cc_53
    def test_float_to_uint(self):
        """Device float -> unsigned int casts produce the expected values.

        NOTE(review): the signed converters (to_int*) are used here with
        unsigned output dtypes -- presumably because the to_uint32/to_uint64
        variants use Numba type casts and are not host-callable as the
        ``pyfunc(12.3)`` reference requires; confirm before "fixing" this.
        """
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        totys = (np.uint8, np.uint16, np.uint32, np.uint64)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    self.assertEqual(cfunc(12.3), pyfunc(12.3))
                    self.assertEqual(cfunc(12.3), int(12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_uint_ptx(self):
        """float16 -> unsigned int casts emit cvt.rni.uN.f16 in PTX."""
        pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.rni.u{size}.f16", ptx)

    @skip_unless_cc_53
    def test_int_to_float(self):
        """Device int -> float casts match the host conversions."""
        pyfuncs = (to_float16, to_float32, to_float64)
        totys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            with self.subTest(toty=toty):
                cfunc = self._create_wrapped(pyfunc, np.int64, toty)
                self.assertEqual(cfunc(321), pyfunc(321))

    @skip_unless_cc_53
    def test_literal_to_float16(self):
        """Literal operands multiplied with float16 match host references."""
        cudafuncs = (cuda_int_literal_to_float16,
                     cuda_float_literal_to_float16)
        hostfuncs = (reference_int_literal_to_float16,
                     reference_float_literal_to_float16)

        for cudafunc, hostfunc in zip(cudafuncs, hostfuncs):
            with self.subTest(func=cudafunc):
                cfunc = self._create_wrapped(cudafunc, np.float16, np.float16)
                self.assertEqual(cfunc(321), hostfunc(321))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_int_to_float16_ptx(self):
        """Signed int -> float16 casts emit cvt.rn.f16.sN in PTX."""
        fromtys = (i1, i2, i4, i8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, (ty,), device=True)
            self.assertIn(f"cvt.rn.f16.s{size}", ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_uint_to_float16_ptx(self):
        """Unsigned int -> float16 casts emit cvt.rn.f16.uN in PTX."""
        fromtys = (u1, u2, u4, u8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, (ty,), device=True)
            self.assertIn(f"cvt.rn.f16.u{size}", ptx)

    @skip_unless_cc_53
    def test_float_to_float(self):
        """Device float -> float casts across all width combinations."""
        pyfuncs = (to_float16, to_float32, to_float64)
        tys = (np.float16, np.float32, np.float64)

        for (pyfunc, fromty), toty in itertools.product(zip(pyfuncs, tys),
                                                        tys):
            with self.subTest(fromty=fromty, toty=toty):
                cfunc = self._create_wrapped(pyfunc, fromty, toty)
                # For this test we cannot use the pyfunc for comparison because
                # the CUDA target doesn't yet implement division (or operators)
                # for float16 values, so we test by comparing with the computed
                # expression instead.
                np.testing.assert_allclose(cfunc(12.3),
                                           toty(12.3) / toty(2), rtol=0.0003)
                np.testing.assert_allclose(cfunc(-12.3),
                                           toty(-12.3) / toty(2), rtol=0.0003)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_float_ptx(self):
        """float16 widening casts emit cvt.f32.f16 / cvt.f64.f16 in PTX."""
        pyfuncs = (to_float32, to_float64)
        postfixes = ("f32", "f64")

        for pyfunc, postfix in zip(pyfuncs, postfixes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.{postfix}.f16", ptx)

    @skip_unless_cc_53
    def test_float_to_complex(self):
        """Device float -> complex casts match the host conversions."""
        pyfuncs = (to_complex64, to_complex128)
        totys = (np.complex64, np.complex128)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    # Here we need to explicitly cast the input to the pyfunc
                    # to match the casting that is automatically applied when
                    # passing the input to the cfunc as part of wrapping it in
                    # an array of type fromtype.
                    np.testing.assert_allclose(cfunc(3.21),
                                               pyfunc(fromty(3.21)))
                    np.testing.assert_allclose(cfunc(-3.21),
                                               pyfunc(fromty(-3.21)) + 0j)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_native_cast(self):
        """float() on a device value stores with the argument's width."""
        float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,),
                                          device=True)
        self.assertIn("st.f32", float32_ptx)

        float16_ptx, _ = cuda.compile_ptx(native_cast, (float16,),
                                          device=True)
        self.assertIn("st.u16", float16_ptx)
if __name__ == '__main__':
    # Allow running this test module standalone.
    unittest.main()

Some files were not shown because too many files have changed in this diff Show More