Videre
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
from numba.cuda.testing import ensure_supported_ccs_initialized
|
||||
from numba.testing import load_testsuite
|
||||
import os
|
||||
|
||||
|
||||
def load_tests(loader, tests, pattern):
    """
    ``load_tests`` hook returning the suite for this directory.

    ``ensure_supported_ccs_initialized()`` is called first, then the suite is
    built by ``load_testsuite`` from the directory containing this file.
    ``tests`` and ``pattern`` are accepted but not used.
    """
    ensure_supported_ccs_initialized()
    this_dir = os.path.dirname(__file__)
    return load_testsuite(loader, this_dir)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,234 @@
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import CUDATestCase
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
|
||||
class UseCase:
    """
    Provide a way to call a kernel as if it were a function.

    This allows the CUDA cache tests to closely match the CPU cache tests, and
    also to support calling cache use cases as njitted functions. The class
    wraps a function that takes an array for the return value and arguments,
    and provides an interface that accepts arguments, launches the kernel
    appropriately, and returns the stored return value.

    The return type is inferred from the type of the first argument, unless it
    is explicitly overridden by the ``retty`` kwarg.

    Subclasses must provide ``_call(ret, *args)``, which invokes the wrapped
    function with the return array followed by the argument arrays.
    """

    def __init__(self, func, retty=None):
        """
        :param func: the wrapped function, handed to ``_call`` by subclasses.
        :param retty: optional dtype of the return value; ``None`` means
                      "use the dtype and shape of the first argument".
        """
        self._func = func
        self._retty = retty

    def __call__(self, *args):
        """
        Call the wrapped function with ``args`` and return what it stored.

        Every argument is converted with ``np.asarray`` before the call. The
        result is read back from the return array with ``[()]``.
        """
        array_args = [np.asarray(arg) for arg in args]

        # Compare against None rather than relying on truthiness: an
        # explicitly supplied return type must be honoured even when the
        # object is falsy (NumPy dtypes define len() as their field count,
        # which is 0 for dtypes without fields).
        if self._retty is not None:
            array_return = np.ndarray((), dtype=self._retty)
        else:
            array_return = np.zeros_like(array_args[0])

        self._call(array_return, *array_args)
        return array_return[()]

    @property
    def func(self):
        """The wrapped function, exactly as passed to the constructor."""
        return self._func
|
||||
|
||||
|
||||
class CUDAUseCase(UseCase):
    """Use case whose wrapped function is launched with a ``[1, 1]`` config."""

    def _call(self, ret, *args):
        launcher = self._func[1, 1]
        launcher(ret, *args)
|
||||
|
||||
|
||||
@cuda.jit(cache=True)
def add_usecase_kernel(r, x, y):
    """Store ``x[()] + y[()] + Z`` into ``r[()]`` (compiled with cache=True)."""
    lhs = x[()]
    rhs = y[()]
    r[()] = lhs + rhs + Z
|
||||
|
||||
|
||||
@cuda.jit(cache=False)
def add_nocache_usecase_kernel(r, x, y):
    """Store ``x[()] + y[()] + Z`` into ``r[()]`` (compiled with cache=False)."""
    lhs = x[()]
    rhs = y[()]
    r[()] = lhs + rhs + Z
|
||||
|
||||
|
||||
# Function-style wrappers around the cached and uncached add kernels.
add_usecase = CUDAUseCase(add_usecase_kernel)
add_nocache_usecase = CUDAUseCase(add_nocache_usecase_kernel)

# Module global read by the kernels in this file; it is assigned after the
# kernels that reference it are defined.
Z = 1
|
||||
|
||||
|
||||
# Inner / outer cached / uncached cases
|
||||
|
||||
@cuda.jit(cache=True)
def inner(x, y):
    """Return ``x + y + Z``."""
    total = x + y + Z
    return total
|
||||
|
||||
|
||||
@cuda.jit(cache=True)
def outer_kernel(r, x, y):
    """Store ``inner(-y[()], x[()])`` into ``r[()]`` (compiled with cache=True)."""
    negated_y = -y[()]
    r[()] = inner(negated_y, x[()])
|
||||
|
||||
|
||||
@cuda.jit(cache=False)
def outer_uncached_kernel(r, x, y):
    """Store ``inner(-y[()], x[()])`` into ``r[()]`` (compiled with cache=False)."""
    negated_y = -y[()]
    r[()] = inner(negated_y, x[()])
|
||||
|
||||
|
||||
# Function-style wrappers around the cached and uncached outer kernels.
outer = CUDAUseCase(outer_kernel)
outer_uncached = CUDAUseCase(outer_uncached_kernel)
|
||||
|
||||
|
||||
# Exercise returning a record instance. This used to hardcode the dtype
|
||||
# pointer's value in the bitcode.
|
||||
|
||||
# Same two fields in both dtypes; the second one is built with align=True.
packed_record_type = np.dtype([('a', np.int8), ('b', np.float64)])
aligned_record_type = np.dtype([('a', np.int8), ('b', np.float64)], align=True)

# Two packed records: (1, 42.5) and (2, 43.5).
packed_arr = np.empty(2, dtype=packed_record_type)
for i in range(packed_arr.size):
    packed_arr[i] = (i + 1, i + 42.5)

# The same records converted to the aligned dtype.
aligned_arr = np.array(packed_arr, dtype=aligned_record_type)
|
||||
|
||||
|
||||
@cuda.jit(cache=True)
def record_return(r, ary, i):
    """Copy the record ``ary[i]`` into ``r[()]``."""
    record = ary[i]
    r[()] = record
|
||||
|
||||
|
||||
# The same kernel wrapped twice, once per record layout; the return dtype is
# given explicitly through ``retty``.
record_return_packed = CUDAUseCase(record_return, retty=packed_record_type)
record_return_aligned = CUDAUseCase(record_return, retty=aligned_record_type)
|
||||
|
||||
|
||||
# Closure test cases
|
||||
|
||||
def make_closure(x):
    """
    Build a cached kernel closing over ``x`` and wrap it in a CUDAUseCase.

    The kernel stores ``x + y[()]`` into ``r[()]``.
    """
    def closure(r, y):
        r[()] = x + y[()]

    return CUDAUseCase(cuda.jit(cache=True)(closure))
|
||||
|
||||
|
||||
# Four closures produced by the same factory, differing only in the value
# they close over.
closure1 = make_closure(3)
closure2 = make_closure(5)
closure3 = make_closure(7)
closure4 = make_closure(9)
|
||||
|
||||
|
||||
# Ambiguous / renamed functions
|
||||
|
||||
@cuda.jit(cache=True)
def ambiguous_function(r, x):
    # First definition: adds 2.
    r[()] = x[()] + 2


# Keep a handle on the first definition before the name is rebound below.
renamed_function1 = CUDAUseCase(ambiguous_function)


@cuda.jit(cache=True)
def ambiguous_function(r, x):
    # Second definition under the same name: adds 6. The rebinding is
    # deliberate (see the "Ambiguous / renamed functions" heading).
    r[()] = x[()] + 6


renamed_function2 = CUDAUseCase(ambiguous_function)
|
||||
|
||||
|
||||
@cuda.jit(cache=True)
def many_locals():
    """
    Kernel that declares 24 ``(1, 1)`` float64 local arrays and zero-fills
    each of them. It takes no arguments and stores no result.

    NOTE(review): presumably exists to exercise caching of a kernel with many
    local variables - confirm against the test that uses it.
    """
    aa = cuda.local.array((1, 1), np.float64)
    ab = cuda.local.array((1, 1), np.float64)
    ac = cuda.local.array((1, 1), np.float64)
    ad = cuda.local.array((1, 1), np.float64)
    ae = cuda.local.array((1, 1), np.float64)
    af = cuda.local.array((1, 1), np.float64)
    ag = cuda.local.array((1, 1), np.float64)
    ah = cuda.local.array((1, 1), np.float64)
    ai = cuda.local.array((1, 1), np.float64)
    aj = cuda.local.array((1, 1), np.float64)
    ak = cuda.local.array((1, 1), np.float64)
    al = cuda.local.array((1, 1), np.float64)
    am = cuda.local.array((1, 1), np.float64)
    an = cuda.local.array((1, 1), np.float64)
    ao = cuda.local.array((1, 1), np.float64)
    ap = cuda.local.array((1, 1), np.float64)
    ar = cuda.local.array((1, 1), np.float64)
    at = cuda.local.array((1, 1), np.float64)
    au = cuda.local.array((1, 1), np.float64)
    av = cuda.local.array((1, 1), np.float64)
    aw = cuda.local.array((1, 1), np.float64)
    ax = cuda.local.array((1, 1), np.float64)
    ay = cuda.local.array((1, 1), np.float64)
    az = cuda.local.array((1, 1), np.float64)

    # Write to every array so that each local is actually used.
    aa[:] = 0
    ab[:] = 0
    ac[:] = 0
    ad[:] = 0
    ae[:] = 0
    af[:] = 0
    ag[:] = 0
    ah[:] = 0
    ai[:] = 0
    aj[:] = 0
    ak[:] = 0
    al[:] = 0
    am[:] = 0
    an[:] = 0
    ao[:] = 0
    ap[:] = 0
    ar[:] = 0
    at[:] = 0
    au[:] = 0
    av[:] = 0
    aw[:] = 0
    ax[:] = 0
    ay[:] = 0
    az[:] = 0
|
||||
|
||||
|
||||
# Simple use case for multiprocessing test
|
||||
|
||||
@cuda.jit(cache=True)
def simple_usecase_kernel(r, x):
    """Copy ``x[()]`` into ``r[()]``."""
    value = x[()]
    r[()] = value
|
||||
|
||||
|
||||
# Function-style wrapper around the simple copy kernel.
simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)
|
||||
|
||||
|
||||
# Usecase with cooperative groups
|
||||
|
||||
@cuda.jit(cache=True)
def cg_usecase_kernel(r, x):
    """Obtain the grid group and synchronise on it; ``r`` and ``x`` are unused."""
    this_grid = cuda.cg.this_grid()
    this_grid.sync()
|
||||
|
||||
|
||||
# Function-style wrapper around the cooperative groups kernel.
cg_usecase = CUDAUseCase(cg_usecase_kernel)
|
||||
|
||||
|
||||
class _TestModule(CUDATestCase):
    """
    Tests for functionality of this module's functions.
    Note this does not define any "test_*" method, instead check_module()
    should be called by hand.
    """

    def check_module(self, mod):
        """Run the use cases exposed by ``mod`` and check their results."""
        check = self.assertPreciseEqual
        check(mod.add_usecase(2, 3), 6)
        check(mod.outer_uncached(3, 2), 2)
        check(mod.outer(3, 2), 2)

        # Both record layouts must return the second record, (2, 43.5).
        for layout in ('packed', 'aligned'):
            use_case = getattr(mod, 'record_return_' + layout)
            records = getattr(mod, layout + '_arr')
            check(tuple(use_case(records, 1)), (2, 43.5))

        # Only called for its side effects; the result is not checked.
        mod.simple_usecase_caller(2)
|
||||
|
||||
|
||||
def self_test():
    """Run ``_TestModule.check_module`` against this very module."""
    this_module = sys.modules[__name__]
    checker = _TestModule()
    checker.check_module(this_module)
|
||||
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
|
||||
from numba import cuda, njit
|
||||
from numba.cuda.testing import CUDATestCase
|
||||
from numba.cuda.tests.cudapy.cache_usecases import CUDAUseCase, UseCase
|
||||
|
||||
|
||||
class CPUUseCase(UseCase):
    """Use case whose wrapped function is called directly, without a launch config."""

    def _call(self, ret, *args):
        target = self._func
        target(ret, *args)
|
||||
|
||||
|
||||
# Using the same function as a cached CPU and CUDA-jitted function
|
||||
|
||||
def target_shared_assign(r, x):
    """Copy ``x[()]`` into ``r[()]``."""
    value = x[()]
    r[()] = value
|
||||
|
||||
|
||||
# The same Python function compiled with caching for the CUDA target ...
assign_cuda_kernel = cuda.jit(cache=True)(target_shared_assign)
assign_cuda = CUDAUseCase(assign_cuda_kernel)
# ... and for the CPU target.
assign_cpu_jitted = njit(cache=True)(target_shared_assign)
assign_cpu = CPUUseCase(assign_cpu_jitted)
|
||||
|
||||
|
||||
class _TestModule(CUDATestCase):
    """
    Tests for functionality of this module's functions.
    Note this does not define any "test_*" method, instead check_module()
    should be called by hand.
    """

    def check_module(self, mod):
        """Check the CPU and then the CUDA use case with an int and a float."""
        for name in ('assign_cpu', 'assign_cuda'):
            for value in (5, 5.5):
                self.assertPreciseEqual(getattr(mod, name)(value), value)
|
||||
|
||||
|
||||
def self_test():
    """Run ``_TestModule.check_module`` against this very module."""
    this_module = sys.modules[__name__]
    checker = _TestModule()
    checker.check_module(this_module)
|
||||
@@ -0,0 +1,58 @@
|
||||
from numba import types
|
||||
from numba.core import config
|
||||
|
||||
|
||||
class TestStruct:
    """Plain Python object holding two values, ``x`` and ``y``."""

    def __init__(self, x, y):
        self.x, self.y = x, y
|
||||
|
||||
|
||||
class TestStructModelType(types.Type):
    """Numba type for ``TestStruct``; always named "TestStructModelType"."""

    def __init__(self):
        type_name = "TestStructModelType"
        super().__init__(name=type_name)
|
||||
|
||||
|
||||
# Single shared instance of the type, used by the code below.
test_struct_model_type = TestStructModelType()
|
||||
|
||||
|
||||
if not config.ENABLE_CUDASIM:
    # Everything below is skipped when the CUDA simulator is enabled.
    from numba import int32
    from numba.core.extending import (
        models,
        register_model,
        make_attribute_wrapper,
        typeof_impl,
        type_callable
    )
    from numba.cuda.cudaimpl import lower
    from numba.core import cgutils

    @typeof_impl.register(TestStruct)
    def typeof_teststruct(val, c):
        # Every TestStruct value maps to the one shared type instance.
        return test_struct_model_type

    @register_model(TestStructModelType)
    class TestStructModel(models.StructModel):
        # Data model: a struct with two int32 members, "x" and "y".
        def __init__(self, dmm, fe_type):
            members = [("x", int32), ("y", int32)]
            super().__init__(dmm, fe_type, members)

    # Expose the struct members as attributes of the same name.
    make_attribute_wrapper(TestStructModelType, 'x', 'x')
    make_attribute_wrapper(TestStructModelType, 'y', 'y')

    @type_callable(TestStruct)
    def type_test_struct(context):
        # Typing of the TestStruct(x, y) call: only integer arguments yield
        # a type; for anything else typer() falls through and returns None.
        def typer(x, y):
            if isinstance(x, types.Integer) and isinstance(y, types.Integer):
                return test_struct_model_type
        return typer

    @lower(TestStruct, types.Integer, types.Integer)
    def lower_test_type_ctor(context, builder, sig, args):
        # Lowering of the constructor: fill a struct proxy with the two
        # arguments and return its value.
        obj = cgutils.create_struct_proxy(
            test_struct_model_type
        )(context, builder)
        obj.x = args[0]
        obj.y = args[1]
        return obj._getvalue()
|
||||
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
Usecases of recursive functions in the CUDA target, many derived from
|
||||
numba/tests/recursion_usecases.py.
|
||||
|
||||
Some functions are compiled at import time, hence a separate module.
|
||||
"""
|
||||
|
||||
from numba import cuda
|
||||
|
||||
|
||||
@cuda.jit("i8(i8)", device=True)
def fib1(n):
    """
    Self-recursive Fibonacci device function with an explicit ``i8(i8)``
    signature.
    """
    if n < 2:
        return n
    # Note the second call does not use a named argument, unlike the CPU target
    # usecase
    return fib1(n - 1) + fib1(n - 2)
|
||||
|
||||
|
||||
def make_fib2():
    """
    Return a self-recursive Fibonacci device function defined inside this
    factory, so that the recursive name ``fib2`` is a closure variable
    rather than a module global.
    """
    @cuda.jit("i8(i8)", device=True)
    def fib2(n):
        if n < 2:
            return n
        return fib2(n - 1) + fib2(n - 2)

    return fib2


# Module-level handle on the function built by the factory.
fib2 = make_fib2()
|
||||
|
||||
|
||||
@cuda.jit
def type_change_self(x, y):
    """
    Self-recursive use case without an explicit signature: returns
    ``x + type_change_self(x - y, y)`` while ``x > 1 and y > 0``, otherwise
    ``y``.

    NOTE(review): the name suggests the argument types are meant to differ
    between recursion levels - confirm against the tests that call it.
    """
    if x > 1 and y > 0:
        return x + type_change_self(x - y, y)
    else:
        return y
|
||||
|
||||
|
||||
# Implicit signature
|
||||
@cuda.jit(device=True)
def fib3(n):
    """
    Self-recursive Fibonacci device function declared without a signature.
    """
    if n < 2:
        return n

    return fib3(n - 1) + fib3(n - 2)
|
||||
|
||||
|
||||
# Run-away self recursion
|
||||
@cuda.jit(device=True)
def runaway_self(x):
    """
    Device function whose only statement calls itself with the same
    argument; it has no return path that does not recurse.
    """
    return runaway_self(x)
|
||||
|
||||
|
||||
@cuda.jit(device=True)
def raise_self(x):
    """
    Self-recursive device function: raises ``ValueError("raise_self")`` when
    ``x == 1``, recurses with ``x - 1`` when ``x > 0``, and returns 1
    otherwise.
    """
    if x == 1:
        raise ValueError("raise_self")
    elif x > 0:
        return raise_self(x - 1)
    else:
        return 1
|
||||
|
||||
|
||||
@cuda.jit(debug=True, opt=False)
def raise_self_kernel(x):
    """
    Kernel entry point for ``raise_self``; compiled with ``debug=True`` and
    ``opt=False``. The value returned by ``raise_self`` is discarded.
    """
    raise_self(x)
|
||||
|
||||
|
||||
def make_optional_return_case(jit=lambda x: x):
    """
    Build the "optional return" recursion use case and return ``bar``.

    ``jit`` is applied as a decorator to both inner functions; the default
    is the identity, which leaves them as plain Python functions.
    """
    @jit
    def foo(x):
        # Returns a number for x > 5 and None otherwise.
        if x > 5:
            return x - 1
        else:
            return

    @jit
    def bar(x):
        out = foo(x)
        if out is None:
            return out
        elif out < 8:
            return out
        else:
            # Self-recursion on the value that foo() produced.
            return x * bar(out)

    return bar
|
||||
|
||||
|
||||
def make_growing_tuple_case(jit=lambda x: x):
    """
    Build the "growing tuple" recursion use case and return ``make_list``.

    ``jit`` is applied as a decorator to the inner function; the default is
    the identity, which leaves it as a plain Python function.
    """
    # From issue #4387
    @jit
    def make_list(n):
        if n <= 0:
            return None

        # Each level nests the result of the level below in a new 2-tuple.
        return (n, make_list(n - 1))
    return make_list
|
||||
@@ -0,0 +1,42 @@
|
||||
import numpy as np
|
||||
from numba import from_dtype, cuda
|
||||
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
|
||||
import unittest
|
||||
|
||||
|
||||
class TestAlignment(CUDATestCase):
    """Tests for kernels operating on record arrays with and without align=True."""

    def test_record_alignment(self):
        """A kernel can copy field ``b`` into field ``a`` of aligned records."""
        rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')], align=True)
        rec = from_dtype(rec_dtype)

        @cuda.jit((rec[:],))
        def foo(a):
            i = cuda.grid(1)
            a[i].a = a[i].b

        a_recarray = np.recarray(3, dtype=rec_dtype)
        for i in range(a_recarray.size):
            a_rec = a_recarray[i]
            a_rec.a = 0
            a_rec.b = (i + 1) * 123

        foo[1, 3](a_recarray)

        # assert_array_equal reports the mismatching elements on failure,
        # unlike assertTrue(np.all(...)).
        np.testing.assert_array_equal(a_recarray.a, a_recarray.b)

    @skip_on_cudasim('Simulator does not check alignment')
    def test_record_alignment_error(self):
        """Compiling the same kernel for the packed dtype must fail."""
        rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')])
        rec = from_dtype(rec_dtype)

        with self.assertRaises(Exception) as raises:
            @cuda.jit((rec[:],))
            def foo(a):
                i = cuda.grid(1)
                a[i].a = a[i].b

        # assertIn shows the actual exception text when the check fails.
        self.assertIn('type float64 is not aligned', str(raises.exception))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,260 @@
|
||||
import numpy as np
|
||||
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba.cuda.testing import skip_on_cudasim, skip_unless_cudasim
|
||||
from numba import config, cuda
|
||||
|
||||
|
||||
# The *_array_like functions exercised by the tests below.
# cuda.mapped_array_like is only looked up and included when the simulator
# is not enabled.
ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.pinned_array_like)
if not config.ENABLE_CUDASIM:
    ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.mapped_array_like,
                            cuda.pinned_array_like)
|
||||
|
||||
|
||||
class TestCudaArray(CUDATestCase):
    """Tests for device array creation, transfer and the *_array_like functions."""

    def test_gpu_array_zero_length(self):
        x = np.arange(0)
        dx = cuda.to_device(x)
        hx = dx.copy_to_host()
        self.assertEqual(x.shape, dx.shape)
        self.assertEqual(x.size, dx.size)
        self.assertEqual(x.shape, hx.shape)
        self.assertEqual(x.size, hx.size)

    def test_null_shape(self):
        null_shape = ()
        shape1 = cuda.device_array(()).shape
        shape2 = cuda.device_array_like(np.ndarray(())).shape
        self.assertEqual(shape1, null_shape)
        self.assertEqual(shape2, null_shape)

    def test_gpu_array_strided(self):

        @cuda.jit('void(double[:])')
        def kernel(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = i

        x = np.arange(10, dtype=np.double)
        # Reinterpret the buffer as bytes so that z can start 4 bytes into it.
        y = np.ndarray(shape=10 * 8, buffer=x, dtype=np.byte)
        z = np.ndarray(9, buffer=y[4:-4], dtype=np.double)
        kernel[10, 10](z)
        self.assertTrue(np.allclose(z, list(range(9))))

    def test_gpu_array_interleaved(self):

        @cuda.jit('void(double[:], double[:])')
        def copykernel(x, y):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = i
                y[i] = i

        x = np.arange(10, dtype=np.double)
        y = x[:-1:2]
        # z = x[1::2]
        # n = y.size
        with self.assertRaises(ValueError,
                               msg="Should raise exception complaining the "
                                   "contiguous-ness of the array."):
            cuda.devicearray.auto_device(y)
            # Should we handle this use case?
            # assert z.size == y.size
            # copykernel[1, n](y, x)
            # print(y, z)
            # assert np.all(y == z)
            # assert np.all(y == list(range(n)))

    def test_auto_device_const(self):
        d, _ = cuda.devicearray.auto_device(2)
        self.assertTrue(np.all(d.copy_to_host() == np.array(2)))

    def _test_array_like_same(self, like_func, array):
        """
        Tests of *_array_like where shape, strides, dtype, and flags should
        all be equal.
        """
        array_like = like_func(array)
        self.assertEqual(array.shape, array_like.shape)
        self.assertEqual(array.strides, array_like.strides)
        self.assertEqual(array.dtype, array_like.dtype)
        self.assertEqual(array.flags['C_CONTIGUOUS'],
                         array_like.flags['C_CONTIGUOUS'])
        self.assertEqual(array.flags['F_CONTIGUOUS'],
                         array_like.flags['F_CONTIGUOUS'])

    def _check_like_same_for_all(self, d_a):
        """Run ``_test_array_like_same`` once per like-function, as subtests."""
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_same(like_func, d_a)

    def test_array_like_1d(self):
        self._check_like_same_for_all(cuda.device_array(10, order='C'))

    def test_array_like_2d(self):
        self._check_like_same_for_all(cuda.device_array((10, 12), order='C'))

    def test_array_like_2d_transpose(self):
        # NOTE(review): despite the name, no transpose is applied here, so
        # this currently repeats test_array_like_2d - confirm the intent.
        self._check_like_same_for_all(cuda.device_array((10, 12), order='C'))

    def test_array_like_3d(self):
        self._check_like_same_for_all(
            cuda.device_array((10, 12, 14), order='C'))

    def test_array_like_1d_f(self):
        self._check_like_same_for_all(cuda.device_array(10, order='F'))

    def test_array_like_2d_f(self):
        self._check_like_same_for_all(cuda.device_array((10, 12), order='F'))

    def test_array_like_2d_f_transpose(self):
        # NOTE(review): despite the name, no transpose is applied here, so
        # this currently repeats test_array_like_2d_f - confirm the intent.
        self._check_like_same_for_all(cuda.device_array((10, 12), order='F'))

    def test_array_like_3d_f(self):
        self._check_like_same_for_all(
            cuda.device_array((10, 12, 14), order='F'))

    def _test_array_like_view(self, like_func, view, d_view):
        """
        Tests of device_array_like where the original array is a view - the
        strides should not be equal because a contiguous array is expected.
        """
        nb_like = like_func(d_view)
        self.assertEqual(d_view.shape, nb_like.shape)
        self.assertEqual(d_view.dtype, nb_like.dtype)

        # Use NumPy as a reference for the expected strides
        np_like = np.zeros_like(view)
        self.assertEqual(nb_like.strides, np_like.strides)
        self.assertEqual(nb_like.flags['C_CONTIGUOUS'],
                         np_like.flags['C_CONTIGUOUS'])
        self.assertEqual(nb_like.flags['F_CONTIGUOUS'],
                         np_like.flags['F_CONTIGUOUS'])

    def _check_like_view_for_all(self, view, d_view):
        """Run ``_test_array_like_view`` once per like-function, as subtests."""
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                self._test_array_like_view(like_func, view, d_view)

    def test_array_like_1d_view(self):
        shape = 10
        view = np.zeros(shape)[::2]
        d_view = cuda.device_array(shape)[::2]
        self._check_like_view_for_all(view, d_view)

    def test_array_like_1d_view_f(self):
        shape = 10
        view = np.zeros(shape, order='F')[::2]
        d_view = cuda.device_array(shape, order='F')[::2]
        self._check_like_view_for_all(view, d_view)

    def test_array_like_2d_view(self):
        shape = (10, 12)
        view = np.zeros(shape)[::2, ::2]
        d_view = cuda.device_array(shape)[::2, ::2]
        self._check_like_view_for_all(view, d_view)

    def test_array_like_2d_view_f(self):
        shape = (10, 12)
        view = np.zeros(shape, order='F')[::2, ::2]
        d_view = cuda.device_array(shape, order='F')[::2, ::2]
        self._check_like_view_for_all(view, d_view)

    @skip_on_cudasim('Numba and NumPy stride semantics differ for transpose')
    def test_array_like_2d_view_transpose_device(self):
        shape = (10, 12)
        d_view = cuda.device_array(shape)[::2, ::2].T
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                # This is a special case (see issue #4974) because creating the
                # transpose creates a new contiguous allocation with different
                # strides.  In this case, rather than comparing against NumPy,
                # we can only compare against expected values.
                like = like_func(d_view)
                self.assertEqual(d_view.shape, like.shape)
                self.assertEqual(d_view.dtype, like.dtype)
                self.assertEqual((40, 8), like.strides)
                self.assertTrue(like.flags['C_CONTIGUOUS'])
                self.assertFalse(like.flags['F_CONTIGUOUS'])

    @skip_unless_cudasim('Numba and NumPy stride semantics differ for '
                         'transpose')
    def test_array_like_2d_view_transpose_simulator(self):
        shape = (10, 12)
        view = np.zeros(shape)[::2, ::2].T
        d_view = cuda.device_array(shape)[::2, ::2].T
        for like_func in ARRAY_LIKE_FUNCTIONS:
            with self.subTest(like_func=like_func):
                # On the simulator, the transpose has different strides to on a
                # CUDA device (See issue #4974). Here we can compare strides
                # against NumPy as a reference.
                np_like = np.zeros_like(view)
                nb_like = like_func(d_view)
                self.assertEqual(d_view.shape, nb_like.shape)
                self.assertEqual(d_view.dtype, nb_like.dtype)
                self.assertEqual(np_like.strides, nb_like.strides)
                self.assertEqual(np_like.flags['C_CONTIGUOUS'],
                                 nb_like.flags['C_CONTIGUOUS'])
                self.assertEqual(np_like.flags['F_CONTIGUOUS'],
                                 nb_like.flags['F_CONTIGUOUS'])

    def test_array_like_2d_view_f_transpose(self):
        shape = (10, 12)
        view = np.zeros(shape, order='F')[::2, ::2].T
        d_view = cuda.device_array(shape, order='F')[::2, ::2].T
        self._check_like_view_for_all(view, d_view)

    @skip_on_cudasim('Kernel overloads not created in the simulator')
    def test_issue_4628(self):
        # CUDA Device arrays were reported as always being typed with 'A' order
        # so launching the kernel with a host array and then a device array
        # resulted in two overloads being compiled - one for 'C' order from
        # the host array, and one for 'A' order from the device array. With the
        # resolution of this issue, the order of the device array is also 'C',
        # so after the kernel launches there should only be one overload of
        # the function.
        @cuda.jit
        def func(A, out):
            i = cuda.grid(1)
            out[i] = A[i] * 2

        n = 128
        a = np.ones((n,))
        d_a = cuda.to_device(a)
        result = np.zeros((n,))

        func[1, 128](a, result)
        func[1, 128](d_a, result)

        self.assertEqual(1, len(func.overloads))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,224 @@
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
|
||||
from numba import cuda
|
||||
from numba.core.errors import TypingError
|
||||
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
|
||||
|
||||
|
||||
class TestCudaArrayArg(CUDATestCase):
    """Tests passing arrays, tuples and namedtuples as kernel arguments."""

    def test_array_ary(self):

        @cuda.jit('double(double[:],int64)', device=True, inline=True)
        def device_function(a, c):
            return a[c]

        @cuda.jit('void(double[:],double[:])')
        def kernel(x, y):
            i = cuda.grid(1)
            y[i] = device_function(x, i)

        src = np.arange(10, dtype=np.double)
        dst = np.zeros_like(src)
        kernel[10, 1](src, dst)
        self.assertTrue(np.all(src == dst))

    def test_unituple(self):
        @cuda.jit
        def f(r, x):
            r[0] = x[0]
            r[1] = x[1]
            r[2] = x[2]

        x = (1, 2, 3)
        r = np.zeros(len(x), dtype=np.int64)
        f[1, 1](r, x)

        for idx, want in enumerate(x):
            self.assertEqual(r[idx], want)

    def test_tuple(self):
        @cuda.jit
        def f(r1, r2, x):
            r1[0] = x[0]
            r1[1] = x[1]
            r1[2] = x[2]
            r2[0] = x[3]
            r2[1] = x[4]
            r2[2] = x[5]

        x = (1, 2, 3, 4.5, 5.5, 6.5)
        half = len(x) // 2
        r1 = np.zeros(half, dtype=np.int64)
        r2 = np.zeros(half, dtype=np.float64)
        f[1, 1](r1, r2, x)

        # The integer half lands in r1, the float half in r2.
        for idx in range(len(r1)):
            self.assertEqual(r1[idx], x[idx])

        offset = len(r1)
        for idx in range(len(r2)):
            self.assertEqual(r2[idx], x[offset + idx])

    def test_namedunituple(self):
        @cuda.jit
        def f(r, x):
            r[0] = x.x
            r[1] = x.y

        Point = namedtuple('Point', ('x', 'y'))
        pt = Point(1, 2)
        r = np.zeros(len(pt), dtype=np.int64)
        f[1, 1](r, pt)

        self.assertEqual(r[0], pt.x)
        self.assertEqual(r[1], pt.y)

    def test_namedtuple(self):
        @cuda.jit
        def f(r1, r2, x):
            r1[0] = x.x
            r1[1] = x.y
            r2[0] = x.r

        Point = namedtuple('Point', ('x', 'y', 'r'))
        pt = Point(1, 2, 2.236)
        r1 = np.zeros(2, dtype=np.int64)
        r2 = np.zeros(1, dtype=np.float64)
        f[1, 1](r1, r2, pt)

        self.assertEqual(r1[0], pt.x)
        self.assertEqual(r1[1], pt.y)
        self.assertEqual(r2[0], pt.r)

    def test_empty_tuple(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)

        empty = tuple()
        r = np.ones(1, dtype=np.int64)
        f[1, 1](r, empty)

        self.assertEqual(r[0], 0)

    def test_tuple_of_empty_tuples(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)
            r[1] = len(x[0])

        x = ((), (), ())
        r = np.ones(2, dtype=np.int64)
        f[1, 1](r, x)

        for idx, want in enumerate((3, 0)):
            self.assertEqual(r[idx], want)

    def test_tuple_of_tuples(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)
            r[1] = len(x[0])
            r[2] = len(x[1])
            r[3] = len(x[2])
            r[4] = x[1][0]
            r[5] = x[1][1]
            r[6] = x[2][0]
            r[7] = x[2][1]
            r[8] = x[2][2]

        x = ((), (5, 6), (8, 9, 10))
        r = np.ones(9, dtype=np.int64)
        f[1, 1](r, x)

        # Lengths first, then the flattened contents of the inner tuples.
        expected = (3, 0, 2, 3, 5, 6, 8, 9, 10)
        for idx, want in enumerate(expected):
            self.assertEqual(r[idx], want)

    def test_tuple_of_tuples_and_scalars(self):
        @cuda.jit
        def f(r, x):
            r[0] = len(x)
            r[1] = len(x[0])
            r[2] = x[0][0]
            r[3] = x[0][1]
            r[4] = x[0][2]
            r[5] = x[1]

        x = ((6, 5, 4), 7)
        r = np.ones(9, dtype=np.int64)
        f[1, 1](r, x)

        # Only the first six entries are written by the kernel and checked.
        expected = (2, 3, 6, 5, 4, 7)
        for idx, want in enumerate(expected):
            self.assertEqual(r[idx], want)

    def test_tuple_of_arrays(self):
        @cuda.jit
        def f(x):
            i = cuda.grid(1)
            if i < len(x[0]):
                x[0][i] = x[1][i] + x[2][i]

        N = 10
        out = np.zeros(N)
        ones = np.ones_like(out)
        threes = ones * 3
        f[1, N]((out, ones, threes))

        np.testing.assert_equal(out, ones + threes)

    def test_tuple_of_array_scalar_tuple(self):
        @cuda.jit
        def f(r, x):
            r[0] = x[0][0]
            r[1] = x[0][1]
            r[2] = x[1]
            r[3] = x[2][0]
            r[4] = x[2][1]

        z = np.arange(2, dtype=np.int64)
        x = (2 * z, 10, (4, 3))
        r = np.zeros(5, dtype=np.int64)
        f[1, 1](r, x)

        expected = (0, 2, 10, 4, 3)
        for idx, want in enumerate(expected):
            self.assertEqual(r[idx], want)
|
||||
|
||||
|
||||
class TestDatetimeIssues(CUDATestCase):
    # See also numba.tests.test_npdatetime.TestDatetimeIssues.

    @skip_on_cudasim("Typing not used on cudasim")
    def test_10y_issue_9585(self):
        """Launching with a ``datetime64[10Y]`` array raises a TypingError."""
        @cuda.jit
        def f(x):
            return x + 1

        arr = np.array('2010', dtype='datetime64[10Y]')

        with self.assertRaises(TypingError) as e:
            f[1, 1](arr)

        # Note that the CUDA target doesn't report which argument caused the
        # exception, so we can't check for it here as we do with the CPU
        # target.
        message = e.exception.args[0]
        self.assertIn("Unsupported array dtype: datetime64[10Y]", message)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,35 @@
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from numba.cuda.testing import CUDATestCase
|
||||
import unittest
|
||||
|
||||
|
||||
def reinterpret_array_type(byte_arr, start, stop, output):
    """Store the first int32 found in ``byte_arr[start:stop]`` in ``output[0]``."""
    # Tested with just one thread
    window = byte_arr[start:stop]
    output[0] = window.view(np.int32)[0]
|
||||
|
||||
|
||||
class TestCudaArrayMethods(CUDATestCase):
    def test_reinterpret_array_type(self):
        """
        Reinterpret byte array as int32 in the GPU.
        """
        kernel = cuda.jit(reinterpret_array_type)

        raw_bytes = np.arange(256, dtype=np.uint8)
        width = np.dtype(np.int32).itemsize
        # Walk the byte array one int32-sized window at a time.
        for lo in range(0, 256, width):
            hi = lo + width
            expect = raw_bytes[lo:hi].view(np.int32)[0]

            result = np.zeros(1, dtype=np.int32)
            kernel[1, 1](raw_bytes, lo, hi, result)

            self.assertEqual(expect, result[0])
|
||||
|
||||
|
||||
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,120 @@
|
||||
import numpy as np
|
||||
import math
|
||||
from numba import cuda, double, void
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
|
||||
|
||||
# Rate and volatility passed to both the NumPy and the CUDA Black-Scholes
# implementations in the test below.
RISKFREE = 0.02
VOLATILITY = 0.30

# Polynomial coefficients used by cnd() and by the CUDA device function
# cnd_cuda() defined inside the test.
A1 = 0.31938153
A2 = -0.356563782
A3 = 1.781477937
A4 = -1.821255978
A5 = 1.330274429
# Numerically equal to 1 / sqrt(2 * pi).
RSQRT2PI = 0.39894228040143267793994605993438
|
||||
|
||||
|
||||
def cnd(d):
    """NumPy reference for the polynomial expression also used by cnd_cuda.

    Returns ``1 - v`` where ``d > 0`` and ``v`` elsewhere, with ``v`` the
    product of RSQRT2PI, ``exp(-d*d/2)`` and a polynomial in
    ``1 / (1 + 0.2316419 * |d|)``.
    """
    k = 1.0 / (1.0 + 0.2316419 * np.abs(d))
    # Horner-style evaluation using the module-level coefficients A1..A5.
    poly = k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5))))
    tail = RSQRT2PI * np.exp(-0.5 * d * d) * poly
    return np.where(d > 0, 1.0 - tail, tail)
|
||||
|
||||
|
||||
def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears,
                  Riskfree, Volatility):
    """NumPy reference implementation; fills the two result arrays in place.

    ``callResult[:]`` and ``putResult[:]`` are overwritten; nothing is
    returned.
    """
    root_t = np.sqrt(optionYears)
    vol_root_t = Volatility * root_t
    d1 = ((np.log(stockPrice / optionStrike)
           + (Riskfree + 0.5 * Volatility * Volatility) * optionYears)
          / vol_root_t)
    d2 = d1 - vol_root_t
    n_d1 = cnd(d1)
    n_d2 = cnd(d2)

    discount = np.exp(- Riskfree * optionYears)
    callResult[:] = stockPrice * n_d1 - optionStrike * discount * n_d2
    putResult[:] = (optionStrike * discount * (1.0 - n_d2)
                    - stockPrice * (1.0 - n_d1))
|
||||
|
||||
|
||||
def randfloat(rand_var, low, high):
    """Blend ``low`` and ``high`` linearly, weighted by ``rand_var``.

    ``rand_var == 0`` gives ``low`` and ``rand_var == 1`` gives ``high``.
    """
    low_part = (1.0 - rand_var) * low
    high_part = rand_var * high
    return low_part + high_part
|
||||
|
||||
|
||||
class TestBlackScholes(CUDATestCase):
    def test_blackscholes(self):
        """Compare the CUDA Black-Scholes kernel with the NumPy reference."""
        OPT_N = 400
        iterations = 2

        stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
        optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
        optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)

        callResultNumpy = np.zeros(OPT_N)
        putResultNumpy = -np.ones(OPT_N)

        callResultNumba = np.zeros(OPT_N)
        putResultNumba = -np.ones(OPT_N)

        # numpy
        for _ in range(iterations):
            black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                          optionStrike, optionYears, RISKFREE, VOLATILITY)

        @cuda.jit(double(double), device=True, inline=True)
        def cnd_cuda(d):
            K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
            ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) *
                       (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
            if d > 0:
                ret_val = 1.0 - ret_val
            return ret_val

        @cuda.jit(void(double[:], double[:], double[:], double[:], double[:],
                       double, double))
        def black_scholes_cuda(callResult, putResult, S, X, T, R, V):
            i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
            # Threads past the end of the input do nothing.
            if i >= S.shape[0]:
                return
            sqrtT = math.sqrt(T[i])
            d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i])
                  / (V * sqrtT))
            d2 = d1 - V * sqrtT
            cndd1 = cnd_cuda(d1)
            cndd2 = cnd_cuda(d2)

            expRT = math.exp((-1. * R) * T[i])
            callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
            putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1))

        # numba
        blockdim = 512, 1
        griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
        stream = cuda.stream()
        d_callResult = cuda.to_device(callResultNumba, stream)
        d_putResult = cuda.to_device(putResultNumba, stream)
        d_stockPrice = cuda.to_device(stockPrice, stream)
        d_optionStrike = cuda.to_device(optionStrike, stream)
        d_optionYears = cuda.to_device(optionYears, stream)

        for _ in range(iterations):
            black_scholes_cuda[griddim, blockdim, stream](
                d_callResult, d_putResult, d_stockPrice, d_optionStrike,
                d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.copy_to_host(callResultNumba, stream)
        d_putResult.copy_to_host(putResultNumba, stream)
        # The copies above were issued on the stream; wait for them.
        stream.synchronize()

        # NOTE(review): only the call results are compared; the put results
        # are computed and copied back but never asserted on.
        delta = np.abs(callResultNumpy - callResultNumba)
        L1norm = delta.sum() / np.abs(callResultNumpy).sum()

        max_abs_err = delta.max()
        # assertLess reports the offending value on failure.
        self.assertLess(L1norm, 1e-13)
        self.assertLess(max_abs_err, 1e-13)
|
||||
|
||||
|
||||
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,24 @@
|
||||
import numpy as np
|
||||
from numba.cuda.testing import unittest, CUDATestCase
|
||||
from numba import cuda
|
||||
|
||||
|
||||
def boolean_func(A, vertial):
    """Write 123 to ``A[0]`` when ``vertial`` is truthy, otherwise 321."""
    A[0] = 123 if vertial else 321
|
||||
|
||||
|
||||
class TestCudaBoolean(CUDATestCase):
    def test_boolean(self):
        """A bool_ kernel argument selects which value is written."""
        func = cuda.jit('void(float64[:], bool_)')(boolean_func)
        A = np.array([0], dtype='float64')
        func[1, 1](A, True)
        # assertEqual reports both values on failure.
        self.assertEqual(A[0], 123)
        func[1, 1](A, False)
        self.assertEqual(A[0], 321)
|
||||
|
||||
|
||||
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
@@ -0,0 +1,545 @@
|
||||
import multiprocessing
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
from numba import cuda
|
||||
from numba.core.errors import NumbaWarning
|
||||
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
|
||||
skip_unless_cc_60, skip_if_cudadevrt_missing,
|
||||
skip_if_mvc_enabled, test_data_dir)
|
||||
from numba.tests.support import SerialMixin
|
||||
from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
|
||||
skip_bad_access)
|
||||
|
||||
|
||||
@skip_on_cudasim('Simulator does not implement caching')
class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    """Caching tests run against the use cases in ``cache_usecases.py``."""

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        # Tear down in the reverse order of setUp.
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_caching(self):
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(2)  # 1 index, 1 data
        self.assertPreciseEqual(f(2.5, 3), 6.5)
        self.check_pycache(3)  # 1 index, 2 data
        self.check_hits(f.func, 0, 2)

        f = mod.record_return_aligned
        rec = f(mod.aligned_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))

        f = mod.record_return_packed
        rec = f(mod.packed_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))
        self.check_pycache(6)  # 2 index, 4 data
        self.check_hits(f.func, 0, 2)

        # Check the code runs ok from another process
        self.run_in_separate_process()

    def test_no_caching(self):
        mod = self.import_module()

        f = mod.add_nocache_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_pycache(0)

    def test_many_locals(self):
        # Declaring many local arrays creates a very large LLVM IR, which
        # cannot be pickled due to the level of recursion it requires to
        # pickle. This test ensures that kernels with many locals (and
        # therefore large IR) can be cached. See Issue #8373:
        # https://github.com/numba/numba/issues/8373
        self.check_pycache(0)
        mod = self.import_module()
        f = mod.many_locals
        f[1, 1]()
        self.check_pycache(2)  # 1 index, 1 data

    def test_closure(self):
        mod = self.import_module()

        with warnings.catch_warnings():
            # Any NumbaWarning raised while running the closures is an error.
            warnings.simplefilter('error', NumbaWarning)

            f = mod.closure1
            self.assertPreciseEqual(f(3), 6)  # 3 + 3 = 6
            f = mod.closure2
            self.assertPreciseEqual(f(3), 8)  # 3 + 5 = 8
            f = mod.closure3
            self.assertPreciseEqual(f(3), 10)  # 3 + 7 = 10
            f = mod.closure4
            self.assertPreciseEqual(f(3), 12)  # 3 + 9 = 12
            self.check_pycache(5)  # 1 nbi, 4 nbc

    def test_cache_reuse(self):
        mod = self.import_module()
        mod.add_usecase(2, 3)
        mod.add_usecase(2.5, 3.5)
        mod.outer_uncached(2, 3)
        mod.outer(2, 3)
        mod.record_return_packed(mod.packed_arr, 0)
        mod.record_return_aligned(mod.aligned_arr, 1)
        mod.simple_usecase_caller(2)
        mtimes = self.get_cache_mtimes()
        # Two signatures compiled
        self.check_hits(mod.add_usecase.func, 0, 2)

        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)
        f = mod2.add_usecase
        f(2, 3)
        self.check_hits(f.func, 1, 0)
        f(2.5, 3.5)
        self.check_hits(f.func, 2, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)

    def test_cache_invalidate(self):
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        # This should change the functions' results
        # (a distinct name is used for the file handle so that it does not
        # shadow the dispatcher bound to ``f``).
        with open(self.modfile, "a") as modfile:
            modfile.write("\nZ = 10\n")

        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_recompile(self):
        # Explicit call to recompile() should overwrite the cache
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)

        mod = self.import_module()
        f = mod.add_usecase
        mod.Z = 10
        self.assertPreciseEqual(f(2, 3), 6)
        f.func.recompile()
        self.assertPreciseEqual(f(2, 3), 15)

        # Freshly recompiled version is re-used from other imports
        mod = self.import_module()
        f = mod.add_usecase
        self.assertPreciseEqual(f(2, 3), 15)

    def test_same_names(self):
        # Function with the same names should still disambiguate
        mod = self.import_module()
        f = mod.renamed_function1
        self.assertPreciseEqual(f(2), 4)
        f = mod.renamed_function2
        self.assertPreciseEqual(f(2), 8)

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg(self):
        # Functions using cooperative groups should be cacheable. See Issue
        # #8888: https://github.com/numba/numba/issues/8888
        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        mod.cg_usecase(0)
        self.check_pycache(2)  # 1 index, 1 data

        # Check the code runs ok from another process
        self.run_in_separate_process()

    @skip_unless_cc_60
    @skip_if_cudadevrt_missing
    @skip_if_mvc_enabled('CG not supported with MVC')
    def test_cache_cg_clean_run(self):
        # See Issue #9432: https://github.com/numba/numba/issues/9432
        # If a cached function using CG sync was the first thing to compile,
        # the compile would fail.
        self.check_pycache(0)

        # This logic is modelled on run_in_separate_process(), but executes the
        # CG usecase directly in the subprocess.
        code = """if 1:
            import sys

            sys.path.insert(0, %(tempdir)r)
            mod = __import__(%(modname)r)
            mod.cg_usecase(0)
            """ % dict(tempdir=self.tempdir, modname=self.modname)

        popen = subprocess.Popen([sys.executable, "-c", code],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        try:
            out, err = popen.communicate(timeout=60)
        except subprocess.TimeoutExpired:
            # communicate() does not kill the child on timeout; kill and
            # reap it so a hung subprocess is not left running.
            popen.kill()
            popen.communicate()
            raise
        if popen.returncode != 0:
            raise AssertionError(
                "process failed with code %s: \n"
                "stdout follows\n%s\n"
                "stderr follows\n%s\n"
                % (popen.returncode, out.decode(), err.decode()),
            )

    def _test_pycache_fallback(self):
        """
        With a disabled __pycache__, test there is a working fallback
        (e.g. on the user-wide cache dir)
        """
        mod = self.import_module()
        f = mod.add_usecase
        # Remove this function's cache files at the end, to avoid accumulation
        # across test calls.
        self.addCleanup(shutil.rmtree, f.func.stats.cache_path,
                        ignore_errors=True)

        self.assertPreciseEqual(f(2, 3), 6)
        # It's a cache miss since the file was copied to a new temp location
        self.check_hits(f.func, 0, 1)

        # Test re-use
        mod2 = self.import_module()
        f = mod2.add_usecase
        self.assertPreciseEqual(f(2, 3), 6)
        self.check_hits(f.func, 1, 0)

        # The __pycache__ is empty (otherwise the test's preconditions
        # wouldn't be met)
        self.check_pycache(0)

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_creatable_pycache(self):
        # Make it impossible to create the __pycache__ directory
        old_perms = os.stat(self.tempdir).st_mode
        os.chmod(self.tempdir, 0o500)
        self.addCleanup(os.chmod, self.tempdir, old_perms)

        self._test_pycache_fallback()

    @skip_bad_access
    @unittest.skipIf(os.name == "nt",
                     "cannot easily make a directory read-only on Windows")
    def test_non_writable_pycache(self):
        # Make it impossible to write to the __pycache__ directory
        pycache = os.path.join(self.tempdir, '__pycache__')
        os.mkdir(pycache)
        old_perms = os.stat(pycache).st_mode
        os.chmod(pycache, 0o500)
        self.addCleanup(os.chmod, pycache, old_perms)

        self._test_pycache_fallback()

    def test_cannot_cache_linking_libraries(self):
        link = str(test_data_dir / 'jitlink.ptx')
        msg = 'Cannot pickle CUDACodeLibrary with linking files'
        with self.assertRaisesRegex(RuntimeError, msg):
            @cuda.jit('void()', cache=True, link=[link])
            def f():
                pass
|
||||
|
||||
|
||||
@skip_on_cudasim('Simulator does not implement caching')
class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
    """Caching tests for one function jitted for both CPU and CUDA."""

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_with_cpu_usecases.py")
    modname = "cuda_and_cpu_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_cpu_and_cuda_targets(self):
        # The same function jitted for CPU and CUDA targets should maintain
        # separate caches for each target.
        self.check_pycache(0)
        usecases = self.import_module()
        self.check_pycache(0)

        cpu_fn = usecases.assign_cpu
        cuda_fn = usecases.assign_cuda

        # Expected file counts after each call: one index file plus one
        # more data file per compiled (target, signature) pair.
        expected_files = iter((2, 3, 4, 5))
        for n_signatures, value in enumerate((5, 5.5), start=1):
            for fn in (cpu_fn, cuda_fn):
                self.assertPreciseEqual(fn(value), value)
                self.check_pycache(next(expected_files))

            self.check_hits(cpu_fn.func, 0, n_signatures)
            self.check_hits(cuda_fn.func, 0, n_signatures)

    def test_cpu_and_cuda_reuse(self):
        # Existing cache files for the CPU and CUDA targets are reused.
        first = self.import_module()
        for fn in (first.assign_cpu, first.assign_cuda):
            fn(5)
            fn(5.5)

        mtimes = self.get_cache_mtimes()

        # Two signatures compiled
        self.check_hits(first.assign_cpu.func, 0, 2)
        self.check_hits(first.assign_cuda.func, 0, 2)

        second = self.import_module()
        self.assertIsNot(first, second)

        for fn in (second.assign_cpu, second.assign_cuda):
            for n_hits, arg in enumerate((2, 2.5), start=1):
                fn(arg)
                self.check_hits(fn.func, n_hits, 0)

        # The files haven't changed
        self.assertEqual(self.get_cache_mtimes(), mtimes)

        self.run_in_separate_process()
        self.assertEqual(self.get_cache_mtimes(), mtimes)
|
||||
|
||||
|
||||
def get_different_cc_gpus():
    # Find two GPUs with different Compute Capabilities and return them as a
    # tuple. If two GPUs with distinct Compute Capabilities cannot be found,
    # then None is returned.
    def compute_capability_of(gpu):
        # Read the CC while the given GPU is the active one.
        with gpu:
            return cuda.current_context().device.compute_capability

    reference_gpu = cuda.gpus[0]
    reference_cc = compute_capability_of(reference_gpu)

    for candidate in cuda.gpus[1:]:
        if compute_capability_of(candidate) != reference_cc:
            return (reference_gpu, candidate)

    return None
|
||||
|
||||
|
||||
@skip_on_cudasim('Simulator does not implement caching')
class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
    """Caching test that needs two GPUs with different compute capabilities."""

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_multi_cc_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def _assert_record_usecases(self, usecases, arrays):
        # Run the aligned and packed record use cases from ``usecases`` on
        # the arrays held by ``arrays`` and return the packed dispatcher.
        aligned = usecases.record_return_aligned
        rec = aligned(arrays.aligned_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))

        packed = usecases.record_return_packed
        rec = packed(arrays.packed_arr, 1)
        self.assertPreciseEqual(tuple(rec), (2, 43.5))
        return packed

    def test_cache(self):
        gpus = get_different_cc_gpus()
        if not gpus:
            self.skipTest('Need two different CCs for multi-CC cache test')

        self.check_pycache(0)
        mod = self.import_module()
        self.check_pycache(0)

        # Step 1. Populate the cache with the first GPU
        with gpus[0]:
            add = mod.add_usecase
            self.assertPreciseEqual(add(2, 3), 6)
            self.check_pycache(2)  # 1 index, 1 data
            self.assertPreciseEqual(add(2.5, 3), 6.5)
            self.check_pycache(3)  # 1 index, 2 data
            self.check_hits(add.func, 0, 2)

            packed = self._assert_record_usecases(mod, mod)
            self.check_pycache(6)  # 2 index, 4 data
            self.check_hits(packed.func, 0, 2)

        # Step 2. Run with the second GPU - under present behaviour this
        # doesn't further populate the cache.
        with gpus[1]:
            add = mod.add_usecase
            self.assertPreciseEqual(add(2, 3), 6)
            self.check_pycache(6)  # cache unchanged
            self.assertPreciseEqual(add(2.5, 3), 6.5)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(add.func, 0, 2)

            packed = self._assert_record_usecases(mod, mod)
            self.check_pycache(6)  # cache unchanged
            self.check_hits(packed.func, 0, 2)

        # Step 3. Run in a separate module with the second GPU - this populates
        # the cache for the second CC.
        mod2 = self.import_module()
        self.assertIsNot(mod, mod2)

        with gpus[1]:
            add = mod2.add_usecase
            self.assertPreciseEqual(add(2, 3), 6)
            self.check_pycache(7)  # 2 index, 5 data
            self.assertPreciseEqual(add(2.5, 3), 6.5)
            self.check_pycache(8)  # 2 index, 6 data
            self.check_hits(add.func, 0, 2)

            # The input arrays are taken from the first module.
            packed = self._assert_record_usecases(mod2, mod)
            self.check_pycache(10)  # 2 index, 8 data
            self.check_hits(packed.func, 0, 2)

        # The following steps check that we can use the NVVM IR loaded from the
        # cache to generate PTX for a different compute capability to the
        # cached cubin's CC. To check this, we create another module that loads
        # the cached version containing a cubin for GPU 1. There will be no
        # cubin for GPU 0, so when we try to use it the PTX must be generated.

        mod3 = self.import_module()
        self.assertIsNot(mod, mod3)

        # Step 4. Run with GPU 1 and get a cache hit, loading the cache created
        # during Step 3.
        with gpus[1]:
            add = mod3.add_usecase
            self.assertPreciseEqual(add(2, 3), 6)
            self.assertPreciseEqual(add(2.5, 3), 6.5)

            self._assert_record_usecases(mod3, mod)

        # Step 5. Run with GPU 0 using the module from Step 4, to force PTX
        # generation from cached NVVM IR.
        with gpus[0]:
            add = mod3.add_usecase
            self.assertPreciseEqual(add(2, 3), 6)
            self.assertPreciseEqual(add(2.5, 3), 6.5)

            self._assert_record_usecases(mod3, mod)
|
||||
|
||||
|
||||
def child_initializer():
    # Disable occupancy and implicit copy warnings in processes in a
    # multiprocessing pool.
    from numba.core import config
    for flag in ('CUDA_LOW_OCCUPANCY_WARNINGS', 'CUDA_WARN_ON_IMPLICIT_COPY'):
        setattr(config, flag, 0)
|
||||
|
||||
|
||||
@skip_on_cudasim('Simulator does not implement caching')
class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):
    """Check that the cache can be used from several processes at once."""

    # Nested multiprocessing.Pool raises AssertionError:
    # "daemonic processes are not allowed to have children"
    _numba_parallel_test_ = False

    here = os.path.dirname(__file__)
    usecases_file = os.path.join(here, "cache_usecases.py")
    modname = "cuda_mp_caching_test_fodder"

    def setUp(self):
        DispatcherCacheUsecasesTest.setUp(self)
        CUDATestCase.setUp(self)

    def tearDown(self):
        CUDATestCase.tearDown(self)
        DispatcherCacheUsecasesTest.tearDown(self)

    def test_multiprocessing(self):
        # Check caching works from multiple processes at once (#2028)
        mod = self.import_module()
        # Calling a pure Python caller of the JIT-compiled function is
        # necessary to reproduce the issue.
        f = mod.simple_usecase_caller
        n = 3
        try:
            ctx = multiprocessing.get_context('spawn')
        except AttributeError:
            ctx = multiprocessing

        pool = ctx.Pool(n, child_initializer)

        try:
            res = sum(pool.imap(f, range(n)))
        finally:
            # close() only stops new submissions; join() waits for the
            # workers to exit so they are not left behind after the test.
            pool.close()
            pool.join()
        self.assertEqual(res, n * (n - 1) // 2)
|
||||
|
||||
|
||||
@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
class TestCUDACodeLibrary(CUDATestCase):
    # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
    # explicitly check

    def test_cannot_serialize_unfinalized(self):
        # The CUDA codegen fails to import under the simulator, so we cannot
        # import it at the top level
        from numba.cuda.codegen import CUDACodeLibrary

        # Usually a CodeLibrary requires a real CodeGen, but since we don't
        # interact with it, anything will do
        placeholder_codegen = object()
        library = CUDACodeLibrary(placeholder_codegen, 'library')
        with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
            library._reduce_states()
|
||||
@@ -0,0 +1,257 @@
|
||||
import numpy as np
|
||||
|
||||
from numba.cuda import compile_ptx
|
||||
from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8
|
||||
from numba import cuda
|
||||
from numba.core import types
|
||||
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
|
||||
skip_unless_cc_53)
|
||||
from numba.types import float16, float32
|
||||
import itertools
|
||||
import unittest
|
||||
|
||||
|
||||
def native_cast(x):
    """Convert ``x`` with the builtin ``float``."""
    converted = float(x)
    return converted
|
||||
|
||||
|
||||
def to_int8(x):
    """Convert ``x`` with ``np.int8``."""
    as_int8 = np.int8(x)
    return as_int8
|
||||
|
||||
|
||||
def to_int16(x):
    """Convert ``x`` with ``np.int16``."""
    as_int16 = np.int16(x)
    return as_int16
|
||||
|
||||
|
||||
def to_int32(x):
    """Convert ``x`` with ``np.int32``."""
    as_int32 = np.int32(x)
    return as_int32
|
||||
|
||||
|
||||
def to_int64(x):
    """Convert ``x`` with ``np.int64``."""
    as_int64 = np.int64(x)
    return as_int64
|
||||
|
||||
|
||||
def to_uint8(x):
    """Convert ``x`` with ``np.uint8``."""
    as_uint8 = np.uint8(x)
    return as_uint8
|
||||
|
||||
|
||||
def to_uint16(x):
    """Convert ``x`` with ``np.uint16``."""
    as_uint16 = np.uint16(x)
    return as_uint16
|
||||
|
||||
|
||||
def to_uint32(x):
    """Convert ``x`` by calling the Numba type ``types.uint32``."""
    as_uint32 = types.uint32(x)
    return as_uint32
|
||||
|
||||
|
||||
def to_uint64(x):
    """Convert ``x`` by calling the Numba type ``types.uint64``."""
    as_uint64 = types.uint64(x)
    return as_uint64
|
||||
|
||||
|
||||
def to_float16(x):
    """Convert ``x`` to float16 and multiply it by float16 0.5."""
    # When division and operators on float16 types are supported, this should
    # be changed to match the implementation in to_float32.
    half = np.float16(0.5)
    return np.float16(x) * half
|
||||
|
||||
|
||||
def to_float32(x):
    """Convert ``x`` to float32 and divide it by float32 2."""
    divisor = np.float32(2)
    return np.float32(x) / divisor
|
||||
|
||||
|
||||
def to_float64(x):
    """Convert ``x`` to float64 and divide it by float64 2."""
    divisor = np.float64(2)
    return np.float64(x) / divisor
|
||||
|
||||
|
||||
def to_complex64(x):
    """Convert ``x`` with ``np.complex64``."""
    as_complex64 = np.complex64(x)
    return as_complex64
|
||||
|
||||
|
||||
def to_complex128(x):
    """Convert ``x`` with ``np.complex128``."""
    as_complex128 = np.complex128(x)
    return as_complex128
|
||||
|
||||
|
||||
# Since multiplication of float16 is not supported via the operator * on
|
||||
# float16s yet, and the host does not implement cuda.fp16.*, we need two
|
||||
# versions of the following functions:
|
||||
#
|
||||
# - The device version uses cuda.fp16.hmul
|
||||
# - The host version uses the * operator
|
||||
|
||||
def cuda_int_literal_to_float16(x):
    """Multiply float16 ``x`` by the int literal 2 via ``cuda.fp16.hmul``."""
    as_half = np.float16(x)
    # Note that we need to use `2` and not `np.float16(2)` to ensure that this
    # types as a literal int and not a const float16.
    return cuda.fp16.hmul(as_half, 2)
|
||||
|
||||
|
||||
def reference_int_literal_to_float16(x):
    """Host reference: float16 ``x`` times float16 2 using ``*``."""
    factor = np.float16(2)
    return np.float16(x) * factor
|
||||
|
||||
|
||||
def cuda_float_literal_to_float16(x):
    """Multiply float16 ``x`` by the float 2.5 via ``cuda.fp16.hmul``."""
    as_half = np.float16(x)
    # Note that `2.5` types as a const float64 and not a literal float, but
    # this case is provided in case that changes in future.
    return cuda.fp16.hmul(as_half, 2.5)
|
||||
|
||||
|
||||
def reference_float_literal_to_float16(x):
    """Host reference: float16 ``x`` times float16 2.5 using ``*``."""
    factor = np.float16(2.5)
    return np.float16(x) * factor
|
||||
|
||||
|
||||
class TestCasting(CUDATestCase):
    """Tests of scalar type conversions inside CUDA device functions."""

    def _create_wrapped(self, pyfunc, intype, outtype):
        """Return a host callable that runs ``pyfunc`` on the device.

        The returned function stores its argument in a one-element array of
        dtype ``intype``, launches a single-thread kernel that calls
        ``pyfunc`` as a device function, and returns the single element of
        the ``outtype`` result array.
        """
        wrapped_func = cuda.jit(device=True)(pyfunc)

        @cuda.jit
        def cuda_wrapper_fn(arg, res):
            res[0] = wrapped_func(arg[0])

        def wrapper_fn(arg):
            argarray = np.zeros(1, dtype=intype)
            argarray[0] = arg
            resarray = np.zeros(1, dtype=outtype)
            cuda_wrapper_fn[1, 1](argarray, resarray)
            return resarray[0]

        return wrapper_fn

    @skip_unless_cc_53
    def test_float_to_int(self):
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        totys = (np.int8, np.int16, np.int32, np.int64)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    self.assertEqual(cfunc(12.3), pyfunc(12.3))
                    self.assertEqual(cfunc(12.3), int(12.3))
                    self.assertEqual(cfunc(-12.3), pyfunc(-12.3))
                    self.assertEqual(cfunc(-12.3), int(-12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_int_ptx(self):
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.rni.s{size}.f16", ptx)

    @skip_unless_cc_53
    def test_float_to_uint(self):
        # Use the unsigned helpers here: the signed ones are already covered
        # by test_float_to_int, and pairing them with unsigned output types
        # left the unsigned conversions unexecuted.
        pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
        totys = (np.uint8, np.uint16, np.uint32, np.uint64)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    self.assertEqual(cfunc(12.3), pyfunc(12.3))
                    self.assertEqual(cfunc(12.3), int(12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_uint_ptx(self):
        pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.rni.u{size}.f16", ptx)

    @skip_unless_cc_53
    def test_int_to_float(self):
        pyfuncs = (to_float16, to_float32, to_float64)
        totys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            with self.subTest(toty=toty):
                cfunc = self._create_wrapped(pyfunc, np.int64, toty)
                self.assertEqual(cfunc(321), pyfunc(321))

    @skip_unless_cc_53
    def test_literal_to_float16(self):
        cudafuncs = (cuda_int_literal_to_float16,
                     cuda_float_literal_to_float16)
        hostfuncs = (reference_int_literal_to_float16,
                     reference_float_literal_to_float16)

        for cudafunc, hostfunc in zip(cudafuncs, hostfuncs):
            with self.subTest(func=cudafunc):
                cfunc = self._create_wrapped(cudafunc, np.float16, np.float16)
                self.assertEqual(cfunc(321), hostfunc(321))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_int_to_float16_ptx(self):
        fromtys = (i1, i2, i4, i8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, (ty,), device=True)
            self.assertIn(f"cvt.rn.f16.s{size}", ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_uint_to_float16_ptx(self):
        fromtys = (u1, u2, u4, u8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, (ty,), device=True)
            self.assertIn(f"cvt.rn.f16.u{size}", ptx)

    @skip_unless_cc_53
    def test_float_to_float(self):
        pyfuncs = (to_float16, to_float32, to_float64)
        tys = (np.float16, np.float32, np.float64)

        for (pyfunc, fromty), toty in itertools.product(zip(pyfuncs, tys), tys):
            with self.subTest(fromty=fromty, toty=toty):
                cfunc = self._create_wrapped(pyfunc, fromty, toty)
                # For this test we cannot use the pyfunc for comparison because
                # the CUDA target doesn't yet implement division (or operators)
                # for float16 values, so we test by comparing with the computed
                # expression instead.
                np.testing.assert_allclose(cfunc(12.3),
                                           toty(12.3) / toty(2), rtol=0.0003)
                np.testing.assert_allclose(cfunc(-12.3),
                                           toty(-12.3) / toty(2), rtol=0.0003)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_float_ptx(self):
        pyfuncs = (to_float32, to_float64)
        postfixes = ("f32", "f64")

        for pyfunc, postfix in zip(pyfuncs, postfixes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.{postfix}.f16", ptx)

    @skip_unless_cc_53
    def test_float_to_complex(self):
        pyfuncs = (to_complex64, to_complex128)
        totys = (np.complex64, np.complex128)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    # Here we need to explicitly cast the input to the pyfunc
                    # to match the casting that is automatically applied when
                    # passing the input to the cfunc as part of wrapping it in
                    # an array of type fromtype.
                    np.testing.assert_allclose(cfunc(3.21),
                                               pyfunc(fromty(3.21)))
                    np.testing.assert_allclose(cfunc(-3.21),
                                               pyfunc(fromty(-3.21)) + 0j)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_native_cast(self):
        # Uses the module-level compile_ptx import, like the PTX tests above.
        float32_ptx, _ = compile_ptx(native_cast, (float32,), device=True)
        self.assertIn("st.f32", float32_ptx)

        float16_ptx, _ = compile_ptx(native_cast, (float16,), device=True)
        self.assertIn("st.u16", float16_ptx)
|
||||
|
||||
|
||||
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user