Videre

2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions
--- a/linedance-app/venv/lib/python3.12/site-packages/numba/cuda/random.py
+++ b/linedance-app/venv/lib/python3.12/site-packages/numba/cuda/random.py
@@ -0,0 +1,292 @@
+import math
+
+from numba import (config, cuda, float32, float64, uint32, int64, uint64,
+                   from_dtype, jit)
+
+import numpy as np
+
+# This implementation is based upon the xoroshiro128+ and splitmix64 algorithms
+# described at:
+#
+#     http://xoroshiro.di.unimi.it/
+#
+# and originally implemented by David Blackman and Sebastiano Vigna.
+#
+# The implementations below are based on the C source code:
+#
+#  * http://xoroshiro.di.unimi.it/xoroshiro128plus.c
+#  * http://xoroshiro.di.unimi.it/splitmix64.c
+#
+# Splitmix64 is used to generate the initial state of the xoroshiro128+
+# generator to ensure that small seeds don't result in predictable output.
+
+# **WARNING**: There is a lot of verbose casting in this file to ensure that
+# NumPy casting conventions (which cast uint64 [op] int32 to float64) don't
+# turn integers into floats when using these functions in the CUDA simulator.
+#
+# There are also no function type signatures to ensure that compilation is
+# deferred so that import is quick, and Sphinx autodoc works.  We are also
+# using the CPU @jit decorator everywhere to create functions that work as
+# both CPU and CUDA device functions.
+
+xoroshiro128p_dtype = np.dtype([('s0', np.uint64), ('s1', np.uint64)],
+                               align=True)
+xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)
+
+# When cudasim is enabled, Fake CUDA arrays are passed to some of the
+# @jit-decorated functions. This required fallback to object mode. With
+# Numba 0.59.0 object mode must be explicitly enabled.
+# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
+# In order to avoid the warning / future error, we explicitly specify that
+# object mode with loop lifting is acceptable when using the simulator.
+_forceobj = _looplift = config.ENABLE_CUDASIM
+_nopython = not config.ENABLE_CUDASIM
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def init_xoroshiro128p_state(states, index, seed):
+    '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.
+
+    This ensures that manually set small seeds don't result in a predictable
+    initial sequence from the random number generator.
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: uint64
+    :param index: offset in states to update
+    :type seed: int64
+    :param seed: seed value to use when initializing state
+    '''
+    index = int64(index)
+    seed = uint64(seed)
+
+    z = seed + uint64(0x9E3779B97F4A7C15)
+    z = (z ^ (z >> uint32(30))) * uint64(0xBF58476D1CE4E5B9)
+    z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB)
+    z = z ^ (z >> uint32(31))
+
+    states[index]['s0'] = z
+    states[index]['s1'] = z
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def rotl(x, k):
+    '''Left rotate x by k bits.'''
+    x = uint64(x)
+    k = uint32(k)
+    return (x << k) | (x >> uint32(64 - k))
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def xoroshiro128p_next(states, index):
+    '''Return the next random uint64 and advance the RNG in states[index].
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: int64
+    :param index: offset in states to update
+    :rtype: uint64
+    '''
+    index = int64(index)
+    s0 = states[index]['s0']
+    s1 = states[index]['s1']
+    result = s0 + s1
+
+    s1 ^= s0
+    states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
+    states[index]['s1'] = uint64(rotl(s1, uint32(36)))
+
+    return result
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def xoroshiro128p_jump(states, index):
+    '''Advance the RNG in ``states[index]`` by 2**64 steps.
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: int64
+    :param index: offset in states to update
+    '''
+    index = int64(index)
+
+    jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))
+
+    s0 = uint64(0)
+    s1 = uint64(0)
+
+    for i in range(2):
+        for b in range(64):
+            if jump[i] & (uint64(1) << uint32(b)):
+                s0 ^= states[index]['s0']
+                s1 ^= states[index]['s1']
+            xoroshiro128p_next(states, index)
+
+    states[index]['s0'] = s0
+    states[index]['s1'] = s1
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def uint64_to_unit_float64(x):
+    '''Convert uint64 to float64 value in the range [0.0, 1.0)'''
+    x = uint64(x)
+    return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53)))
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def uint64_to_unit_float32(x):
+    '''Convert uint64 to float32 value in the range [0.0, 1.0)'''
+    x = uint64(x)
+    return float32(uint64_to_unit_float64(x))
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def xoroshiro128p_uniform_float32(states, index):
+    '''Return a float32 in range [0.0, 1.0) and advance ``states[index]``.
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: int64
+    :param index: offset in states to update
+    :rtype: float32
+    '''
+    index = int64(index)
+    return uint64_to_unit_float32(xoroshiro128p_next(states, index))
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def xoroshiro128p_uniform_float64(states, index):
+    '''Return a float64 in range [0.0, 1.0) and advance ``states[index]``.
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: int64
+    :param index: offset in states to update
+    :rtype: float64
+    '''
+    index = int64(index)
+    return uint64_to_unit_float64(xoroshiro128p_next(states, index))
+
+
+TWO_PI_FLOAT32 = np.float32(2 * math.pi)
+TWO_PI_FLOAT64 = np.float64(2 * math.pi)
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def xoroshiro128p_normal_float32(states, index):
+    '''Return a normally distributed float32 and advance ``states[index]``.
+
+    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
+    Box-Muller transform.  This advances the RNG sequence by two steps.
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: int64
+    :param index: offset in states to update
+    :rtype: float32
+    '''
+    index = int64(index)
+
+    u1 = xoroshiro128p_uniform_float32(states, index)
+    u2 = xoroshiro128p_uniform_float32(states, index)
+
+    z0 = math.sqrt(-float32(2.0) * math.log(u1)) * math.cos(TWO_PI_FLOAT32 * u2)
+    # discarding second normal value
+    # z1 = math.sqrt(-float32(2.0) * math.log(u1))
+    #                * math.sin(TWO_PI_FLOAT32 * u2)
+    return z0
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def xoroshiro128p_normal_float64(states, index):
+    '''Return a normally distributed float32 and advance ``states[index]``.
+
+    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
+    Box-Muller transform.  This advances the RNG sequence by two steps.
+
+    :type states: 1D array, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type index: int64
+    :param index: offset in states to update
+    :rtype: float64
+    '''
+    index = int64(index)
+
+    u1 = xoroshiro128p_uniform_float32(states, index)
+    u2 = xoroshiro128p_uniform_float32(states, index)
+
+    z0 = math.sqrt(-float64(2.0) * math.log(u1)) * math.cos(TWO_PI_FLOAT64 * u2)
+    # discarding second normal value
+    # z1 = math.sqrt(-float64(2.0) * math.log(u1))
+    #                * math.sin(TWO_PI_FLOAT64 * u2)
+    return z0
+
+
+@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
+def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
+    n = states.shape[0]
+    seed = uint64(seed)
+    subsequence_start = uint64(subsequence_start)
+
+    if n >= 1:
+        init_xoroshiro128p_state(states, 0, seed)
+
+        # advance to starting subsequence number
+        for _ in range(subsequence_start):
+            xoroshiro128p_jump(states, 0)
+
+        # populate the rest of the array
+        for i in range(1, n):
+            states[i] = states[i - 1]  # take state of previous generator
+            xoroshiro128p_jump(states, i)  # and jump forward 2**64 steps
+
+
+def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
+    '''Initialize RNG states on the GPU for parallel generators.
+
+    This initializes the RNG states so that each state in the array corresponds
+    subsequences in the separated by 2**64 steps from each other in the main
+    sequence.  Therefore, as long no CUDA thread requests more than 2**64
+    random numbers, all of the RNG states produced by this function are
+    guaranteed to be independent.
+
+    The subsequence_start parameter can be used to advance the first RNG state
+    by a multiple of 2**64 steps.
+
+    :type states: 1D DeviceNDArray, dtype=xoroshiro128p_dtype
+    :param states: array of RNG states
+    :type seed: uint64
+    :param seed: starting seed for list of generators
+    '''
+
+    # Initialization on CPU is much faster than the GPU
+    states_cpu = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype)
+    init_xoroshiro128p_states_cpu(states_cpu, seed, subsequence_start)
+
+    states.copy_to_device(states_cpu, stream=stream)
+
+
+def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
+    '''Returns a new device array initialized for n random number generators.
+
+    This initializes the RNG states so that each state in the array corresponds
+    subsequences in the separated by 2**64 steps from each other in the main
+    sequence.  Therefore, as long no CUDA thread requests more than 2**64
+    random numbers, all of the RNG states produced by this function are
+    guaranteed to be independent.
+
+    The subsequence_start parameter can be used to advance the first RNG state
+    by a multiple of 2**64 steps.
+
+    :type n: int
+    :param n: number of RNG states to create
+    :type seed: uint64
+    :param seed: starting seed for list of generators
+    :type subsequence_start: uint64
+    :param subsequence_start:
+    :type stream: CUDA stream
+    :param stream: stream to run initialization kernel on
+    '''
+    states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream)
+    init_xoroshiro128p_states(states, seed, subsequence_start, stream)
+    return states