Videre

2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions
--- a/linedance-app/venv/lib/python3.12/site-packages/numba/cuda/codegen.py
+++ b/linedance-app/venv/lib/python3.12/site-packages/numba/cuda/codegen.py
@@ -0,0 +1,378 @@
+from llvmlite import ir
+
+from numba.core import config, serialize
+from numba.core.codegen import Codegen, CodeLibrary
+from .cudadrv import devices, driver, nvvm, runtime
+from numba.cuda.cudadrv.libs import get_cudalib
+
+import os
+import subprocess
+import tempfile
+
+
+CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
+
+
+def run_nvdisasm(cubin, flags):
+    # nvdisasm only accepts input from a file, so we need to write out to a
+    # temp file and clean up afterwards.
+    fd = None
+    fname = None
+    try:
+        fd, fname = tempfile.mkstemp()
+        with open(fname, 'wb') as f:
+            f.write(cubin)
+
+        try:
+            cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE)
+        except FileNotFoundError as e:
+            msg = ("nvdisasm has not been found. You may need "
+                   "to install the CUDA toolkit and ensure that "
+                   "it is available on your PATH.\n")
+            raise RuntimeError(msg) from e
+        return cp.stdout.decode('utf-8')
+    finally:
+        if fd is not None:
+            os.close(fd)
+        if fname is not None:
+            os.unlink(fname)
+
+
+def disassemble_cubin(cubin):
+    # Request lineinfo in disassembly
+    flags = ['-gi']
+    return run_nvdisasm(cubin, flags)
+
+
+def disassemble_cubin_for_cfg(cubin):
+    # Request control flow graph in disassembly
+    flags = ['-cfg']
+    return run_nvdisasm(cubin, flags)
+
+
+class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
+    """
+    The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
+    compute capabilities. It also loads cubins to multiple devices (via
+    get_cufunc), which may be of different compute capabilities.
+    """
+
+    def __init__(self, codegen, name, entry_name=None, max_registers=None,
+                 nvvm_options=None):
+        """
+        codegen:
+            Codegen object.
+        name:
+            Name of the function in the source.
+        entry_name:
+            Name of the kernel function in the binary, if this is a global
+            kernel and not a device function.
+        max_registers:
+            The maximum register usage to aim for when linking.
+        nvvm_options:
+                Dict of options to pass to NVVM.
+        """
+        super().__init__(codegen, name)
+
+        # The llvmlite module for this library.
+        self._module = None
+        # CodeLibrary objects that will be "linked" into this library. The
+        # modules within them are compiled from NVVM IR to PTX along with the
+        # IR from this module - in that sense they are "linked" by NVVM at PTX
+        # generation time, rather than at link time.
+        self._linking_libraries = set()
+        # Files to link with the generated PTX. These are linked using the
+        # Driver API at link time.
+        self._linking_files = set()
+        # Should we link libcudadevrt?
+        self.needs_cudadevrt = False
+
+        # Cache the LLVM IR string
+        self._llvm_strs = None
+        # Maps CC -> PTX string
+        self._ptx_cache = {}
+        # Maps CC -> LTO-IR
+        self._ltoir_cache = {}
+        # Maps CC -> cubin
+        self._cubin_cache = {}
+        # Maps CC -> linker info output for cubin
+        self._linkerinfo_cache = {}
+        # Maps Device numeric ID -> cufunc
+        self._cufunc_cache = {}
+
+        self._max_registers = max_registers
+        if nvvm_options is None:
+            nvvm_options = {}
+        self._nvvm_options = nvvm_options
+        self._entry_name = entry_name
+
+    @property
+    def llvm_strs(self):
+        if self._llvm_strs is None:
+            self._llvm_strs = [str(mod) for mod in self.modules]
+        return self._llvm_strs
+
+    def get_llvm_str(self):
+        return "\n\n".join(self.llvm_strs)
+
+    def _ensure_cc(self, cc):
+        if cc is not None:
+            return cc
+
+        device = devices.get_context().device
+        return device.compute_capability
+
+    def get_asm_str(self, cc=None):
+        cc = self._ensure_cc(cc)
+
+        ptxes = self._ptx_cache.get(cc, None)
+        if ptxes:
+            return ptxes
+
+        arch = nvvm.get_arch_option(*cc)
+        options = self._nvvm_options.copy()
+        options['arch'] = arch
+
+        irs = self.llvm_strs
+
+        ptx = nvvm.compile_ir(irs, **options)
+
+        # Sometimes the result from NVVM contains trailing whitespace and
+        # nulls, which we strip so that the assembly dump looks a little
+        # tidier.
+        ptx = ptx.decode().strip('\x00').strip()
+
+        if config.DUMP_ASSEMBLY:
+            print(("ASSEMBLY %s" % self._name).center(80, '-'))
+            print(ptx)
+            print('=' * 80)
+
+        self._ptx_cache[cc] = ptx
+
+        return ptx
+
+    def get_ltoir(self, cc=None):
+        cc = self._ensure_cc(cc)
+
+        ltoir = self._ltoir_cache.get(cc, None)
+        if ltoir is not None:
+            return ltoir
+
+        arch = nvvm.get_arch_option(*cc)
+        options = self._nvvm_options.copy()
+        options['arch'] = arch
+        options['gen-lto'] = None
+
+        irs = self.llvm_strs
+        ltoir = nvvm.compile_ir(irs, **options)
+        self._ltoir_cache[cc] = ltoir
+
+        return ltoir
+
+    def get_cubin(self, cc=None):
+        cc = self._ensure_cc(cc)
+
+        cubin = self._cubin_cache.get(cc, None)
+        if cubin:
+            return cubin
+
+        linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)
+
+        if linker.lto:
+            ltoir = self.get_ltoir(cc=cc)
+            linker.add_ltoir(ltoir)
+        else:
+            ptx = self.get_asm_str(cc=cc)
+            linker.add_ptx(ptx.encode())
+
+        for path in self._linking_files:
+            linker.add_file_guess_ext(path)
+        if self.needs_cudadevrt:
+            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
+
+        cubin = linker.complete()
+        self._cubin_cache[cc] = cubin
+        self._linkerinfo_cache[cc] = linker.info_log
+
+        return cubin
+
+    def get_cufunc(self):
+        if self._entry_name is None:
+            msg = "Missing entry_name - are you trying to get the cufunc " \
+                  "for a device function?"
+            raise RuntimeError(msg)
+
+        ctx = devices.get_context()
+        device = ctx.device
+
+        cufunc = self._cufunc_cache.get(device.id, None)
+        if cufunc:
+            return cufunc
+
+        cubin = self.get_cubin(cc=device.compute_capability)
+        module = ctx.create_module_image(cubin)
+
+        # Load
+        cufunc = module.get_function(self._entry_name)
+
+        # Populate caches
+        self._cufunc_cache[device.id] = cufunc
+
+        return cufunc
+
+    def get_linkerinfo(self, cc):
+        try:
+            return self._linkerinfo_cache[cc]
+        except KeyError:
+            raise KeyError(f'No linkerinfo for CC {cc}')
+
+    def get_sass(self, cc=None):
+        return disassemble_cubin(self.get_cubin(cc=cc))
+
+    def get_sass_cfg(self, cc=None):
+        return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))
+
+    def add_ir_module(self, mod):
+        self._raise_if_finalized()
+        if self._module is not None:
+            raise RuntimeError('CUDACodeLibrary only supports one module')
+        self._module = mod
+
+    def add_linking_library(self, library):
+        library._ensure_finalized()
+
+        # We don't want to allow linking more libraries in after finalization
+        # because our linked libraries are modified by the finalization, and we
+        # won't be able to finalize again after adding new ones
+        self._raise_if_finalized()
+
+        self._linking_libraries.add(library)
+
+    def add_linking_file(self, filepath):
+        self._linking_files.add(filepath)
+
+    def get_function(self, name):
+        for fn in self._module.functions:
+            if fn.name == name:
+                return fn
+        raise KeyError(f'Function {name} not found')
+
+    @property
+    def modules(self):
+        return [self._module] + [mod for lib in self._linking_libraries
+                                 for mod in lib.modules]
+
+    @property
+    def linking_libraries(self):
+        # Libraries we link to may link to other libraries, so we recursively
+        # traverse the linking libraries property to build up a list of all
+        # linked libraries.
+        libs = []
+        for lib in self._linking_libraries:
+            libs.extend(lib.linking_libraries)
+            libs.append(lib)
+        return libs
+
+    def finalize(self):
+        # Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
+        # we only adjust the linkage of functions. Global kernels (with
+        # external linkage) have their linkage untouched. Device functions are
+        # set linkonce_odr to prevent them appearing in the PTX.
+
+        self._raise_if_finalized()
+
+        # Note in-place modification of the linkage of functions in linked
+        # libraries. This presently causes no issues as only device functions
+        # are shared across code libraries, so they would always need their
+        # linkage set to linkonce_odr. If in a future scenario some code
+        # libraries require linkonce_odr linkage of functions in linked
+        # modules, and another code library requires another linkage, each code
+        # library will need to take its own private copy of its linked modules.
+        #
+        # See also discussion on PR #890:
+        # https://github.com/numba/numba/pull/890
+        for library in self._linking_libraries:
+            for mod in library.modules:
+                for fn in mod.functions:
+                    if not fn.is_declaration:
+                        fn.linkage = 'linkonce_odr'
+
+        self._finalized = True
+
+    def _reduce_states(self):
+        """
+        Reduce the instance for serialization. We retain the PTX and cubins,
+        but loaded functions are discarded. They are recreated when needed
+        after deserialization.
+        """
+        if self._linking_files:
+            msg = 'Cannot pickle CUDACodeLibrary with linking files'
+            raise RuntimeError(msg)
+        if not self._finalized:
+            raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
+        return dict(
+            codegen=None,
+            name=self.name,
+            entry_name=self._entry_name,
+            llvm_strs=self.llvm_strs,
+            ptx_cache=self._ptx_cache,
+            cubin_cache=self._cubin_cache,
+            linkerinfo_cache=self._linkerinfo_cache,
+            max_registers=self._max_registers,
+            nvvm_options=self._nvvm_options,
+            needs_cudadevrt=self.needs_cudadevrt
+        )
+
+    @classmethod
+    def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
+                 cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
+                 needs_cudadevrt):
+        """
+        Rebuild an instance.
+        """
+        instance = cls(codegen, name, entry_name=entry_name)
+
+        instance._llvm_strs = llvm_strs
+        instance._ptx_cache = ptx_cache
+        instance._cubin_cache = cubin_cache
+        instance._linkerinfo_cache = linkerinfo_cache
+
+        instance._max_registers = max_registers
+        instance._nvvm_options = nvvm_options
+        instance.needs_cudadevrt = needs_cudadevrt
+
+        instance._finalized = True
+
+        return instance
+
+
+class JITCUDACodegen(Codegen):
+    """
+    This codegen implementation for CUDA only generates optimized LLVM IR.
+    Generation of PTX code is done separately (see numba.cuda.compiler).
+    """
+
+    _library_class = CUDACodeLibrary
+
+    def __init__(self, module_name):
+        pass
+
+    def _create_empty_module(self, name):
+        ir_module = ir.Module(name)
+        ir_module.triple = CUDA_TRIPLE
+        ir_module.data_layout = nvvm.NVVM().data_layout
+        nvvm.add_ir_version(ir_module)
+        return ir_module
+
+    def _add_module(self, module):
+        pass
+
+    def magic_tuple(self):
+        """
+        Return a tuple unambiguously describing the codegen behaviour.
+        """
+        ctx = devices.get_context()
+        cc = ctx.device.compute_capability
+        return (runtime.runtime.get_version(), cc)