Source code for parallelproj.backend

"""backend functions that interface the parallelproj C/CUDA libraries"""

from __future__ import annotations

import os
import shutil
import importlib.util
import math

import ctypes
from ctypes import POINTER
from ctypes.util import find_library

from pathlib import Path
from warnings import warn

import numpy as np
import array_api_compat
import numpy.ctypeslib as npct

from types import ModuleType
from typing import Union, TYPE_CHECKING


# check if cuda is present
cuda_present = shutil.which("nvidia-smi") is not None

# check if cupy is available
cupy_enabled = importlib.util.find_spec("cupy") is not None

# check if cupy is available
torch_enabled = importlib.util.find_spec("torch") is not None

if TYPE_CHECKING:
    import array_api_compat.cupy as cp
    import array_api_compat.torch as torch

    Array = Union[np.ndarray, cp.ndarray, torch.Tensor]  # Used for type checking
else:
    Array = np.ndarray  # Default at runtime


# numpy ctypes lib array definitions
ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags="C")
ar_1d_int = npct.ndpointer(dtype=ctypes.c_int, ndim=1, flags="C")
ar_1d_short = npct.ndpointer(dtype=ctypes.c_short, ndim=1, flags="C")

# ---------------------------------------------------------------------------------------
# ---- find the compiled C / CUDA libraries

lib_parallelproj_c_fname = None
if "PARALLELPROJ_C_LIB" in os.environ:
    lib_parallelproj_c_fname = os.environ["PARALLELPROJ_C_LIB"]
else:
    lib_parallelproj_c_fname = find_library("parallelproj_c")

if lib_parallelproj_c_fname is None:
    raise ImportError(
        "Cannot find parallelproj c lib. Consider setting the environment variable PARALLELPROJ_C_LIB."
    )
else:
    lib_parallelproj_c = npct.load_library(
        os.path.basename(lib_parallelproj_c_fname),
        os.path.dirname(lib_parallelproj_c_fname),
    )

    lib_parallelproj_c.joseph3d_fwd.restype = None
    lib_parallelproj_c.joseph3d_fwd.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_ulonglong,  # nlors
        ar_1d_int,  # img_dim
    ]

    lib_parallelproj_c.joseph3d_back.restype = None
    lib_parallelproj_c.joseph3d_back.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_ulonglong,  # nlors
        ar_1d_int,  # img_dim
    ]
    lib_parallelproj_c.joseph3d_fwd_tof_sino.restype = None
    lib_parallelproj_c.joseph3d_fwd_tof_sino.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ctypes.c_short,  # n_tofbins
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte,  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_back_tof_sino.restype = None
    lib_parallelproj_c.joseph3d_back_tof_sino.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ctypes.c_short,  # n_tofbins
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte,  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_fwd_tof_lm.restype = None
    lib_parallelproj_c.joseph3d_fwd_tof_lm.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ar_1d_short,  # tof bin
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte,  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_back_tof_lm.restype = None
    lib_parallelproj_c.joseph3d_back_tof_lm.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ar_1d_short,  # tof bin
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte,  # LOR dep. TOF center offset
    ]

# ---------------------------------------------------------------------------------------

num_visible_cuda_devices = 0
lib_parallelproj_cuda_fname = None
cuda_kernel_file = None

if cuda_present:
    if "PARALLELPROJ_CUDA_LIB" in os.environ:
        lib_parallelproj_cuda_fname = os.environ["PARALLELPROJ_CUDA_LIB"]
    else:
        lib_parallelproj_cuda_fname = find_library("parallelproj_cuda")

    if lib_parallelproj_cuda_fname is None:
        raise ImportError(
            "Cannot find parallelproj cuda lib. Consider settting the environment variable PARALLELPROJ_CUDA_LIB."
        )
    else:
        lib_parallelproj_cuda = npct.load_library(
            os.path.basename(lib_parallelproj_cuda_fname),
            os.path.dirname(lib_parallelproj_cuda_fname),
        )

        # get the number of visible cuda devices
        lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
        num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count()

        if (num_visible_cuda_devices == 0) and cupy_enabled:
            cupy_enabled = False

        lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_int,  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_back_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_int,  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_end
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ctypes.c_short,  # n_tofbins
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int,  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_end
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ctypes.c_short,  # n_tofbins
            ctypes.c_ubyte,  # LOR dep.TOF sigma
            ctypes.c_ubyte,  # LOR dep.TOF center offset
            ctypes.c_int,
        ]  # threads per block

        lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ar_1d_short,  # tof bin
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int,
        ]  # threads per block

        lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ar_1d_short,  # tof bin
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int,
        ]  # threads per block

        lib_parallelproj_cuda.copy_float_array_to_all_devices.restype = POINTER(
            POINTER(ctypes.c_float)
        )
        lib_parallelproj_cuda.copy_float_array_to_all_devices.argtypes = [
            ar_1d_single,  # h_array
            ctypes.c_longlong,  # n
        ]

        lib_parallelproj_cuda.free_float_array_on_all_devices.restype = None
        lib_parallelproj_cuda.free_float_array_on_all_devices.argtypes = [
            POINTER(POINTER(ctypes.c_float))  # d_array
        ]

        lib_parallelproj_cuda.sum_float_arrays_on_first_device.restype = None
        lib_parallelproj_cuda.sum_float_arrays_on_first_device.argtypes = [
            POINTER(POINTER(ctypes.c_float)),  # d_array
            ctypes.c_longlong,  # n
        ]

        lib_parallelproj_cuda.get_float_array_from_device.restype = None
        lib_parallelproj_cuda.get_float_array_from_device.argtypes = [
            POINTER(POINTER(ctypes.c_float)),  # d_array
            ctypes.c_longlong,  # n
            ctypes.c_int,  # i_dev
            ar_1d_single,  # h_array
        ]

    # ---------------------------------------------------------------------------------------
    if cupy_enabled:
        import cupy as cp

        if "PARALLELPROJ_CUDA_KERNEL_FILE" in os.environ:
            cuda_kernel_file = Path(os.environ["PARALLELPROJ_CUDA_KERNEL_FILE"])
        else:
            # find all cuda kernel files installed with the parallelproj libs
            cuda_kernel_files = sorted(
                list(
                    (Path(lib_parallelproj_cuda_fname).parents[1] / "lib").glob(
                        "projector_kernels.cu.*"
                    )
                )
            )
            if len(cuda_kernel_files) == 1:
                cuda_kernel_file = cuda_kernel_files[0]
            elif len(cuda_kernel_files) > 1:
                cuda_kernel_file = cuda_kernel_files[-1]
                warn("More than one kernel file available.")
            else:
                raise ImportError(
                    "No kernel file found. Consider setting the environment variable PARALLELPROJ_CUDA_KERNEL_FILE."
                )

        if cuda_kernel_file is not None:
            # load a kernel defined in a external file
            with open(cuda_kernel_file, "r", encoding="utf8") as f:
                lines = f.read()

            _joseph3d_fwd_cuda_kernel = cp.RawKernel(lines, "joseph3d_fwd_cuda_kernel")
            _joseph3d_back_cuda_kernel = cp.RawKernel(
                lines, "joseph3d_back_cuda_kernel"
            )
            _joseph3d_fwd_tof_sino_cuda_kernel = cp.RawKernel(
                lines, "joseph3d_fwd_tof_sino_cuda_kernel"
            )
            _joseph3d_back_tof_sino_cuda_kernel = cp.RawKernel(
                lines, "joseph3d_back_tof_sino_cuda_kernel"
            )
            _joseph3d_fwd_tof_lm_cuda_kernel = cp.RawKernel(
                lines, "joseph3d_fwd_tof_lm_cuda_kernel"
            )
            _joseph3d_back_tof_lm_cuda_kernel = cp.RawKernel(
                lines, "joseph3d_back_tof_lm_cuda_kernel"
            )
        else:
            warn("cannot find cuda kernel file for cupy kernels")


[docs] def calc_chunks(nLORs: int, num_chunks: int) -> list[int] | list[int]: """calculate indices to split an array of length nLORs into num_chunks chunks example: splitting an array of length 10 into 3 chunks returns [0,4,7,10] """ rem = nLORs % num_chunks div = nLORs // num_chunks chunks = [0] for i in range(num_chunks): if i < rem: nLORs_chunck = div + 1 else: nLORs_chunck = div chunks.append(chunks[i] + nLORs_chunck) return chunks
[docs] def is_cuda_array(x: Array) -> bool: """test whether an array is a cuda array Parameters ---------- x : Array array to be tested Returns ------- bool """ iscuda = False if "cupy" in array_api_compat.get_namespace(x).__name__: iscuda = True elif "torch" in array_api_compat.get_namespace(x).__name__: if array_api_compat.device(x).type == "cuda": iscuda = True return iscuda
[docs] def empty_cuda_cache(xp: ModuleType) -> None: """Empty the CUDA cache Parameters ---------- xp : ModuleType array module type supporting CUDA arrays (cupy or torch) """ if xp.__name__ == "array_api_compat.cupy": xp.get_default_memory_pool().free_all_blocks() xp.get_default_pinned_memory_pool().free_all_blocks() elif xp.__name__ == "array_api_compat.torch": xp.torch.cuda.empty_cache() elif xp.__name__ == "torch": xp.cuda.empty_cache()
[docs] def joseph3d_fwd( xstart: Array, xend: Array, img: Array, img_origin: Array, voxsize: Array, threadsperblock: int = 32, num_chunks: int = 1, ) -> Array: """Non-TOF Joseph 3D forward projector Parameters ---------- xstart : Array start world coordinates of the LORs, shape (nLORs, 3) xend : Array end world coordinates of the LORs, shape (nLORs, 3) img : Array containing the 3D image to be projected img_origin : Array containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : Array array containing the voxel size threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img) if is_cuda_array(img): # projection of GPU array (cupy to torch GPU array) using the cupy raw # kernel img_fwd = cp.zeros(xstart.shape[:-1], dtype=cp.float32) _joseph3d_fwd_cuda_kernel( (math.ceil(nLORs / threadsperblock),), (threadsperblock,), ( cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), cp.asarray(img, dtype=cp.float32).ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), img_fwd.ravel(), nLORs, cp.asarray(img.shape, dtype=cp.int32), ), ) cp.cuda.Device().synchronize() else: img_fwd = np.zeros(xstart.shape[:-1], dtype=np.float32) # projection of CPU array (numpy to torch CPU array) if num_visible_cuda_devices > 0: # projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(img)) # send image to all devices d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( np.asarray(img, dtype=np.float32).ravel(), num_voxel ) # split call to GPU lib into chunks (useful for systems with # limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): lib_parallelproj_cuda.joseph3d_fwd_cuda( np.asarray(xstart, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), np.asarray(xend, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), d_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel()[ic[i] : ic[i + 1]], ic[i + 1] - ic[i], np.asarray(img.shape, dtype=np.int32), threadsperblock, ) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_img) else: # projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_fwd( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), np.asarray(img, dtype=np.float32).ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel(), nLORs, np.asarray(img.shape, dtype=np.int32), ) return array_api_compat.to_device( xp.from_dlpack(img_fwd), array_api_compat.device(img) )
[docs] def joseph3d_back( xstart: Array, xend: Array, img_shape: tuple[int, int, int], img_origin: Array, voxsize: Array, img_fwd: Array, threadsperblock: int = 32, num_chunks: int = 1, ) -> Array: """Non-TOF Joseph 3D back projector Parameters ---------- xstart : Array start world coordinates of the LORs, shape (nLORs, 3) xend : Array end world coordinates of the LORs, shape (nLORs, 3) img_shape : tuple[int, int, int] the shape of the back projected image img_origin : Array containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : Array array containing the voxel size img_fwd : Array array of length nLORs containing the values to be back projected threadsperblock : int, optional by default 32 num_chunks : int, optional break down the back projection in hybrid mode into chunks to save memory on the GPU, by default 1 """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img_fwd) if is_cuda_array(img_fwd): # back projection of cupy or torch GPU array using the cupy raw kernel back_img = cp.zeros(img_shape, dtype=cp.float32) _joseph3d_back_cuda_kernel( (math.ceil(nLORs / threadsperblock),), (threadsperblock,), ( cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), cp.asarray(img_fwd, dtype=cp.float32).ravel(), nLORs, cp.asarray(back_img.shape, dtype=cp.int32), ), ) cp.cuda.Device().synchronize() else: # back projection of numpy or torch CPU array back_img = np.zeros(img_shape, dtype=np.float32) if num_visible_cuda_devices > 0: # back projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(back_img)) # send image to all devices d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( back_img.ravel(), num_voxel ) # split call to GPU lib into chunks (useful for systems with # limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): lib_parallelproj_cuda.joseph3d_back_cuda( np.asarray(xstart, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), np.asarray(xend, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), d_back_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel()[ic[i] : ic[i + 1]], ic[i + 1] - ic[i], np.asarray(back_img.shape, dtype=np.int32), threadsperblock, ) # sum all device arrays in the first device lib_parallelproj_cuda.sum_float_arrays_on_first_device( d_back_img, num_voxel ) # copy summed image back from first device lib_parallelproj_cuda.get_float_array_from_device( d_back_img, num_voxel, 0, back_img.ravel() ) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img) else: # back projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_back( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel(), nLORs, np.asarray(back_img.shape, dtype=np.int32), ) return array_api_compat.to_device( xp.from_dlpack(back_img), array_api_compat.device(img_fwd) )
[docs] def joseph3d_fwd_tof_sino( xstart: Array, xend: Array, img: Array, img_origin: Array, voxsize: Array, tofbin_width: float, sigma_tof: Array, tofcenter_offset: Array, nsigmas: float, ntofbins: int, threadsperblock: int = 32, num_chunks: int = 1, ) -> Array: """TOF Joseph 3D sinogram forward projector Parameters ---------- xstart : Array start world coordinates of the LORs, shape (nLORs, 3) xend : Array end world coordinates of the LORs, shape (nLORs, 3) img : Array containing the 3D image to be projected img_origin : Array containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : Array array containing the voxel size tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : Array sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length nLORs -> LOR dependent sigma tofcenter_offset: Array center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all LORs or an array of length nLORs -> LOR dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) ntofbins: int total number of TOF bins threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- Array """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img) lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs) if is_cuda_array(img): # projection of cupy or torch GPU array using the cupy raw kernel img_fwd = cp.zeros(xstart.shape[:-1] + (ntofbins,), dtype=cp.float32) _joseph3d_fwd_tof_sino_cuda_kernel( (math.ceil(nLORs / threadsperblock),), (threadsperblock,), ( cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), cp.asarray(img, dtype=cp.float32).ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), img_fwd.ravel(), nLORs, cp.asarray(img.shape, dtype=cp.int32), np.int16(ntofbins), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32).ravel(), cp.asarray(tofcenter_offset, dtype=cp.float32).ravel(), np.float32(nsigmas), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ), ) cp.cuda.Device().synchronize() else: # projection of numpy or torch CPU array img_fwd = np.zeros(xstart.shape[:-1] + (ntofbins,), dtype=np.float32) if num_visible_cuda_devices > 0: # projection using libparallelproj_cuda num_voxel = ctypes.c_longlong(array_api_compat.size(img)) # send image to all devices d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( np.asarray(img, dtype=np.float32).ravel(), num_voxel ) # split call to GPU lib into chunks (useful for systems with # limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda( np.asarray(xstart, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), np.asarray(xend, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), d_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel()[ntofbins * ic[i] : ntofbins * (ic[i + 1])], ic[i + 1] - ic[i], np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel()[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32).ravel()[ioff0:ioff1], np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock, ) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_img) else: # projection the openmp libparallelproj_c lib_parallelproj_c.joseph3d_fwd_tof_sino( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), np.asarray(img, dtype=np.float32).ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel(), np.int64(nLORs), np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel(), np.asarray(tofcenter_offset, dtype=np.float32).ravel(), np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ) return array_api_compat.to_device( xp.from_dlpack(img_fwd), array_api_compat.device(img) )
[docs] def joseph3d_back_tof_sino( xstart: Array, xend: Array, img_shape: tuple[int, int, int], img_origin: Array, voxsize: Array, img_fwd: Array, tofbin_width: float, sigma_tof: Array, tofcenter_offset: Array, nsigmas: float, ntofbins: int, threadsperblock: int = 32, num_chunks: int = 1, ) -> Array: """TOF Joseph 3D sinogram back projector Parameters ---------- xstart : Array start world coordinates of the LORs, shape (nLORs, 3) xend : Array end world coordinates of the LORs, shape (nLORs, 3) img_shape : tuple[int, int, int] the shape of the back projected image img_origin : Array containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : Array array containing the voxel size img_fwd : Array array of size nLOR*ntofbins containing the values to be back projected tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : Array sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length nLORs -> LOR dependent sigma tofcenter_offset: Array center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all LORs or an array of length nLORs -> LOR dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) ntofbins: int total number of TOF bins threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- Array """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img_fwd) lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs) if is_cuda_array(img_fwd): # back projection of cupy or torch GPU array using the cupy raw kernel back_img = cp.zeros(img_shape, dtype=cp.float32) _joseph3d_back_tof_sino_cuda_kernel( (math.ceil(nLORs / threadsperblock),), (threadsperblock,), ( cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), cp.asarray(img_fwd, dtype=cp.float32).ravel(), nLORs, cp.asarray(back_img.shape, dtype=cp.int32), np.int16(ntofbins), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32).ravel(), cp.asarray(tofcenter_offset, dtype=cp.float32).ravel(), np.float32(nsigmas), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ), ) cp.cuda.Device().synchronize() else: # back projection of numpy or torch CPU array back_img = np.zeros(img_shape, dtype=np.float32) if num_visible_cuda_devices > 0: # back projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(back_img)) # send image to all devices d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( back_img.ravel(), num_voxel ) # split call to GPU lib into chunks (useful for systems with # limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda( np.asarray(xstart, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), np.asarray(xend, dtype=np.float32) .reshape(-1, 3)[ic[i] : (ic[i + 1]), :] .ravel(), d_back_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel()[ ntofbins * ic[i] : ntofbins * ic[i + 1] ], ic[i + 1] - ic[i], np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel()[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32).ravel()[ioff0:ioff1], np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock, ) # sum all device arrays in the first device lib_parallelproj_cuda.sum_float_arrays_on_first_device( d_back_img, num_voxel ) # copy summed image back from first device lib_parallelproj_cuda.get_float_array_from_device( d_back_img, num_voxel, 0, back_img.ravel() ) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img) else: # back projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_back_tof_sino( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel(), nLORs, np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel(), np.asarray(tofcenter_offset, dtype=np.float32).ravel(), np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ) return array_api_compat.to_device( xp.from_dlpack(back_img), array_api_compat.device(img_fwd) )
[docs] def joseph3d_fwd_tof_lm( xstart: Array, xend: Array, img: Array, img_origin: Array, voxsize: Array, tofbin_width: float, sigma_tof: Array, tofcenter_offset: Array, nsigmas: float, tofbin: Array, threadsperblock: int = 32, num_chunks: int = 1, ) -> Array: """TOF Joseph 3D listmode forward projector Parameters ---------- xstart : Array start world coordinates of the event LORs, shape (num_events, 3) xend : Array end world coordinates of the event LORs, shape (num_events, 3) img : Array containing the 3D image to be projected img_origin : Array containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : Array array containing the voxel size tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : Array sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length nLORs -> LOR dependent sigma tofcenter_offset: Array center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all events or an array of length num_events -> event dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) tofbin: Array signed integer array with the tofbin of the events the center of TOF bin 0 is assumed to be at the center of the LOR (shifted by the tofcenter_offset) threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- Array """ nLORs = np.int64(xstart.shape[0]) xp = array_api_compat.get_namespace(img) if not xp.isdtype(tofbin.dtype, "integral"): raise TypeError("tofbin must be an int array") lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs) if is_cuda_array(img): # projection of cupy or torch GPU array using the cupy raw kernel img_fwd = cp.zeros(nLORs, dtype=cp.float32) _joseph3d_fwd_tof_lm_cuda_kernel( (math.ceil(nLORs / threadsperblock),), (threadsperblock,), ( cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), cp.asarray(img, dtype=cp.float32).ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), img_fwd, nLORs, cp.asarray(img.shape, dtype=cp.int32), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32), cp.asarray(tofcenter_offset, dtype=cp.float32), np.float32(nsigmas), cp.asarray(tofbin, dtype=cp.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ), ) cp.cuda.Device().synchronize() else: # projection of numpy or torch CPU array img_fwd = np.zeros(nLORs, dtype=np.float32) if num_visible_cuda_devices > 0: # projection using libparallelproj_cuda num_voxel = ctypes.c_longlong(array_api_compat.size(img)) # send image to all devices d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( np.asarray(img, dtype=np.float32).ravel(), num_voxel ) # split call to GPU lib into chunks (useful for systems with # limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda( np.asarray(xstart, dtype=np.float32)[ ic[i] : (ic[i + 1]), : ].ravel(), np.asarray(xend, dtype=np.float32)[ic[i] : (ic[i + 1]), :].ravel(), d_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd[ic[i] : (ic[i + 1])], ic[i + 1] - ic[i], np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32)[ioff0:ioff1], np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16)[ic[i] : (ic[i + 1])], lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock, ) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_img) else: # projection the openmp libparallelproj_c lib_parallelproj_c.joseph3d_fwd_tof_lm( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), np.asarray(img, dtype=np.float32).ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd, np.int64(nLORs), np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32), np.asarray(tofcenter_offset, dtype=np.float32), np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ) return array_api_compat.to_device( xp.from_dlpack(img_fwd), array_api_compat.device(img) )
[docs] def joseph3d_back_tof_lm( xstart: Array, xend: Array, img_shape: tuple[int, int, int], img_origin: Array, voxsize: Array, img_fwd: Array, tofbin_width: float, sigma_tof: Array, tofcenter_offset: Array, nsigmas: float, tofbin: Array, threadsperblock: int = 32, num_chunks: int = 1, ) -> Array: """TOF Joseph 3D listmode back projector Parameters ---------- xstart : Array start world coordinates of the event LORs, shape (nLORs, 3) xend : Array end world coordinates of the event LORs, shape (nLORs, 3) img_shape : tuple[int, int, int] the shape of the back projected image img_origin : Array containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : Array array containing the voxel size img_fwd : Array array of size num_events containing the values to be back projected tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : Array sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length num_events -> event dependent sigma tofcenter_offset: Array center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all LORs or an array of length num_events -> event dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) tofbin: Array signed integer array with the tofbin of the events the center of TOF bin 0 is assumed to be at the center of the LOR (shifted by the tofcenter_offset) threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- Array """ nLORs = np.int64(xstart.shape[0]) xp = array_api_compat.get_namespace(img_fwd) if not xp.isdtype(tofbin.dtype, "integral"): raise TypeError("tofbin must be an int array") lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs) if is_cuda_array(img_fwd): # back projection of cupy or torch GPU array using the cupy raw kernel back_img = cp.zeros(img_shape, dtype=cp.float32) _joseph3d_back_tof_lm_cuda_kernel( (math.ceil(nLORs / threadsperblock),), (threadsperblock,), ( cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), cp.asarray(img_fwd, dtype=cp.float32), nLORs, cp.asarray(back_img.shape, dtype=cp.int32), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32), cp.asarray(tofcenter_offset, dtype=cp.float32), np.float32(nsigmas), cp.asarray(tofbin, dtype=cp.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ), ) cp.cuda.Device().synchronize() else: # back projection of numpy or torch CPU array back_img = np.zeros(img_shape, dtype=np.float32) if num_visible_cuda_devices > 0: # back projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(back_img)) # send image to all devices d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( back_img.ravel(), num_voxel ) # split call to GPU lib into chunks (useful for systems with # limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda( np.asarray(xstart, dtype=np.float32)[ ic[i] : (ic[i + 1]), : ].ravel(), np.asarray(xend, dtype=np.float32)[ic[i] : (ic[i + 1]), :].ravel(), d_back_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32)[ic[i] : ic[i + 1]], ic[i + 1] - ic[i], np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32)[ioff0:ioff1], np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16)[ic[i] : (ic[i + 1])], lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock, ) # sum all device arrays in the first device lib_parallelproj_cuda.sum_float_arrays_on_first_device( d_back_img, num_voxel ) # copy summed image back from first device lib_parallelproj_cuda.get_float_array_from_device( d_back_img, num_voxel, 0, back_img.ravel() ) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img) else: # back projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_back_tof_lm( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32), nLORs, np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32), np.asarray(tofcenter_offset, dtype=np.float32), np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, ) return array_api_compat.to_device( xp.from_dlpack(back_img), array_api_compat.device(img_fwd) )
if cupy_enabled: def _cupy_unique_axis0( ar: cp.ndarray, return_index: bool = False, return_inverse: bool = False, return_counts: bool = False, ): """analogon of numpy's unique for a 2D array along axis 0 Parameters ---------- ar : cp.ndarray 2D array return_index : bool, optional see np.unique, by default False return_inverse : bool, optional see np.unique, by default False return_counts : bool, optional see np.unique, by default False Returns ------- see numpy.unique """ if len(ar.shape) != 2: raise ValueError("Input array must be 2D.") perm = cp.lexsort(ar.T[::-2]) aux = ar[perm] mask = cp.empty(ar.shape[0], dtype=cp.bool_) mask[0] = True mask[1:] = cp.any(aux[1:] != aux[:-1], axis=1) ret = aux[mask] if not return_index and not return_inverse and not return_counts: return ret ret = (ret,) if return_index: ret += (perm[mask],) if return_inverse: imask = cp.cumsum(mask) - 1 inv_idx = cp.empty(mask.shape, dtype=cp.intp) inv_idx[perm] = imask ret += (inv_idx,) if return_counts: nonzero = cp.nonzero(mask)[0] # may synchronize idx = cp.empty((nonzero.size + 1,), dtype=nonzero.dtype) idx[:-1] = nonzero idx[-1] = mask.size ret += (idx[1:] - idx[:-1],) return ret
[docs] def count_event_multiplicity(events: Array) -> Array: """Count the multiplicity of events in an LM file Parameters ---------- events : Array 2D (integer) array of LM events of shape (num_events, num_attributes) where the second axis encodes the event attributes (e.g. detectors numbers and TOF bins) Returns ------- Array 1D array containing the multiplicity of each event """ xp = array_api_compat.get_namespace(events) dev = array_api_compat.device(events) if is_cuda_array(events): if cupy_enabled: tmp = _cupy_unique_axis0( cp.asarray(events), return_counts=True, return_inverse=True ) else: tmp = np.unique( to_numpy_array(events), axis=0, return_counts=True, return_inverse=True, ) else: tmp = np.unique(events, axis=0, return_counts=True, return_inverse=True) mu = array_api_compat.to_device(xp.from_dlpack(tmp[2][tmp[1]]), dev) mu = xp.reshape(mu, (array_api_compat.size(mu),)) return mu
[docs] def to_numpy_array(x: Array) -> np.ndarray: """convert an array to a numpy array Parameters ---------- x : Array input array (numpy, cupy, torch tensor) Returns ------- np.ndarray """ if is_cuda_array(x): return np.asarray(array_api_compat.to_device(x, "cpu")) else: return np.asarray(x)