"""backend functions that interface the parallelproj C/CUDA libraries"""
from __future__ import annotations
import os
import shutil
import importlib.util
import math
import ctypes
from ctypes import POINTER
from ctypes.util import find_library
from pathlib import Path
from warnings import warn
import numpy as np
import array_api_compat
import numpy.ctypeslib as npct
from types import ModuleType
from typing import Union, TYPE_CHECKING
# check if cuda is present
cuda_present = shutil.which("nvidia-smi") is not None
# check if cupy is available
cupy_enabled = importlib.util.find_spec("cupy") is not None
# check if cupy is available
torch_enabled = importlib.util.find_spec("torch") is not None
if TYPE_CHECKING:
import array_api_compat.cupy as cp
import array_api_compat.torch as torch
Array = Union[np.ndarray, cp.ndarray, torch.Tensor] # Used for type checking
else:
Array = np.ndarray # Default at runtime
# numpy ctypes lib array definitions
ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags="C")
ar_1d_int = npct.ndpointer(dtype=ctypes.c_int, ndim=1, flags="C")
ar_1d_short = npct.ndpointer(dtype=ctypes.c_short, ndim=1, flags="C")
# ---------------------------------------------------------------------------------------
# ---- find the compiled C / CUDA libraries
lib_parallelproj_c_fname = None
if "PARALLELPROJ_C_LIB" in os.environ:
lib_parallelproj_c_fname = os.environ["PARALLELPROJ_C_LIB"]
else:
lib_parallelproj_c_fname = find_library("parallelproj_c")
if lib_parallelproj_c_fname is None:
raise ImportError(
"Cannot find parallelproj c lib. Consider setting the environment variable PARALLELPROJ_C_LIB."
)
else:
lib_parallelproj_c = npct.load_library(
os.path.basename(lib_parallelproj_c_fname),
os.path.dirname(lib_parallelproj_c_fname),
)
lib_parallelproj_c.joseph3d_fwd.restype = None
lib_parallelproj_c.joseph3d_fwd.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_ulonglong, # nlors
ar_1d_int, # img_dim
]
lib_parallelproj_c.joseph3d_back.restype = None
lib_parallelproj_c.joseph3d_back.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_ulonglong, # nlors
ar_1d_int, # img_dim
]
lib_parallelproj_c.joseph3d_fwd_tof_sino.restype = None
lib_parallelproj_c.joseph3d_fwd_tof_sino.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
]
lib_parallelproj_c.joseph3d_back_tof_sino.restype = None
lib_parallelproj_c.joseph3d_back_tof_sino.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
]
lib_parallelproj_c.joseph3d_fwd_tof_lm.restype = None
lib_parallelproj_c.joseph3d_fwd_tof_lm.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
]
lib_parallelproj_c.joseph3d_back_tof_lm.restype = None
lib_parallelproj_c.joseph3d_back_tof_lm.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
]
# ---------------------------------------------------------------------------------------
num_visible_cuda_devices = 0
lib_parallelproj_cuda_fname = None
cuda_kernel_file = None
if cuda_present:
if "PARALLELPROJ_CUDA_LIB" in os.environ:
lib_parallelproj_cuda_fname = os.environ["PARALLELPROJ_CUDA_LIB"]
else:
lib_parallelproj_cuda_fname = find_library("parallelproj_cuda")
if lib_parallelproj_cuda_fname is None:
raise ImportError(
"Cannot find parallelproj cuda lib. Consider settting the environment variable PARALLELPROJ_CUDA_LIB."
)
else:
lib_parallelproj_cuda = npct.load_library(
os.path.basename(lib_parallelproj_cuda_fname),
os.path.dirname(lib_parallelproj_cuda_fname),
)
# get the number of visible cuda devices
lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count()
if (num_visible_cuda_devices == 0) and cupy_enabled:
cupy_enabled = False
lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_int, # threadsperblock
]
lib_parallelproj_cuda.joseph3d_back_cuda.restype = None
lib_parallelproj_cuda.joseph3d_back_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_int, # threadsperblock
]
lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_end
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
ctypes.c_int, # threadsperblock
]
lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.restype = None
lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_end
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep.TOF sigma
ctypes.c_ubyte, # LOR dep.TOF center offset
ctypes.c_int,
] # threads per block
lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
ctypes.c_int,
] # threads per block
lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.restype = None
lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
ctypes.c_int,
] # threads per block
lib_parallelproj_cuda.copy_float_array_to_all_devices.restype = POINTER(
POINTER(ctypes.c_float)
)
lib_parallelproj_cuda.copy_float_array_to_all_devices.argtypes = [
ar_1d_single, # h_array
ctypes.c_longlong, # n
]
lib_parallelproj_cuda.free_float_array_on_all_devices.restype = None
lib_parallelproj_cuda.free_float_array_on_all_devices.argtypes = [
POINTER(POINTER(ctypes.c_float)) # d_array
]
lib_parallelproj_cuda.sum_float_arrays_on_first_device.restype = None
lib_parallelproj_cuda.sum_float_arrays_on_first_device.argtypes = [
POINTER(POINTER(ctypes.c_float)), # d_array
ctypes.c_longlong, # n
]
lib_parallelproj_cuda.get_float_array_from_device.restype = None
lib_parallelproj_cuda.get_float_array_from_device.argtypes = [
POINTER(POINTER(ctypes.c_float)), # d_array
ctypes.c_longlong, # n
ctypes.c_int, # i_dev
ar_1d_single, # h_array
]
# ---------------------------------------------------------------------------------------
if cupy_enabled:
import cupy as cp
if "PARALLELPROJ_CUDA_KERNEL_FILE" in os.environ:
cuda_kernel_file = Path(os.environ["PARALLELPROJ_CUDA_KERNEL_FILE"])
else:
# find all cuda kernel files installed with the parallelproj libs
cuda_kernel_files = sorted(
list(
(Path(lib_parallelproj_cuda_fname).parents[1] / "lib").glob(
"projector_kernels.cu.*"
)
)
)
if len(cuda_kernel_files) == 1:
cuda_kernel_file = cuda_kernel_files[0]
elif len(cuda_kernel_files) > 1:
cuda_kernel_file = cuda_kernel_files[-1]
warn("More than one kernel file available.")
else:
raise ImportError(
"No kernel file found. Consider setting the environment variable PARALLELPROJ_CUDA_KERNEL_FILE."
)
if cuda_kernel_file is not None:
# load a kernel defined in a external file
with open(cuda_kernel_file, "r", encoding="utf8") as f:
lines = f.read()
_joseph3d_fwd_cuda_kernel = cp.RawKernel(lines, "joseph3d_fwd_cuda_kernel")
_joseph3d_back_cuda_kernel = cp.RawKernel(
lines, "joseph3d_back_cuda_kernel"
)
_joseph3d_fwd_tof_sino_cuda_kernel = cp.RawKernel(
lines, "joseph3d_fwd_tof_sino_cuda_kernel"
)
_joseph3d_back_tof_sino_cuda_kernel = cp.RawKernel(
lines, "joseph3d_back_tof_sino_cuda_kernel"
)
_joseph3d_fwd_tof_lm_cuda_kernel = cp.RawKernel(
lines, "joseph3d_fwd_tof_lm_cuda_kernel"
)
_joseph3d_back_tof_lm_cuda_kernel = cp.RawKernel(
lines, "joseph3d_back_tof_lm_cuda_kernel"
)
else:
warn("cannot find cuda kernel file for cupy kernels")
[docs]
def calc_chunks(nLORs: int, num_chunks: int) -> list[int] | list[int]:
"""calculate indices to split an array of length nLORs into num_chunks chunks
example: splitting an array of length 10 into 3 chunks returns [0,4,7,10]
"""
rem = nLORs % num_chunks
div = nLORs // num_chunks
chunks = [0]
for i in range(num_chunks):
if i < rem:
nLORs_chunck = div + 1
else:
nLORs_chunck = div
chunks.append(chunks[i] + nLORs_chunck)
return chunks
[docs]
def is_cuda_array(x: Array) -> bool:
"""test whether an array is a cuda array
Parameters
----------
x : Array
array to be tested
Returns
-------
bool
"""
iscuda = False
if "cupy" in array_api_compat.get_namespace(x).__name__:
iscuda = True
elif "torch" in array_api_compat.get_namespace(x).__name__:
if array_api_compat.device(x).type == "cuda":
iscuda = True
return iscuda
[docs]
def empty_cuda_cache(xp: ModuleType) -> None:
"""Empty the CUDA cache
Parameters
----------
xp : ModuleType
array module type supporting CUDA arrays (cupy or torch)
"""
if xp.__name__ == "array_api_compat.cupy":
xp.get_default_memory_pool().free_all_blocks()
xp.get_default_pinned_memory_pool().free_all_blocks()
elif xp.__name__ == "array_api_compat.torch":
xp.torch.cuda.empty_cache()
elif xp.__name__ == "torch":
xp.cuda.empty_cache()
[docs]
def joseph3d_fwd(
xstart: Array,
xend: Array,
img: Array,
img_origin: Array,
voxsize: Array,
threadsperblock: int = 32,
num_chunks: int = 1,
) -> Array:
"""Non-TOF Joseph 3D forward projector
Parameters
----------
xstart : Array
start world coordinates of the LORs, shape (nLORs, 3)
xend : Array
end world coordinates of the LORs, shape (nLORs, 3)
img : Array
containing the 3D image to be projected
img_origin : Array
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : Array
array containing the voxel size
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
"""
nLORs = np.int64(array_api_compat.size(xstart) // 3)
xp = array_api_compat.get_namespace(img)
if is_cuda_array(img):
# projection of GPU array (cupy to torch GPU array) using the cupy raw
# kernel
img_fwd = cp.zeros(xstart.shape[:-1], dtype=cp.float32)
_joseph3d_fwd_cuda_kernel(
(math.ceil(nLORs / threadsperblock),),
(threadsperblock,),
(
cp.asarray(xstart, dtype=cp.float32).ravel(),
cp.asarray(xend, dtype=cp.float32).ravel(),
cp.asarray(img, dtype=cp.float32).ravel(),
cp.asarray(img_origin, dtype=cp.float32),
cp.asarray(voxsize, dtype=cp.float32),
img_fwd.ravel(),
nLORs,
cp.asarray(img.shape, dtype=cp.int32),
),
)
cp.cuda.Device().synchronize()
else:
img_fwd = np.zeros(xstart.shape[:-1], dtype=np.float32)
# projection of CPU array (numpy to torch CPU array)
if num_visible_cuda_devices > 0:
# projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(array_api_compat.size(img))
# send image to all devices
d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
np.asarray(img, dtype=np.float32).ravel(), num_voxel
)
# split call to GPU lib into chunks (useful for systems with
# limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
lib_parallelproj_cuda.joseph3d_fwd_cuda(
np.asarray(xstart, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
np.asarray(xend, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
d_img,
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
img_fwd.ravel()[ic[i] : ic[i + 1]],
ic[i + 1] - ic[i],
np.asarray(img.shape, dtype=np.int32),
threadsperblock,
)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
else:
# projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_fwd(
np.asarray(xstart, dtype=np.float32).ravel(),
np.asarray(xend, dtype=np.float32).ravel(),
np.asarray(img, dtype=np.float32).ravel(),
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
img_fwd.ravel(),
nLORs,
np.asarray(img.shape, dtype=np.int32),
)
return array_api_compat.to_device(
xp.from_dlpack(img_fwd), array_api_compat.device(img)
)
[docs]
def joseph3d_back(
xstart: Array,
xend: Array,
img_shape: tuple[int, int, int],
img_origin: Array,
voxsize: Array,
img_fwd: Array,
threadsperblock: int = 32,
num_chunks: int = 1,
) -> Array:
"""Non-TOF Joseph 3D back projector
Parameters
----------
xstart : Array
start world coordinates of the LORs, shape (nLORs, 3)
xend : Array
end world coordinates of the LORs, shape (nLORs, 3)
img_shape : tuple[int, int, int]
the shape of the back projected image
img_origin : Array
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : Array
array containing the voxel size
img_fwd : Array
array of length nLORs containing the values to be back projected
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the back projection in hybrid mode into chunks to
save memory on the GPU, by default 1
"""
nLORs = np.int64(array_api_compat.size(xstart) // 3)
xp = array_api_compat.get_namespace(img_fwd)
if is_cuda_array(img_fwd):
# back projection of cupy or torch GPU array using the cupy raw kernel
back_img = cp.zeros(img_shape, dtype=cp.float32)
_joseph3d_back_cuda_kernel(
(math.ceil(nLORs / threadsperblock),),
(threadsperblock,),
(
cp.asarray(xstart, dtype=cp.float32).ravel(),
cp.asarray(xend, dtype=cp.float32).ravel(),
back_img.ravel(),
cp.asarray(img_origin, dtype=cp.float32),
cp.asarray(voxsize, dtype=cp.float32),
cp.asarray(img_fwd, dtype=cp.float32).ravel(),
nLORs,
cp.asarray(back_img.shape, dtype=cp.int32),
),
)
cp.cuda.Device().synchronize()
else:
# back projection of numpy or torch CPU array
back_img = np.zeros(img_shape, dtype=np.float32)
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(array_api_compat.size(back_img))
# send image to all devices
d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
back_img.ravel(), num_voxel
)
# split call to GPU lib into chunks (useful for systems with
# limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
lib_parallelproj_cuda.joseph3d_back_cuda(
np.asarray(xstart, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
np.asarray(xend, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
d_back_img,
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
np.asarray(img_fwd, dtype=np.float32).ravel()[ic[i] : ic[i + 1]],
ic[i + 1] - ic[i],
np.asarray(back_img.shape, dtype=np.int32),
threadsperblock,
)
# sum all device arrays in the first device
lib_parallelproj_cuda.sum_float_arrays_on_first_device(
d_back_img, num_voxel
)
# copy summed image back from first device
lib_parallelproj_cuda.get_float_array_from_device(
d_back_img, num_voxel, 0, back_img.ravel()
)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back(
np.asarray(xstart, dtype=np.float32).ravel(),
np.asarray(xend, dtype=np.float32).ravel(),
back_img.ravel(),
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
np.asarray(img_fwd, dtype=np.float32).ravel(),
nLORs,
np.asarray(back_img.shape, dtype=np.int32),
)
return array_api_compat.to_device(
xp.from_dlpack(back_img), array_api_compat.device(img_fwd)
)
[docs]
def joseph3d_fwd_tof_sino(
xstart: Array,
xend: Array,
img: Array,
img_origin: Array,
voxsize: Array,
tofbin_width: float,
sigma_tof: Array,
tofcenter_offset: Array,
nsigmas: float,
ntofbins: int,
threadsperblock: int = 32,
num_chunks: int = 1,
) -> Array:
"""TOF Joseph 3D sinogram forward projector
Parameters
----------
xstart : Array
start world coordinates of the LORs, shape (nLORs, 3)
xend : Array
end world coordinates of the LORs, shape (nLORs, 3)
img : Array
containing the 3D image to be projected
img_origin : Array
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : Array
array containing the voxel size
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
sigma_tof : Array
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: Array
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length nLORs -> LOR dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
ntofbins: int
total number of TOF bins
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
Returns
-------
Array
"""
nLORs = np.int64(array_api_compat.size(xstart) // 3)
xp = array_api_compat.get_namespace(img)
lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs)
if is_cuda_array(img):
# projection of cupy or torch GPU array using the cupy raw kernel
img_fwd = cp.zeros(xstart.shape[:-1] + (ntofbins,), dtype=cp.float32)
_joseph3d_fwd_tof_sino_cuda_kernel(
(math.ceil(nLORs / threadsperblock),),
(threadsperblock,),
(
cp.asarray(xstart, dtype=cp.float32).ravel(),
cp.asarray(xend, dtype=cp.float32).ravel(),
cp.asarray(img, dtype=cp.float32).ravel(),
cp.asarray(img_origin, dtype=cp.float32),
cp.asarray(voxsize, dtype=cp.float32),
img_fwd.ravel(),
nLORs,
cp.asarray(img.shape, dtype=cp.int32),
np.int16(ntofbins),
np.float32(tofbin_width),
cp.asarray(sigma_tof, dtype=cp.float32).ravel(),
cp.asarray(tofcenter_offset, dtype=cp.float32).ravel(),
np.float32(nsigmas),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
),
)
cp.cuda.Device().synchronize()
else:
# projection of numpy or torch CPU array
img_fwd = np.zeros(xstart.shape[:-1] + (ntofbins,), dtype=np.float32)
if num_visible_cuda_devices > 0:
# projection using libparallelproj_cuda
num_voxel = ctypes.c_longlong(array_api_compat.size(img))
# send image to all devices
d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
np.asarray(img, dtype=np.float32).ravel(), num_voxel
)
# split call to GPU lib into chunks (useful for systems with
# limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda(
np.asarray(xstart, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
np.asarray(xend, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
d_img,
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
img_fwd.ravel()[ntofbins * ic[i] : ntofbins * (ic[i + 1])],
ic[i + 1] - ic[i],
np.asarray(img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32).ravel()[isig0:isig1],
np.asarray(tofcenter_offset, dtype=np.float32).ravel()[ioff0:ioff1],
np.float32(nsigmas),
np.int16(ntofbins),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
threadsperblock,
)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
else:
# projection the openmp libparallelproj_c
lib_parallelproj_c.joseph3d_fwd_tof_sino(
np.asarray(xstart, dtype=np.float32).ravel(),
np.asarray(xend, dtype=np.float32).ravel(),
np.asarray(img, dtype=np.float32).ravel(),
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
img_fwd.ravel(),
np.int64(nLORs),
np.asarray(img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32).ravel(),
np.asarray(tofcenter_offset, dtype=np.float32).ravel(),
np.float32(nsigmas),
np.int16(ntofbins),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
)
return array_api_compat.to_device(
xp.from_dlpack(img_fwd), array_api_compat.device(img)
)
[docs]
def joseph3d_back_tof_sino(
xstart: Array,
xend: Array,
img_shape: tuple[int, int, int],
img_origin: Array,
voxsize: Array,
img_fwd: Array,
tofbin_width: float,
sigma_tof: Array,
tofcenter_offset: Array,
nsigmas: float,
ntofbins: int,
threadsperblock: int = 32,
num_chunks: int = 1,
) -> Array:
"""TOF Joseph 3D sinogram back projector
Parameters
----------
xstart : Array
start world coordinates of the LORs, shape (nLORs, 3)
xend : Array
end world coordinates of the LORs, shape (nLORs, 3)
img_shape : tuple[int, int, int]
the shape of the back projected image
img_origin : Array
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : Array
array containing the voxel size
img_fwd : Array
array of size nLOR*ntofbins containing the values to be back projected
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
sigma_tof : Array
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: Array
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length nLORs -> LOR dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
ntofbins: int
total number of TOF bins
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
Returns
-------
Array
"""
nLORs = np.int64(array_api_compat.size(xstart) // 3)
xp = array_api_compat.get_namespace(img_fwd)
lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs)
if is_cuda_array(img_fwd):
# back projection of cupy or torch GPU array using the cupy raw kernel
back_img = cp.zeros(img_shape, dtype=cp.float32)
_joseph3d_back_tof_sino_cuda_kernel(
(math.ceil(nLORs / threadsperblock),),
(threadsperblock,),
(
cp.asarray(xstart, dtype=cp.float32).ravel(),
cp.asarray(xend, dtype=cp.float32).ravel(),
back_img.ravel(),
cp.asarray(img_origin, dtype=cp.float32),
cp.asarray(voxsize, dtype=cp.float32),
cp.asarray(img_fwd, dtype=cp.float32).ravel(),
nLORs,
cp.asarray(back_img.shape, dtype=cp.int32),
np.int16(ntofbins),
np.float32(tofbin_width),
cp.asarray(sigma_tof, dtype=cp.float32).ravel(),
cp.asarray(tofcenter_offset, dtype=cp.float32).ravel(),
np.float32(nsigmas),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
),
)
cp.cuda.Device().synchronize()
else:
# back projection of numpy or torch CPU array
back_img = np.zeros(img_shape, dtype=np.float32)
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(array_api_compat.size(back_img))
# send image to all devices
d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
back_img.ravel(), num_voxel
)
# split call to GPU lib into chunks (useful for systems with
# limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda(
np.asarray(xstart, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
np.asarray(xend, dtype=np.float32)
.reshape(-1, 3)[ic[i] : (ic[i + 1]), :]
.ravel(),
d_back_img,
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
np.asarray(img_fwd, dtype=np.float32).ravel()[
ntofbins * ic[i] : ntofbins * ic[i + 1]
],
ic[i + 1] - ic[i],
np.asarray(back_img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32).ravel()[isig0:isig1],
np.asarray(tofcenter_offset, dtype=np.float32).ravel()[ioff0:ioff1],
np.float32(nsigmas),
np.int16(ntofbins),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
threadsperblock,
)
# sum all device arrays in the first device
lib_parallelproj_cuda.sum_float_arrays_on_first_device(
d_back_img, num_voxel
)
# copy summed image back from first device
lib_parallelproj_cuda.get_float_array_from_device(
d_back_img, num_voxel, 0, back_img.ravel()
)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back_tof_sino(
np.asarray(xstart, dtype=np.float32).ravel(),
np.asarray(xend, dtype=np.float32).ravel(),
back_img.ravel(),
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
np.asarray(img_fwd, dtype=np.float32).ravel(),
nLORs,
np.asarray(back_img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32).ravel(),
np.asarray(tofcenter_offset, dtype=np.float32).ravel(),
np.float32(nsigmas),
np.int16(ntofbins),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
)
return array_api_compat.to_device(
xp.from_dlpack(back_img), array_api_compat.device(img_fwd)
)
[docs]
def joseph3d_fwd_tof_lm(
xstart: Array,
xend: Array,
img: Array,
img_origin: Array,
voxsize: Array,
tofbin_width: float,
sigma_tof: Array,
tofcenter_offset: Array,
nsigmas: float,
tofbin: Array,
threadsperblock: int = 32,
num_chunks: int = 1,
) -> Array:
"""TOF Joseph 3D listmode forward projector
Parameters
----------
xstart : Array
start world coordinates of the event LORs, shape (num_events, 3)
xend : Array
end world coordinates of the event LORs, shape (num_events, 3)
img : Array
containing the 3D image to be projected
img_origin : Array
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : Array
array containing the voxel size
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
sigma_tof : Array
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: Array
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all events
or an array of length num_events -> event dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
tofbin: Array
signed integer array with the tofbin of the events
the center of TOF bin 0 is assumed to be at the center of the LOR
(shifted by the tofcenter_offset)
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
Returns
-------
Array
"""
nLORs = np.int64(xstart.shape[0])
xp = array_api_compat.get_namespace(img)
if not xp.isdtype(tofbin.dtype, "integral"):
raise TypeError("tofbin must be an int array")
lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs)
if is_cuda_array(img):
# projection of cupy or torch GPU array using the cupy raw kernel
img_fwd = cp.zeros(nLORs, dtype=cp.float32)
_joseph3d_fwd_tof_lm_cuda_kernel(
(math.ceil(nLORs / threadsperblock),),
(threadsperblock,),
(
cp.asarray(xstart, dtype=cp.float32).ravel(),
cp.asarray(xend, dtype=cp.float32).ravel(),
cp.asarray(img, dtype=cp.float32).ravel(),
cp.asarray(img_origin, dtype=cp.float32),
cp.asarray(voxsize, dtype=cp.float32),
img_fwd,
nLORs,
cp.asarray(img.shape, dtype=cp.int32),
np.float32(tofbin_width),
cp.asarray(sigma_tof, dtype=cp.float32),
cp.asarray(tofcenter_offset, dtype=cp.float32),
np.float32(nsigmas),
cp.asarray(tofbin, dtype=cp.int16),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
),
)
cp.cuda.Device().synchronize()
else:
# projection of numpy or torch CPU array
img_fwd = np.zeros(nLORs, dtype=np.float32)
if num_visible_cuda_devices > 0:
# projection using libparallelproj_cuda
num_voxel = ctypes.c_longlong(array_api_compat.size(img))
# send image to all devices
d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
np.asarray(img, dtype=np.float32).ravel(), num_voxel
)
# split call to GPU lib into chunks (useful for systems with
# limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda(
np.asarray(xstart, dtype=np.float32)[
ic[i] : (ic[i + 1]), :
].ravel(),
np.asarray(xend, dtype=np.float32)[ic[i] : (ic[i + 1]), :].ravel(),
d_img,
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
img_fwd[ic[i] : (ic[i + 1])],
ic[i + 1] - ic[i],
np.asarray(img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1],
np.asarray(tofcenter_offset, dtype=np.float32)[ioff0:ioff1],
np.float32(nsigmas),
np.asarray(tofbin, dtype=np.int16)[ic[i] : (ic[i + 1])],
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
threadsperblock,
)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
else:
# projection the openmp libparallelproj_c
lib_parallelproj_c.joseph3d_fwd_tof_lm(
np.asarray(xstart, dtype=np.float32).ravel(),
np.asarray(xend, dtype=np.float32).ravel(),
np.asarray(img, dtype=np.float32).ravel(),
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
img_fwd,
np.int64(nLORs),
np.asarray(img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32),
np.asarray(tofcenter_offset, dtype=np.float32),
np.float32(nsigmas),
np.asarray(tofbin, dtype=np.int16),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
)
return array_api_compat.to_device(
xp.from_dlpack(img_fwd), array_api_compat.device(img)
)
[docs]
def joseph3d_back_tof_lm(
xstart: Array,
xend: Array,
img_shape: tuple[int, int, int],
img_origin: Array,
voxsize: Array,
img_fwd: Array,
tofbin_width: float,
sigma_tof: Array,
tofcenter_offset: Array,
nsigmas: float,
tofbin: Array,
threadsperblock: int = 32,
num_chunks: int = 1,
) -> Array:
"""TOF Joseph 3D listmode back projector
Parameters
----------
xstart : Array
start world coordinates of the event LORs, shape (nLORs, 3)
xend : Array
end world coordinates of the event LORs, shape (nLORs, 3)
img_shape : tuple[int, int, int]
the shape of the back projected image
img_origin : Array
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : Array
array containing the voxel size
img_fwd : Array
array of size num_events containing the values to be back projected
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
sigma_tof : Array
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length num_events -> event dependent sigma
tofcenter_offset: Array
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length num_events -> event dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
tofbin: Array
signed integer array with the tofbin of the events
the center of TOF bin 0 is assumed to be at the center of the LOR
(shifted by the tofcenter_offset)
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
Returns
-------
Array
"""
nLORs = np.int64(xstart.shape[0])
xp = array_api_compat.get_namespace(img_fwd)
if not xp.isdtype(tofbin.dtype, "integral"):
raise TypeError("tofbin must be an int array")
lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = np.uint8(tofcenter_offset.shape[0] == nLORs)
if is_cuda_array(img_fwd):
# back projection of cupy or torch GPU array using the cupy raw kernel
back_img = cp.zeros(img_shape, dtype=cp.float32)
_joseph3d_back_tof_lm_cuda_kernel(
(math.ceil(nLORs / threadsperblock),),
(threadsperblock,),
(
cp.asarray(xstart, dtype=cp.float32).ravel(),
cp.asarray(xend, dtype=cp.float32).ravel(),
back_img.ravel(),
cp.asarray(img_origin, dtype=cp.float32),
cp.asarray(voxsize, dtype=cp.float32),
cp.asarray(img_fwd, dtype=cp.float32),
nLORs,
cp.asarray(back_img.shape, dtype=cp.int32),
np.float32(tofbin_width),
cp.asarray(sigma_tof, dtype=cp.float32),
cp.asarray(tofcenter_offset, dtype=cp.float32),
np.float32(nsigmas),
cp.asarray(tofbin, dtype=cp.int16),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
),
)
cp.cuda.Device().synchronize()
else:
# back projection of numpy or torch CPU array
back_img = np.zeros(img_shape, dtype=np.float32)
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(array_api_compat.size(back_img))
# send image to all devices
d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
back_img.ravel(), num_voxel
)
# split call to GPU lib into chunks (useful for systems with
# limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda(
np.asarray(xstart, dtype=np.float32)[
ic[i] : (ic[i + 1]), :
].ravel(),
np.asarray(xend, dtype=np.float32)[ic[i] : (ic[i + 1]), :].ravel(),
d_back_img,
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
np.asarray(img_fwd, dtype=np.float32)[ic[i] : ic[i + 1]],
ic[i + 1] - ic[i],
np.asarray(back_img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1],
np.asarray(tofcenter_offset, dtype=np.float32)[ioff0:ioff1],
np.float32(nsigmas),
np.asarray(tofbin, dtype=np.int16)[ic[i] : (ic[i + 1])],
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
threadsperblock,
)
# sum all device arrays in the first device
lib_parallelproj_cuda.sum_float_arrays_on_first_device(
d_back_img, num_voxel
)
# copy summed image back from first device
lib_parallelproj_cuda.get_float_array_from_device(
d_back_img, num_voxel, 0, back_img.ravel()
)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back_tof_lm(
np.asarray(xstart, dtype=np.float32).ravel(),
np.asarray(xend, dtype=np.float32).ravel(),
back_img.ravel(),
np.asarray(img_origin, dtype=np.float32),
np.asarray(voxsize, dtype=np.float32),
np.asarray(img_fwd, dtype=np.float32),
nLORs,
np.asarray(back_img.shape, dtype=np.int32),
np.float32(tofbin_width),
np.asarray(sigma_tof, dtype=np.float32),
np.asarray(tofcenter_offset, dtype=np.float32),
np.float32(nsigmas),
np.asarray(tofbin, dtype=np.int16),
lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset,
)
return array_api_compat.to_device(
xp.from_dlpack(back_img), array_api_compat.device(img_fwd)
)
if cupy_enabled:
def _cupy_unique_axis0(
ar: cp.ndarray,
return_index: bool = False,
return_inverse: bool = False,
return_counts: bool = False,
):
"""analogon of numpy's unique for a 2D array along axis 0
Parameters
----------
ar : cp.ndarray
2D array
return_index : bool, optional
see np.unique, by default False
return_inverse : bool, optional
see np.unique, by default False
return_counts : bool, optional
see np.unique, by default False
Returns
-------
see numpy.unique
"""
if len(ar.shape) != 2:
raise ValueError("Input array must be 2D.")
perm = cp.lexsort(ar.T[::-2])
aux = ar[perm]
mask = cp.empty(ar.shape[0], dtype=cp.bool_)
mask[0] = True
mask[1:] = cp.any(aux[1:] != aux[:-1], axis=1)
ret = aux[mask]
if not return_index and not return_inverse and not return_counts:
return ret
ret = (ret,)
if return_index:
ret += (perm[mask],)
if return_inverse:
imask = cp.cumsum(mask) - 1
inv_idx = cp.empty(mask.shape, dtype=cp.intp)
inv_idx[perm] = imask
ret += (inv_idx,)
if return_counts:
nonzero = cp.nonzero(mask)[0] # may synchronize
idx = cp.empty((nonzero.size + 1,), dtype=nonzero.dtype)
idx[:-1] = nonzero
idx[-1] = mask.size
ret += (idx[1:] - idx[:-1],)
return ret
[docs]
def count_event_multiplicity(events: Array) -> Array:
"""Count the multiplicity of events in an LM file
Parameters
----------
events : Array
2D (integer) array of LM events of shape (num_events, num_attributes)
where the second axis encodes the event attributes
(e.g. detectors numbers and TOF bins)
Returns
-------
Array
1D array containing the multiplicity of each event
"""
xp = array_api_compat.get_namespace(events)
dev = array_api_compat.device(events)
if is_cuda_array(events):
if cupy_enabled:
tmp = _cupy_unique_axis0(
cp.asarray(events), return_counts=True, return_inverse=True
)
else:
tmp = np.unique(
to_numpy_array(events),
axis=0,
return_counts=True,
return_inverse=True,
)
else:
tmp = np.unique(events, axis=0, return_counts=True, return_inverse=True)
mu = array_api_compat.to_device(xp.from_dlpack(tmp[2][tmp[1]]), dev)
mu = xp.reshape(mu, (array_api_compat.size(mu),))
return mu
[docs]
def to_numpy_array(x: Array) -> np.ndarray:
"""convert an array to a numpy array
Parameters
----------
x : Array
input array (numpy, cupy, torch tensor)
Returns
-------
np.ndarray
"""
if is_cuda_array(x):
return np.asarray(array_api_compat.to_device(x, "cpu"))
else:
return np.asarray(x)