from __future__ import annotations
import os
import importlib
import distutils
import math
import ctypes
from ctypes import POINTER
from ctypes.util import find_library
from pathlib import Path
from warnings import warn
import numpy as np
import numpy.ctypeslib as npct
import numpy.typing as npt
from typing import Union
from types import ModuleType
# check if cuda is present
cuda_present = distutils.spawn.find_executable('nvidia-smi') is not None
# check if cupy is available
cupy_enabled = (importlib.util.find_spec('cupy') is not None)
# define type for cupy or numpy array
if cupy_enabled:
import cupy as cp
import cupy.typing as cpt
XPArray = Union[npt.NDArray, cpt.NDArray]
XPFloat32Array = Union[npt.NDArray[np.float32], cpt.NDArray[np.float32]]
XPShortArray = Union[npt.NDArray[np.int16], cpt.NDArray[np.int16]]
else:
XPArray = npt.NDArray
XPFloat32Array = npt.NDArray[np.float32]
XPShortArray = npt.NDArray[np.int16]
# numpy ctypes lib array definitions
ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags='C')
ar_1d_int = npct.ndpointer(dtype=ctypes.c_int, ndim=1, flags='C')
ar_1d_short = npct.ndpointer(dtype=ctypes.c_short, ndim=1, flags='C')
#---------------------------------------------------------------------------------------
#---- find the compiled C / CUDA libraries
lib_parallelproj_c_fname = None
if 'PARALLELPROJ_C_LIB' in os.environ:
lib_parallelproj_c_fname = os.environ['PARALLELPROJ_C_LIB']
else:
lib_parallelproj_c_fname = find_library('parallelproj_c')
if lib_parallelproj_c_fname is None:
raise ImportError(
'Cannot find parallelproj c lib. Consider setting the environment variable PARALLELPROJ_C_LIB.'
)
else:
lib_parallelproj_c = npct.load_library(
os.path.basename(lib_parallelproj_c_fname),
os.path.dirname(lib_parallelproj_c_fname))
lib_parallelproj_c.joseph3d_fwd.restype = None
lib_parallelproj_c.joseph3d_fwd.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_ulonglong, # nlors
ar_1d_int # img_dim
]
lib_parallelproj_c.joseph3d_back.restype = None
lib_parallelproj_c.joseph3d_back.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_ulonglong, # nlors
ar_1d_int # img_dim
]
lib_parallelproj_c.joseph3d_fwd_tof_sino.restype = None
lib_parallelproj_c.joseph3d_fwd_tof_sino.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte # LOR dep. TOF center offset
]
lib_parallelproj_c.joseph3d_back_tof_sino.restype = None
lib_parallelproj_c.joseph3d_back_tof_sino.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte # LOR dep. TOF center offset
]
lib_parallelproj_c.joseph3d_fwd_tof_lm.restype = None
lib_parallelproj_c.joseph3d_fwd_tof_lm.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte # LOR dep. TOF center offset
]
lib_parallelproj_c.joseph3d_back_tof_lm.restype = None
lib_parallelproj_c.joseph3d_back_tof_lm.argtypes = [
ar_1d_single, # xstart
ar_1d_single, # xend
ar_1d_single, # img
ar_1d_single, # img_origin
ar_1d_single, # voxsize
ar_1d_single, # p
ctypes.c_longlong, # nlors
ar_1d_int, # img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte # LOR dep. TOF center offset
]
#---------------------------------------------------------------------------------------
num_visible_cuda_devices = 0
if cuda_present:
if 'PARALLELPROJ_CUDA_LIB' in os.environ:
lib_parallelproj_cuda_fname = os.environ['PARALLELPROJ_CUDA_LIB']
else:
lib_parallelproj_cuda_fname = find_library('parallelproj_cuda')
if lib_parallelproj_cuda_fname is None:
raise ImportError(
'Cannot find parallelproj cuda lib. Consider settting the environment variable PARALLELPROJ_CUDA_LIB.'
)
else:
lib_parallelproj_cuda = npct.load_library(
os.path.basename(lib_parallelproj_cuda_fname),
os.path.dirname(lib_parallelproj_cuda_fname))
# get the number of visible cuda devices
lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count(
)
if (num_visible_cuda_devices == 0) and cupy_enabled:
cupy_enabled = False
lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_int # threadsperblock
]
lib_parallelproj_cuda.joseph3d_back_cuda.restype = None
lib_parallelproj_cuda.joseph3d_back_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_int # threadsperblock
]
lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_end
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
ctypes.c_int # threadsperblock
]
lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.restype = None
lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_end
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ctypes.c_short, # n_tofbins
ctypes.c_ubyte, # LOR dep.TOF sigma
ctypes.c_ubyte, # LOR dep.TOF center offset
ctypes.c_int
] # threads per block
lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.restype = None
lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
ctypes.c_int
] # threads per block
lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.restype = None
lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.argtypes = [
ar_1d_single, # h_xstart
ar_1d_single, # h_xend
POINTER(POINTER(ctypes.c_float)), # d_img
ar_1d_single, # h_img_origin
ar_1d_single, # h_voxsize
ar_1d_single, # h_p
ctypes.c_longlong, # nlors
ar_1d_int, # h_img_dim
ctypes.c_float, # tofbin_width
ar_1d_single, # sigma tof
ar_1d_single, # tofcenter_offset
ctypes.c_float, # n_sigmas
ar_1d_short, # tof bin
ctypes.c_ubyte, # LOR dep. TOF sigma
ctypes.c_ubyte, # LOR dep. TOF center offset
ctypes.c_int
] # threads per block
lib_parallelproj_cuda.copy_float_array_to_all_devices.restype = POINTER(
POINTER(ctypes.c_float))
lib_parallelproj_cuda.copy_float_array_to_all_devices.argtypes = [
ar_1d_single, # h_array
ctypes.c_longlong # n
]
lib_parallelproj_cuda.free_float_array_on_all_devices.restype = None
lib_parallelproj_cuda.free_float_array_on_all_devices.argtypes = [
POINTER(POINTER(ctypes.c_float)) # d_array
]
lib_parallelproj_cuda.sum_float_arrays_on_first_device.restype = None
lib_parallelproj_cuda.sum_float_arrays_on_first_device.argtypes = [
POINTER(POINTER(ctypes.c_float)), # d_array
ctypes.c_longlong # n
]
lib_parallelproj_cuda.get_float_array_from_device.restype = None
lib_parallelproj_cuda.get_float_array_from_device.argtypes = [
POINTER(POINTER(ctypes.c_float)), # d_array
ctypes.c_longlong, # n
ctypes.c_int, # i_dev
ar_1d_single # h_array
]
#---------------------------------------------------------------------------------------
if cupy_enabled:
# find all cuda kernel files installed with the parallelproj libs
cuda_kernel_files = sorted(
list((Path(lib_parallelproj_cuda_fname).parents[1] /
'lib').glob('projector_kernels.cu.*')))
if len(cuda_kernel_files) == 1:
cuda_kernel_file = cuda_kernel_files[0]
elif len(cuda_kernel_files) > 1:
cuda_kernel_file = cuda_kernel_files[-1]
warn('More than one kernel file available.')
else:
raise ImportError('No kernel file found.')
if cuda_kernel_file is not None:
import cupy as cp
# load a kernel defined in a external file
with open(cuda_kernel_file, 'r', encoding='utf8') as f:
lines = f.read()
_joseph3d_fwd_cuda_kernel = cp.RawKernel(
lines, 'joseph3d_fwd_cuda_kernel')
_joseph3d_back_cuda_kernel = cp.RawKernel(
lines, 'joseph3d_back_cuda_kernel')
_joseph3d_fwd_tof_sino_cuda_kernel = cp.RawKernel(
lines, 'joseph3d_fwd_tof_sino_cuda_kernel')
_joseph3d_back_tof_sino_cuda_kernel = cp.RawKernel(
lines, 'joseph3d_back_tof_sino_cuda_kernel')
_joseph3d_fwd_tof_lm_cuda_kernel = cp.RawKernel(
lines, 'joseph3d_fwd_tof_lm_cuda_kernel')
_joseph3d_back_tof_lm_cuda_kernel = cp.RawKernel(
lines, 'joseph3d_back_tof_lm_cuda_kernel')
else:
warn('cannot find cuda kernel file for cupy kernels')
def calc_chunks(nLORs: int, num_chunks: int) -> list[int]:
""" calculate indices to split an array of length nLORs into num_chunks chunks
example: splitting an array of length 10 into 3 chunks returns [0,4,7,10]
"""
rem = nLORs % num_chunks
div = (nLORs // num_chunks)
chunks = [0]
for i in range(num_chunks):
if i < rem:
nLORs_chunck = div + 1
else:
nLORs_chunck = div
chunks.append(chunks[i] + nLORs_chunck)
return chunks
[docs]def joseph3d_fwd(xstart: XPFloat32Array,
xend: XPFloat32Array,
img: XPFloat32Array,
img_origin: XPFloat32Array,
voxsize: XPFloat32Array,
img_fwd: XPFloat32Array,
threadsperblock: int = 32,
num_chunks: int = 1) -> None:
"""Non-TOF Joseph 3D forward projector
Parameters
----------
xstart : XPFloat32Array (float32 numpy or cupy array)
start world coordinates of the LORs, shape (nLORs, 3)
xend : XPFloat32Array (float32 numpy or cupy array)
end world coordinates of the LORs, shape (nLORs, 3)
img : XPFloat32Array (float32 numpy or cupy array)
containing the 3D image to be projected
img_origin : XPFloat32Array (float32 numpy or cupy array)
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : XPFloat32Array (float32 numpy or cupy array)
array containing the voxel size
img_fwd : XPFloat32Array (float32 numpy or cupy array)
output array of length nLORs for storing the forward projection
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
"""
img_dim = np.array(img.shape, dtype=np.int32)
nLORs = xstart.shape[0]
# check whether the input image is a numpy or cupy array
xp = get_array_module(img)
if (xp.__name__ == 'cupy'):
# projection of cupy GPU array using the cupy raw kernel
_joseph3d_fwd_cuda_kernel(
(math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
(xstart.ravel(), xend.ravel(), img.ravel(), xp.asarray(img_origin),
xp.asarray(voxsize), img_fwd, np.int64(nLORs),
xp.asarray(img_dim)))
xp.cuda.Device().synchronize()
else:
if num_visible_cuda_devices > 0:
# projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
# send image to all devices
d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
img.ravel(), num_voxel)
# split call to GPU lib into chunks (useful for systems with limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
lib_parallelproj_cuda.joseph3d_fwd_cuda(
xstart[ic[i]:(ic[i + 1]), :].ravel(),
xend[ic[i]:(ic[i + 1]), :].ravel(), d_img, img_origin,
voxsize,
img_fwd.ravel()[ic[i]:ic[i + 1]], ic[i + 1] - ic[i],
img_dim, threadsperblock)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
else:
# projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_fwd(xstart.ravel(), xend.ravel(),
img.ravel(), img_origin, voxsize,
img_fwd.ravel(), np.int64(nLORs),
img_dim)
[docs]def joseph3d_back(xstart: XPFloat32Array,
xend: XPFloat32Array,
back_img: XPFloat32Array,
img_origin: XPFloat32Array,
voxsize: XPFloat32Array,
sino: XPFloat32Array,
threadsperblock: int = 32,
num_chunks: int = 1) -> None:
"""Non-TOF Joseph 3D back projector
Parameters
----------
xstart : XPFloat32Array (float32 numpy or cupy array)
start world coordinates of the LORs, shape (nLORs, 3)
xend : XPFloat32Array (float32 numpy or cupy array)
end world coordinates of the LORs, shape (nLORs, 3)
back_img : XPFloat32Array (float32 numpy or cupy array)
output array for the back projection
img_origin : XPFloat32Array (float32 numpy or cupy array)
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : XPFloat32Array (float32 numpy or cupy array)
array containing the voxel size
sino : XPFloat32Array (float32 numpy or cupy array)
array of length nLORs containing the values to be back projected
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the back projection in hybrid mode into chunks to
save memory on the GPU, by default 1
"""
img_dim = np.array(back_img.shape, dtype=np.int32)
nLORs = xstart.shape[0]
# check whether the input image is a numpy or cupy array
xp = get_array_module(sino)
if (xp.__name__ == 'cupy'):
# back projection of cupy GPU array using the cupy raw kernel
_joseph3d_back_cuda_kernel(
(math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
(xstart.ravel(), xend.ravel(), back_img.ravel(),
xp.asarray(img_origin), xp.asarray(voxsize), sino.ravel(),
np.int64(nLORs), xp.asarray(img_dim)))
xp.cuda.Device().synchronize()
else:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
# send image to all devices
d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
back_img.ravel(), num_voxel)
# split call to GPU lib into chunks (useful for systems with limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
lib_parallelproj_cuda.joseph3d_back_cuda(
xstart[ic[i]:(ic[i + 1]), :].ravel(),
xend[ic[i]:(ic[i + 1]), :].ravel(), d_back_img, img_origin,
voxsize,
sino.ravel()[ic[i]:ic[i + 1]], ic[i + 1] - ic[i], img_dim,
threadsperblock)
# sum all device arrays in the first device
lib_parallelproj_cuda.sum_float_arrays_on_first_device(
d_back_img, num_voxel)
# copy summed image back from first device
lib_parallelproj_cuda.get_float_array_from_device(
d_back_img, num_voxel, 0, back_img.ravel())
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back(xstart.ravel(), xend.ravel(),
back_img.ravel(), img_origin,
voxsize, sino.ravel(),
np.int64(nLORs), img_dim)
[docs]def joseph3d_fwd_tof_sino(xstart: XPFloat32Array,
xend: XPFloat32Array,
img: XPFloat32Array,
img_origin: XPFloat32Array,
voxsize: XPFloat32Array,
img_fwd: XPFloat32Array,
tofbin_width: float,
sigma_tof: XPFloat32Array,
tofcenter_offset: XPFloat32Array,
nsigmas: float,
ntofbins: int,
threadsperblock: int = 32,
num_chunks: int = 1) -> None:
"""TOF Joseph 3D sinogram forward projector
Parameters
----------
xstart : XPFloat32Array (float32 numpy or cupy array)
start world coordinates of the LORs, shape (nLORs, 3)
xend : XPFloat32Array (float32 numpy or cupy array)
end world coordinates of the LORs, shape (nLORs, 3)
img : XPFloat32Array (float32 numpy or cupy array)
containing the 3D image to be projected
img_origin : XPFloat32Array (float32 numpy or cupy array)
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : XPFloat32Array (float32 numpy or cupy array)
array containing the voxel size
img_fwd : XPFloat32Array (float32 numpy or cupy array)
output array of size (nLORs*ntofbins) for storing the forward projection
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
sigma_tof : XPFloat32Array (float32 numpy or cupy array)
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: XPFloat32Array (float32 numpy or cupy array)
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length nLORs -> LOR dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
ntofbins: int
total number of TOF bins
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
"""
img_dim = np.array(img.shape, dtype=np.int32)
nLORs = xstart.shape[0]
lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = np.uint8(
tofcenter_offset.shape[0] == nLORs)
# check whether the input image is a numpy or cupy array
xp = get_array_module(img)
if (xp.__name__ == 'cupy'):
# projection of cupy GPU array using the cupy raw kernel
_joseph3d_fwd_tof_sino_cuda_kernel(
(math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
(xstart.ravel(), xend.ravel(), img.ravel(), xp.asarray(img_origin),
xp.asarray(voxsize), img_fwd.ravel(), np.int64(nLORs),
xp.asarray(img_dim), xp.int16(ntofbins), np.float32(tofbin_width),
xp.asarray(sigma_tof.ravel()), xp.asarray(
tofcenter_offset.ravel()), xp.float32(nsigmas),
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
# send image to all devices
d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
img.ravel(), num_voxel)
# split call to GPU lib into chunks (useful for systems with limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda(
xstart[ic[i]:(ic[i + 1]), :].ravel(),
xend[ic[i]:(ic[i + 1]), :].ravel(), d_img, img_origin,
voxsize, img_fwd[ic[i]:(ic[i + 1]), :].ravel(),
np.int64(ic[i + 1] - ic[i]), img_dim,
np.float32(tofbin_width),
sigma_tof.ravel()[isig0:isig1],
tofcenter_offset.ravel()[ioff0:ioff1], np.float32(nsigmas),
np.int16(ntofbins), lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset, threadsperblock)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_fwd_tof_sino(
xstart.ravel(), xend.ravel(), img.ravel(), img_origin, voxsize,
img_fwd.ravel(), np.int64(nLORs), img_dim, tofbin_width,
sigma_tof.ravel(), tofcenter_offset.ravel(), nsigmas, ntofbins,
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)
[docs]def joseph3d_back_tof_sino(xstart: XPFloat32Array,
xend: XPFloat32Array,
back_img: XPFloat32Array,
img_origin: XPFloat32Array,
voxsize: XPFloat32Array,
sino: XPFloat32Array,
tofbin_width: float,
sigma_tof: XPFloat32Array,
tofcenter_offset: XPFloat32Array,
nsigmas: float,
ntofbins: int,
threadsperblock: int = 32,
num_chunks: int = 1) -> None:
"""TOF Joseph 3D sinogram back projector
Parameters
----------
xstart : XPFloat32Array (float32 numpy or cupy array)
start world coordinates of the LORs, shape (nLORs, 3)
xend : XPFloat32Array (float32 numpy or cupy array)
end world coordinates of the LORs, shape (nLORs, 3)
back_img : XPFloat32Array (float32 numpy or cupy array)
output array to be back projected
img_origin : XPFloat32Array (float32 numpy or cupy array)
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : XPFloat32Array (float32 numpy or cupy array)
array containing the voxel size
sino : XPFloat32Array (float32 numpy or cupy array)
TOF sinogram of size (nLORs*ntofbins) to be back projected
threadsperblock : int, optional
by default 32
sigma_tof : XPFloat32Array (float32 numpy or cupy array)
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: XPFloat32Array (float32 numpy or cupy array)
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length nLORs -> LOR dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
ntofbins: int
total number of TOF bins
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
"""
img_dim = np.array(back_img.shape, dtype=np.int32)
nLORs = xstart.shape[0]
lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = np.uint8(
tofcenter_offset.shape[0] == nLORs)
# check whether the input image is a numpy or cupy array
xp = get_array_module(sino)
if (xp.__name__ == 'cupy'):
# projection of cupy GPU array using the cupy raw kernel
_joseph3d_back_tof_sino_cuda_kernel(
(math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
(xstart.ravel(), xend.ravel(), back_img, xp.asarray(img_origin),
xp.asarray(voxsize), sino.ravel(), np.int64(nLORs),
xp.asarray(img_dim), np.int16(ntofbins), np.float32(tofbin_width),
xp.asarray(sigma_tof).ravel(),
xp.asarray(tofcenter_offset).ravel(), np.float32(nsigmas),
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
# send image to all devices
d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
back_img.ravel(), num_voxel)
# split call to GPU lib into chunks (useful for systems with limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda(
xstart[ic[i]:(ic[i + 1]), :].ravel(),
xend[ic[i]:(ic[i + 1]), :].ravel(), d_back_img, img_origin,
voxsize, sino[ic[i]:(ic[i + 1]), :].ravel(),
ic[i + 1] - ic[i], img_dim, tofbin_width,
sigma_tof.ravel()[isig0:isig1],
tofcenter_offset.ravel()[ioff0:ioff1], nsigmas, ntofbins,
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset,
threadsperblock)
# sum all device arrays in the first device
lib_parallelproj_cuda.sum_float_arrays_on_first_device(
d_back_img, num_voxel)
# copy summed image back from first device
lib_parallelproj_cuda.get_float_array_from_device(
d_back_img, num_voxel, 0, back_img.ravel())
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back_tof_sino(
xstart.ravel(), xend.ravel(), back_img.ravel(), img_origin,
voxsize, sino.ravel(), np.int64(nLORs), img_dim, tofbin_width,
sigma_tof.ravel(), tofcenter_offset.ravel(), nsigmas, ntofbins,
lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)
[docs]def joseph3d_fwd_tof_lm(xstart: XPFloat32Array,
xend: XPFloat32Array,
img: XPFloat32Array,
img_origin: XPFloat32Array,
voxsize: XPFloat32Array,
img_fwd: XPFloat32Array,
tofbin_width: float,
sigma_tof: XPFloat32Array,
tofcenter_offset: XPFloat32Array,
nsigmas: float,
tofbin: XPShortArray,
threadsperblock: int = 32,
num_chunks: int = 1) -> None:
"""TOF Joseph 3D listmode forward projector
Parameters
----------
xstart : XPFloat32Array (float32 numpy or cupy array)
start world coordinates of the LORs, shape (nLORs, 3)
xend : XPFloat32Array (float32 numpy or cupy array)
end world coordinates of the LORs, shape (nLORs, 3)
img : XPFloat32Array (float32 numpy or cupy array)
containing the 3D image to be projected
img_origin : XPFloat32Array (float32 numpy or cupy array)
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : XPFloat32Array (float32 numpy or cupy array)
array containing the voxel size
img_fwd : XPFloat32Array (float32 numpy or cupy array)
output array of size (xstart.shape[0] / nevents) for storing the forward projection
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
sigma_tof : XPFloat32Array (float32 numpy or cupy array)
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: XPFloat32Array (float32 numpy or cupy array)
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length nLORs -> LOR dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
tofbin: XPShortArray
array containing the tof bin of the events
threadsperblock : int, optional
by default 32
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
"""
img_dim = np.array(img.shape, dtype=np.int32)
nLORs = xstart.shape[0]
lor_dependent_sigma_tof = int(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = int(tofcenter_offset.shape[0] == nLORs)
# check whether the input image is a numpy or cupy array
xp = get_array_module(img)
if (xp.__name__ == 'cupy'):
# projection of cupy GPU array using the cupy raw kernel
_joseph3d_fwd_tof_lm_cuda_kernel(
(math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
(xstart.ravel(), xend.ravel(), img.ravel(), xp.asarray(img_origin),
xp.asarray(voxsize), img_fwd, np.int64(nLORs),
xp.asarray(img_dim), np.float32(tofbin_width),
xp.asarray(sigma_tof), xp.asarray(tofcenter_offset),
np.float32(nsigmas), tofbin, lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if num_visible_cuda_devices > 0:
# projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
# send image to all devices
d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
img.ravel(), num_voxel)
# split call to GPU lib into chunks (useful for systems with limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda(
xstart.ravel()[(3 * ic[i]):(3 * ic[i + 1])],
xend.ravel()[(3 * ic[i]):(3 * ic[i + 1])], d_img,
img_origin, voxsize, img_fwd[ic[i]:ic[i + 1]],
ic[i + 1] - ic[i], img_dim, tofbin_width,
sigma_tof[isig0:isig1], tofcenter_offset[ioff0:ioff1],
nsigmas, tofbin[ic[i]:ic[i + 1]], lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset, threadsperblock)
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(
d_img, num_voxel)
else:
# projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_fwd_tof_lm(
xstart.ravel(), xend.ravel(), img.ravel(), img_origin, voxsize,
img_fwd, np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
tofcenter_offset, nsigmas, tofbin, lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset)
[docs]def joseph3d_back_tof_lm(xstart: XPFloat32Array,
xend: XPFloat32Array,
back_img: XPFloat32Array,
img_origin: XPFloat32Array,
voxsize: XPFloat32Array,
lst: XPFloat32Array,
tofbin_width: float,
sigma_tof: XPFloat32Array,
tofcenter_offset: XPFloat32Array,
nsigmas: float,
tofbin: XPShortArray,
threadsperblock: int = 32,
num_chunks: int = 1) -> None:
"""TOF Joseph 3D listmode back projector
Parameters
----------
xstart : XPFloat32Array (float32 numpy or cupy array)
start world coordinates of the LORs, shape (nLORs, 3)
xend : XPFloat32Array (float32 numpy or cupy array)
end world coordinates of the LORs, shape (nLORs, 3)
back_img : XPFloat32Array (float32 numpy or cupy array)
output array to be back projected
img_origin : XPFloat32Array (float32 numpy or cupy array)
containing the world coordinates of the image origin (voxel [0,0,0])
voxsize : XPFloat32Array (float32 numpy or cupy array)
array containing the voxel size
lst : XPFloat32Array (float32 numpy or cupy array)
"list" of values to be projected
threadsperblock : int, optional
by default 32
sigma_tof : XPFloat32Array (float32 numpy or cupy array)
sigma of Gaussian TOF kernel in spatial units (same units as xstart)
can be an array of length 1 -> same sigma for all LORs
or an array of length nLORs -> LOR dependent sigma
tofcenter_offset: XPFloat32Array (float32 numpy or cupy array)
center offset of the central TOF bin in spatial units (same units as xstart)
can be an array of length 1 -> same offset for all LORs
or an array of length nLORs -> LOR dependent offset
nsigmas: float
number of sigmas to consider when Gaussian kernel is evaluated (truncated)
tofbin: XPShortArray
array containing the tof bin of the events
num_chunks : int, optional
break down the projection in hybrid mode into chunks to
save memory on the GPU, by default 1
tofbin_width : float
width of the TOF bin in spatial units (same units as xstart)
"""
img_dim = np.array(back_img.shape, dtype=np.int32)
nLORs = xstart.shape[0]
lor_dependent_sigma_tof = int(sigma_tof.shape[0] == nLORs)
lor_dependent_tofcenter_offset = int(tofcenter_offset.shape[0] == nLORs)
# check whether the input is a numpy or cupy array
xp = get_array_module(lst)
if (xp.__name__ == 'cupy'):
# projection of cupy GPU array using the cupy raw kernel
_joseph3d_back_tof_lm_cuda_kernel(
(math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
(xstart.ravel(), xend.ravel(), back_img.ravel(),
xp.asarray(img_origin), xp.asarray(voxsize), lst, np.int64(nLORs),
xp.asarray(img_dim), np.float32(tofbin_width),
xp.asarray(sigma_tof), xp.asarray(tofcenter_offset),
np.float32(nsigmas), tofbin, lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset))
xp.cuda.Device().synchronize()
else:
if num_visible_cuda_devices > 0:
# back projection of numpy array using the cuda parallelproj lib
num_voxel = ctypes.c_longlong(img_dim[0] * img_dim[1] * img_dim[2])
# send image to all devices
d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
back_img.ravel(), num_voxel)
# split call to GPU lib into chunks (useful for systems with limited memory)
ic = calc_chunks(nLORs, num_chunks)
for i in range(num_chunks):
if lor_dependent_sigma_tof:
isig0 = ic[i]
isig1 = ic[i + 1]
else:
isig0 = 0
isig1 = 1
if lor_dependent_tofcenter_offset:
ioff0 = ic[i]
ioff1 = ic[i + 1]
else:
ioff0 = 0
ioff1 = 1
lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda(
xstart.ravel()[(3 * ic[i]):(3 * ic[i + 1])],
xend.ravel()[(3 * ic[i]):(3 * ic[i + 1])], d_back_img,
img_origin, voxsize, lst[ic[i]:ic[i + 1]],
ic[i + 1] - ic[i], img_dim, tofbin_width,
sigma_tof[isig0:isig1], tofcenter_offset[ioff0:ioff1],
nsigmas, tofbin[ic[i]:ic[i + 1]], lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset, threadsperblock)
# sum all device arrays in the first device
lib_parallelproj_cuda.sum_float_arrays_on_first_device(
d_back_img, num_voxel)
# copy summed image back from first device
lib_parallelproj_cuda.get_float_array_from_device(
d_back_img, num_voxel, 0, back_img.ravel())
# free image device arrays
lib_parallelproj_cuda.free_float_array_on_all_devices(
d_back_img, num_voxel)
else:
# back projection of numpy array using the openmp parallelproj lib
lib_parallelproj_c.joseph3d_back_tof_lm(
xstart.ravel(),
xend.ravel(), back_img.ravel(), img_origin, voxsize, lst,
np.int64(nLORs), img_dim, tofbin_width, sigma_tof,
tofcenter_offset, nsigmas, tofbin, lor_dependent_sigma_tof,
lor_dependent_tofcenter_offset)
#-----------------------------------------------------------------------------
def get_array_module(array: npt.NDArray | cpt.NDArray) -> ModuleType:
"""get the module of a numpy or cupy array
Parameters
----------
array : npt.NDArray | cpt.NDArray
a numpy or cupy array
Returns
-------
ModuleType
the numpy or cupy module
"""
if cupy_enabled:
return cp.get_array_module(array)
else:
return np