Source code for parallelproj.backend

from __future__ import annotations

import os
import importlib
import distutils
import math

import ctypes
from ctypes import POINTER
from ctypes.util import find_library

from pathlib import Path
from warnings import warn

import numpy as np
import numpy.ctypeslib as npct
import numpy.typing as npt

import array_api_compat

# check if cuda is present
cuda_present = distutils.spawn.find_executable('nvidia-smi') is not None

# check if cupy is available
cupy_enabled = (importlib.util.find_spec('cupy') is not None)

# check if cupy is available
torch_enabled = (importlib.util.find_spec('torch') is not None)

# define type for cupy or numpy array
if cupy_enabled:
    import cupy as cp

# numpy ctypes lib array definitions
ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags='C')
ar_1d_int = npct.ndpointer(dtype=ctypes.c_int, ndim=1, flags='C')
ar_1d_short = npct.ndpointer(dtype=ctypes.c_short, ndim=1, flags='C')

#---------------------------------------------------------------------------------------
#---- find the compiled C / CUDA libraries

lib_parallelproj_c_fname = None
if 'PARALLELPROJ_C_LIB' in os.environ:
    lib_parallelproj_c_fname = os.environ['PARALLELPROJ_C_LIB']
else:
    lib_parallelproj_c_fname = find_library('parallelproj_c')

if lib_parallelproj_c_fname is None:
    raise ImportError(
        'Cannot find parallelproj c lib. Consider setting the environment variable PARALLELPROJ_C_LIB.'
    )
else:
    lib_parallelproj_c = npct.load_library(
        os.path.basename(lib_parallelproj_c_fname),
        os.path.dirname(lib_parallelproj_c_fname))

    lib_parallelproj_c.joseph3d_fwd.restype = None
    lib_parallelproj_c.joseph3d_fwd.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_ulonglong,  # nlors
        ar_1d_int  # img_dim
    ]

    lib_parallelproj_c.joseph3d_back.restype = None
    lib_parallelproj_c.joseph3d_back.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p            
        ctypes.c_ulonglong,  # nlors
        ar_1d_int  # img_dim
    ]
    lib_parallelproj_c.joseph3d_fwd_tof_sino.restype = None
    lib_parallelproj_c.joseph3d_fwd_tof_sino.argtypes = [
        ar_1d_single,  #  xstart
        ar_1d_single,  #  xend
        ar_1d_single,  #  img
        ar_1d_single,  #  img_origin
        ar_1d_single,  #  voxsize
        ar_1d_single,  #  p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ctypes.c_short,  # n_tofbins
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_back_tof_sino.restype = None
    lib_parallelproj_c.joseph3d_back_tof_sino.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ctypes.c_short,  # n_tofbins
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_fwd_tof_lm.restype = None
    lib_parallelproj_c.joseph3d_fwd_tof_lm.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ar_1d_short,  # tof bin
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_back_tof_lm.restype = None
    lib_parallelproj_c.joseph3d_back_tof_lm.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ar_1d_short,  # tof bin
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

#---------------------------------------------------------------------------------------

num_visible_cuda_devices = 0

if cuda_present:
    if 'PARALLELPROJ_CUDA_LIB' in os.environ:
        lib_parallelproj_cuda_fname = os.environ['PARALLELPROJ_CUDA_LIB']
    else:
        lib_parallelproj_cuda_fname = find_library('parallelproj_cuda')

    if lib_parallelproj_cuda_fname is None:
        raise ImportError(
            'Cannot find parallelproj cuda lib. Consider settting the environment variable PARALLELPROJ_CUDA_LIB.'
        )
    else:
        lib_parallelproj_cuda = npct.load_library(
            os.path.basename(lib_parallelproj_cuda_fname),
            os.path.dirname(lib_parallelproj_cuda_fname))

        # get the number of visible cuda devices
        lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
        num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count(
        )

        if (num_visible_cuda_devices == 0) and cupy_enabled:
            cupy_enabled = False

        lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_int  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_back_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_int  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_end
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ctypes.c_short,  # n_tofbins
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_end
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ctypes.c_short,  # n_tofbins
            ctypes.c_ubyte,  # LOR dep.TOF sigma
            ctypes.c_ubyte,  # LOR dep.TOF center offset
            ctypes.c_int
        ]  # threads per block

        lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ar_1d_short,  # tof bin
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int
        ]  # threads per block

        lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ar_1d_short,  # tof bin
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int
        ]  # threads per block

        lib_parallelproj_cuda.copy_float_array_to_all_devices.restype = POINTER(
            POINTER(ctypes.c_float))
        lib_parallelproj_cuda.copy_float_array_to_all_devices.argtypes = [
            ar_1d_single,  # h_array
            ctypes.c_longlong  # n
        ]

        lib_parallelproj_cuda.free_float_array_on_all_devices.restype = None
        lib_parallelproj_cuda.free_float_array_on_all_devices.argtypes = [
            POINTER(POINTER(ctypes.c_float))  # d_array
        ]

        lib_parallelproj_cuda.sum_float_arrays_on_first_device.restype = None
        lib_parallelproj_cuda.sum_float_arrays_on_first_device.argtypes = [
            POINTER(POINTER(ctypes.c_float)),  # d_array
            ctypes.c_longlong  # n
        ]

        lib_parallelproj_cuda.get_float_array_from_device.restype = None
        lib_parallelproj_cuda.get_float_array_from_device.argtypes = [
            POINTER(POINTER(ctypes.c_float)),  # d_array
            ctypes.c_longlong,  # n
            ctypes.c_int,  # i_dev
            ar_1d_single  # h_array
        ]

    #---------------------------------------------------------------------------------------
    if cupy_enabled:
        # find all cuda kernel files installed with the parallelproj libs
        cuda_kernel_files = sorted(
            list((Path(lib_parallelproj_cuda_fname).parents[1] /
                  'lib').glob('projector_kernels.cu.*')))
        if len(cuda_kernel_files) == 1:
            cuda_kernel_file = cuda_kernel_files[0]
        elif len(cuda_kernel_files) > 1:
            cuda_kernel_file = cuda_kernel_files[-1]
            warn('More than one kernel file available.')
        else:
            raise ImportError('No kernel file found.')

        if cuda_kernel_file is not None:
            import cupy as cp

            # load a kernel defined in a external file
            with open(cuda_kernel_file, 'r', encoding='utf8') as f:
                lines = f.read()

            _joseph3d_fwd_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_fwd_cuda_kernel')
            _joseph3d_back_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_back_cuda_kernel')
            _joseph3d_fwd_tof_sino_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_fwd_tof_sino_cuda_kernel')
            _joseph3d_back_tof_sino_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_back_tof_sino_cuda_kernel')
            _joseph3d_fwd_tof_lm_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_fwd_tof_lm_cuda_kernel')
            _joseph3d_back_tof_lm_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_back_tof_lm_cuda_kernel')
        else:
            warn('cannot find cuda kernel file for cupy kernels')


def calc_chunks(nLORs: int | np.int64,
                num_chunks: int) -> list[int] | list[np.int64]:
    """ calculate indices to split an array of length nLORs into num_chunks chunks

        example: splitting an array of length 10 into 3 chunks returns [0,4,7,10]
    """
    rem = nLORs % num_chunks
    div = (nLORs // num_chunks)

    chunks = [0]

    for i in range(num_chunks):
        if i < rem:
            nLORs_chunck = div + 1
        else:
            nLORs_chunck = div

        chunks.append(chunks[i] + nLORs_chunck)

    return chunks


def is_cuda_array(x: npt.ArrayLike) -> bool:
    """test whether an array is a cuda array

    Parameters
    ----------
    x : Array (numpy, cupy, torch, ...)
        array to be tested

    Returns
    -------
    bool
    """    

    iscuda = False

    if 'cupy' in array_api_compat.get_namespace(x).__name__:
        iscuda = True
    elif 'torch' in array_api_compat.get_namespace(x).__name__:
        if array_api_compat.device(x).type == 'cuda':
            iscuda = True

    return iscuda


[docs]def joseph3d_fwd(xstart: npt.ArrayLike, xend: npt.ArrayLike, img: npt.ArrayLike, img_origin: npt.ArrayLike, voxsize: npt.ArrayLike, threadsperblock: int = 32, num_chunks: int = 1) -> npt.ArrayLike: """Non-TOF Joseph 3D forward projector Parameters ---------- xstart : npt.ArrayLike (numpy/cupy array or torch tensor) start world coordinates of the LORs, shape (nLORs, 3) xend : npt.ArrayLike (numpy/cupy array or torch tensor) end world coordinates of the LORs, shape (nLORs, 3) img : npt.ArrayLike (numpy/cupy array or torch tensor) containing the 3D image to be projected img_origin : npt.ArrayLike (numpy/cupy array or torch tensor) containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : npt.ArrayLike (numpy/cupy array or torch tensor) array containing the voxel size threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img) if is_cuda_array(img): # projection of GPU array (cupy to torch GPU array) using the cupy raw kernel img_fwd = cp.zeros(xstart.shape[:-1], dtype=cp.float32) _joseph3d_fwd_cuda_kernel( (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ), (cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), cp.asarray(img, dtype=cp.float32).ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), img_fwd.ravel(), nLORs, cp.asarray(img.shape, dtype=cp.int32))) cp.cuda.Device().synchronize() else: img_fwd = np.zeros(xstart.shape[:-1], dtype=np.float32) # projection of CPU array (numpy to torch CPU array) if num_visible_cuda_devices > 0: # projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(img)) # send image to all devices d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( np.asarray(img, dtype=np.float32).ravel(), num_voxel) # split call to GPU lib into chunks (useful for systems with limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): lib_parallelproj_cuda.joseph3d_fwd_cuda( np.asarray(xstart, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), np.asarray(xend, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel()[ic[i]:ic[i + 1]], ic[i + 1] - ic[i], np.asarray(img.shape, dtype=np.int32), threadsperblock) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_img) else: # projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_fwd( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), np.asarray(img, dtype=np.float32).ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel(), nLORs, np.asarray(img.shape, dtype=np.int32)) return xp.asarray(img_fwd, device=array_api_compat.device(img))
[docs]def joseph3d_back(xstart: npt.ArrayLike, xend: npt.ArrayLike, img_shape: tuple[int, int, int], img_origin: npt.ArrayLike, voxsize: npt.ArrayLike, img_fwd: npt.ArrayLike, threadsperblock: int = 32, num_chunks: int = 1) -> npt.ArrayLike: """Non-TOF Joseph 3D back projector Parameters ---------- xstart : npt.ArrayLike (numpy/cupy array or torch tensor) start world coordinates of the LORs, shape (nLORs, 3) xend : npt.ArrayLike (numpy/cupy array or torch tensor) end world coordinates of the LORs, shape (nLORs, 3) img_shape : tuple[int, int, int] the shape of the back projected image img_origin : npt.ArrayLike (numpy/cupy array or torch tensor) containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : npt.ArrayLike (numpy/cupy array or torch tensor) array containing the voxel size img_fwd : npt.ArrayLike (numpy/cupy array or torch tensor) array of length nLORs containing the values to be back projected threadsperblock : int, optional by default 32 num_chunks : int, optional break down the back projection in hybrid mode into chunks to save memory on the GPU, by default 1 """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img_fwd) if (is_cuda_array(img_fwd)): # back projection of cupy or torch GPU array using the cupy raw kernel back_img = cp.zeros(img_shape, dtype=cp.float32) _joseph3d_back_cuda_kernel( (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ), (cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), cp.asarray(img_fwd, dtype=cp.float32).ravel(), nLORs, cp.asarray(back_img.shape, dtype=cp.int32))) cp.cuda.Device().synchronize() else: # back projection of numpy or torch CPU array back_img = np.zeros(img_shape, dtype=np.float32) if num_visible_cuda_devices > 0: # back projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(back_img)) # send image to all devices d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( back_img.ravel(), num_voxel) # split call to GPU lib into chunks (useful for systems with limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): lib_parallelproj_cuda.joseph3d_back_cuda( np.asarray(xstart, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), np.asarray(xend, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_back_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel()[ic[i]:ic[i + 1]], ic[i + 1] - ic[i], np.asarray(back_img.shape, dtype=np.int32), threadsperblock) # sum all device arrays in the first device lib_parallelproj_cuda.sum_float_arrays_on_first_device( d_back_img, num_voxel) # copy summed image back from first device lib_parallelproj_cuda.get_float_array_from_device( d_back_img, num_voxel, 0, back_img.ravel()) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img) else: # back projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_back( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel(), nLORs, np.asarray(back_img.shape, dtype=np.int32)) return xp.asarray(back_img, device=array_api_compat.device(img_fwd))
[docs]def joseph3d_fwd_tof_sino(xstart: npt.ArrayLike, xend: npt.ArrayLike, img: npt.ArrayLike, img_origin: npt.ArrayLike, voxsize: npt.ArrayLike, tofbin_width: float, sigma_tof: npt.ArrayLike, tofcenter_offset: npt.ArrayLike, nsigmas: float, ntofbins: int, threadsperblock: int = 32, num_chunks: int = 1) -> npt.ArrayLike: """TOF Joseph 3D sinogram forward projector Parameters ---------- xstart : npt.ArrayLike (numpy/cupy array or torch tensor) start world coordinates of the LORs, shape (nLORs, 3) xend : npt.ArrayLike (numpy/cupy array or torch tensor) end world coordinates of the LORs, shape (nLORs, 3) img : npt.ArrayLike (numpy/cupy array or torch tensor) containing the 3D image to be projected img_origin : npt.ArrayLike (numpy/cupy array or torch tensor) containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : npt.ArrayLike (numpy/cupy array or torch tensor) array containing the voxel size tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor) sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length nLORs -> LOR dependent sigma tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor) center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all LORs or an array of length nLORs -> LOR dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) ntofbins: int total number of TOF bins threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- npt.ArrayLike (numpy/cupy array or torch tensor) """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img) lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8( tofcenter_offset.shape[0] == nLORs) if (is_cuda_array(img)): # projection of cupy or torch GPU array using the cupy raw kernel img_fwd = cp.zeros(xstart.shape[:-1] + (ntofbins, ), dtype=cp.float32) _joseph3d_fwd_tof_sino_cuda_kernel( (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ), (cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), cp.asarray(img, dtype=cp.float32).ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), img_fwd.ravel(), nLORs, cp.asarray(img.shape, dtype=cp.int32), np.int16(ntofbins), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32).ravel(), cp.asarray(tofcenter_offset, dtype=cp.float32).ravel(), np.float32(nsigmas), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)) cp.cuda.Device().synchronize() else: # projection of numpy or torch CPU array img_fwd = np.zeros(xstart.shape[:-1] + (ntofbins, ), dtype=np.float32) if num_visible_cuda_devices > 0: # projection using libparallelproj_cuda num_voxel = ctypes.c_longlong(array_api_compat.size(img)) # send image to all devices d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( np.asarray(img, dtype=np.float32).ravel(), num_voxel) # split call to GPU lib into chunks (useful for systems with limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda( np.asarray(xstart, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), np.asarray(xend, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel()[ic[i]:(ic[i + 1])], ic[i + 1] - ic[i], np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel()[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32).ravel()[ioff0:ioff1], np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_img) else: # projection the openmp libparallelproj_c lib_parallelproj_c.joseph3d_fwd_tof_sino( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), np.asarray(img, dtype=np.float32).ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd.ravel(), np.int64(nLORs), np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel(), np.asarray(tofcenter_offset, dtype=np.float32).ravel(), np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset) return xp.asarray(img_fwd, device=array_api_compat.device(img))
[docs]def joseph3d_back_tof_sino(xstart: npt.ArrayLike, xend: npt.ArrayLike, img_shape: tuple[int, int, int], img_origin: npt.ArrayLike, voxsize: npt.ArrayLike, img_fwd: npt.ArrayLike, tofbin_width: float, sigma_tof: npt.ArrayLike, tofcenter_offset: npt.ArrayLike, nsigmas: float, ntofbins: int, threadsperblock: int = 32, num_chunks: int = 1) -> npt.ArrayLike: """TOF Joseph 3D sinogram back projector Parameters ---------- xstart : npt.ArrayLike (numpy/cupy array or torch tensor) start world coordinates of the LORs, shape (nLORs, 3) xend : npt.ArrayLike (numpy/cupy array or torch tensor) end world coordinates of the LORs, shape (nLORs, 3) img_shape : tuple[int, int, int] the shape of the back projected image img_origin : npt.ArrayLike (numpy/cupy array or torch tensor) containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : npt.ArrayLike (numpy/cupy array or torch tensor) array containing the voxel size img_fwd : npt.ArrayLike (numpy/cupy array or torch tensor) array of size nLOR*ntofbins containing the values to be back projected tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor) sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length nLORs -> LOR dependent sigma tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor) center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all LORs or an array of length nLORs -> LOR dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) ntofbins: int total number of TOF bins threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- npt.ArrayLike (numpy/cupy array or torch tensor) """ nLORs = np.int64(array_api_compat.size(xstart) // 3) xp = array_api_compat.get_namespace(img_fwd) lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8( tofcenter_offset.shape[0] == nLORs) if (is_cuda_array(img_fwd)): # back projection of cupy or torch GPU array using the cupy raw kernel back_img = cp.zeros(img_shape, dtype=cp.float32) _joseph3d_back_tof_sino_cuda_kernel( (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ), (cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), cp.asarray(img_fwd, dtype=cp.float32).ravel(), nLORs, cp.asarray(back_img.shape, dtype=cp.int32), np.int16(ntofbins), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32).ravel(), cp.asarray(tofcenter_offset, dtype=cp.float32).ravel(), np.float32(nsigmas), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)) cp.cuda.Device().synchronize() else: # back projection of numpy or torch CPU array back_img = np.zeros(img_shape, dtype=np.float32) if num_visible_cuda_devices > 0: # back projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(back_img)) # send image to all devices d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( back_img.ravel(), num_voxel) # split call to GPU lib into chunks (useful for systems with limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda( np.asarray(xstart, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), np.asarray(xend, dtype=np.float32).reshape( -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_back_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel()[ic[i]:ic[i + 1]], ic[i + 1] - ic[i], np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel()[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32).ravel()[ioff0:ioff1], np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock) # sum all device arrays in the first device lib_parallelproj_cuda.sum_float_arrays_on_first_device( d_back_img, num_voxel) # copy summed image back from first device lib_parallelproj_cuda.get_float_array_from_device( d_back_img, num_voxel, 0, back_img.ravel()) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img) else: # back projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_back_tof_sino( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32).ravel(), nLORs, np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32).ravel(), np.asarray(tofcenter_offset, dtype=np.float32).ravel(), np.float32(nsigmas), np.int16(ntofbins), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset) return xp.asarray(back_img, device=array_api_compat.device(img_fwd))
[docs]def joseph3d_fwd_tof_lm(xstart: npt.ArrayLike, xend: npt.ArrayLike, img: npt.ArrayLike, img_origin: npt.ArrayLike, voxsize: npt.ArrayLike, tofbin_width: float, sigma_tof: npt.ArrayLike, tofcenter_offset: npt.ArrayLike, nsigmas: float, tofbin: npt.ArrayLike, threadsperblock: int = 32, num_chunks: int = 1) -> npt.ArrayLike: """TOF Joseph 3D listmode forward projector Parameters ---------- xstart : npt.ArrayLike (numpy/cupy array or torch tensor) start world coordinates of the event LORs, shape (num_events, 3) xend : npt.ArrayLike (numpy/cupy array or torch tensor) end world coordinates of the event LORs, shape (num_events, 3) img : npt.ArrayLike (numpy/cupy array or torch tensor) containing the 3D image to be projected img_origin : npt.ArrayLike (numpy/cupy array or torch tensor) containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : npt.ArrayLike (numpy/cupy array or torch tensor) array containing the voxel size tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor) sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length nLORs -> LOR dependent sigma tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor) center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all events or an array of length num_events -> event dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) tofbin: npt.ArrayLike (numpy/cupy array or torch tensor) array containing the tof bin of the events threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- npt.ArrayLike (numpy/cupy array or torch tensor) """ nLORs = np.int64(xstart.shape[0]) xp = array_api_compat.get_namespace(img) lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8( tofcenter_offset.shape[0] == nLORs) if (is_cuda_array(img)): # projection of cupy or torch GPU array using the cupy raw kernel img_fwd = cp.zeros(nLORs, dtype=cp.float32) _joseph3d_fwd_tof_lm_cuda_kernel( (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ), (cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), cp.asarray(img, dtype=cp.float32).ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), img_fwd, nLORs, cp.asarray(img.shape, dtype=cp.int32), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32), cp.asarray(tofcenter_offset, dtype=cp.float32), np.float32(nsigmas), cp.asarray(tofbin, dtype=cp.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)) cp.cuda.Device().synchronize() else: # projection of numpy or torch CPU array img_fwd = np.zeros(nLORs, dtype=np.float32) if num_visible_cuda_devices > 0: # projection using libparallelproj_cuda num_voxel = ctypes.c_longlong(array_api_compat.size(img)) # send image to all devices d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( np.asarray(img, dtype=np.float32).ravel(), num_voxel) # split call to GPU lib into chunks (useful for systems with limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda( np.asarray(xstart, dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(), np.asarray(xend, dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(), d_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd[ic[i]:(ic[i + 1])], ic[i + 1] - ic[i], np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32)[ioff0:ioff1], np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16)[ic[i]:(ic[i + 1])], lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_img) else: # projection the openmp libparallelproj_c lib_parallelproj_c.joseph3d_fwd_tof_lm( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), np.asarray(img, dtype=np.float32).ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), img_fwd, np.int64(nLORs), np.asarray(img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32), np.asarray(tofcenter_offset, dtype=np.float32), np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset) return xp.asarray(img_fwd, device=array_api_compat.device(img))
[docs]def joseph3d_back_tof_lm(xstart: npt.ArrayLike, xend: npt.ArrayLike, img_shape: tuple[int, int, int], img_origin: npt.ArrayLike, voxsize: npt.ArrayLike, img_fwd: npt.ArrayLike, tofbin_width: float, sigma_tof: npt.ArrayLike, tofcenter_offset: npt.ArrayLike, nsigmas: float, tofbin: npt.ArrayLike, threadsperblock: int = 32, num_chunks: int = 1) -> npt.ArrayLike: """TOF Joseph 3D listmode back projector Parameters ---------- xstart : npt.ArrayLike (numpy/cupy array or torch tensor) start world coordinates of the event LORs, shape (nLORs, 3) xend : npt.ArrayLike (numpy/cupy array or torch tensor) end world coordinates of the event LORs, shape (nLORs, 3) img_shape : tuple[int, int, int] the shape of the back projected image img_origin : npt.ArrayLike (numpy/cupy array or torch tensor) containing the world coordinates of the image origin (voxel [0,0,0]) voxsize : npt.ArrayLike (numpy/cupy array or torch tensor) array containing the voxel size img_fwd : npt.ArrayLike (numpy/cupy array or torch tensor) array of size num_events containing the values to be back projected tofbin_width : float width of the TOF bin in spatial units (same units as xstart) sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor) sigma of Gaussian TOF kernel in spatial units (same units as xstart) can be an array of length 1 -> same sigma for all LORs or an array of length num_events -> event dependent sigma tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor) center offset of the central TOF bin in spatial units (same units as xstart) can be an array of length 1 -> same offset for all LORs or an array of length num_events -> event dependent offset nsigmas: float number of sigmas to consider when Gaussian kernel is evaluated (truncated) tofbin: npt.ArrayLike array with the tofbin of the events threadsperblock : int, optional by default 32 num_chunks : int, optional break down the projection in hybrid mode into chunks to save memory on the GPU, by default 1 Returns ------- npt.ArrayLike (numpy/cupy array or torch tensor) """ nLORs = np.int64(xstart.shape[0]) xp = array_api_compat.get_namespace(img_fwd) lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs) lor_dependent_tofcenter_offset = np.uint8( tofcenter_offset.shape[0] == nLORs) if (is_cuda_array(img_fwd)): # back projection of cupy or torch GPU array using the cupy raw kernel back_img = cp.zeros(img_shape, dtype=cp.float32) _joseph3d_back_tof_lm_cuda_kernel( (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ), (cp.asarray(xstart, dtype=cp.float32).ravel(), cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(), cp.asarray(img_origin, dtype=cp.float32), cp.asarray(voxsize, dtype=cp.float32), cp.asarray(img_fwd, dtype=cp.float32), nLORs, cp.asarray(back_img.shape, dtype=cp.int32), np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32), cp.asarray(tofcenter_offset, dtype=cp.float32), np.float32(nsigmas), cp.asarray(tofbin, dtype=cp.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)) cp.cuda.Device().synchronize() else: # back projection of numpy or torch CPU array back_img = np.zeros(img_shape, dtype=np.float32) if num_visible_cuda_devices > 0: # back projection of numpy array using the cuda parallelproj lib num_voxel = ctypes.c_longlong(array_api_compat.size(back_img)) # send image to all devices d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices( back_img.ravel(), num_voxel) # split call to GPU lib into chunks (useful for systems with limited memory) ic = calc_chunks(nLORs, num_chunks) for i in range(num_chunks): if lor_dependent_sigma_tof: isig0 = ic[i] isig1 = ic[i + 1] else: isig0 = 0 isig1 = 1 if lor_dependent_tofcenter_offset: ioff0 = ic[i] ioff1 = ic[i + 1] else: ioff0 = 0 ioff1 = 1 lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda( np.asarray(xstart, dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(), np.asarray(xend, dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(), d_back_img, np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32)[ic[i]:ic[i + 1]], ic[i + 1] - ic[i], np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1], np.asarray(tofcenter_offset, dtype=np.float32)[ioff0:ioff1], np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16)[ic[i]:(ic[i + 1])], lor_dependent_sigma_tof, lor_dependent_tofcenter_offset, threadsperblock) # sum all device arrays in the first device lib_parallelproj_cuda.sum_float_arrays_on_first_device( d_back_img, num_voxel) # copy summed image back from first device lib_parallelproj_cuda.get_float_array_from_device( d_back_img, num_voxel, 0, back_img.ravel()) # free image device arrays lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img) else: # back projection of numpy array using the openmp parallelproj lib lib_parallelproj_c.joseph3d_back_tof_lm( np.asarray(xstart, dtype=np.float32).ravel(), np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(), np.asarray(img_origin, dtype=np.float32), np.asarray(voxsize, dtype=np.float32), np.asarray(img_fwd, dtype=np.float32), nLORs, np.asarray(back_img.shape, dtype=np.int32), np.float32(tofbin_width), np.asarray(sigma_tof, dtype=np.float32), np.asarray(tofcenter_offset, dtype=np.float32), np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16), lor_dependent_sigma_tof, lor_dependent_tofcenter_offset) return xp.asarray(back_img, device=array_api_compat.device(img_fwd))