Source code for parallelproj.backend

from __future__ import annotations

import os
import importlib
import distutils
import math

import ctypes
from ctypes import POINTER
from ctypes.util import find_library

from pathlib import Path
from warnings import warn

import numpy as np
import numpy.ctypeslib as npct
import numpy.typing as npt

import array_api_compat

# check if cuda is present
cuda_present = distutils.spawn.find_executable('nvidia-smi') is not None

# check if cupy is available
cupy_enabled = (importlib.util.find_spec('cupy') is not None)

# check if cupy is available
torch_enabled = (importlib.util.find_spec('torch') is not None)

# define type for cupy or numpy array
if cupy_enabled:
    import cupy as cp

# numpy ctypes lib array definitions
ar_1d_single = npct.ndpointer(dtype=ctypes.c_float, ndim=1, flags='C')
ar_1d_int = npct.ndpointer(dtype=ctypes.c_int, ndim=1, flags='C')
ar_1d_short = npct.ndpointer(dtype=ctypes.c_short, ndim=1, flags='C')

#---------------------------------------------------------------------------------------
#---- find the compiled C / CUDA libraries

lib_parallelproj_c_fname = None
if 'PARALLELPROJ_C_LIB' in os.environ:
    lib_parallelproj_c_fname = os.environ['PARALLELPROJ_C_LIB']
else:
    lib_parallelproj_c_fname = find_library('parallelproj_c')

if lib_parallelproj_c_fname is None:
    raise ImportError(
        'Cannot find parallelproj c lib. Consider setting the environment variable PARALLELPROJ_C_LIB.'
    )
else:
    lib_parallelproj_c = npct.load_library(
        os.path.basename(lib_parallelproj_c_fname),
        os.path.dirname(lib_parallelproj_c_fname))

    lib_parallelproj_c.joseph3d_fwd.restype = None
    lib_parallelproj_c.joseph3d_fwd.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p
        ctypes.c_ulonglong,  # nlors
        ar_1d_int  # img_dim
    ]

    lib_parallelproj_c.joseph3d_back.restype = None
    lib_parallelproj_c.joseph3d_back.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p            
        ctypes.c_ulonglong,  # nlors
        ar_1d_int  # img_dim
    ]
    lib_parallelproj_c.joseph3d_fwd_tof_sino.restype = None
    lib_parallelproj_c.joseph3d_fwd_tof_sino.argtypes = [
        ar_1d_single,  #  xstart
        ar_1d_single,  #  xend
        ar_1d_single,  #  img
        ar_1d_single,  #  img_origin
        ar_1d_single,  #  voxsize
        ar_1d_single,  #  p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ctypes.c_short,  # n_tofbins
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_back_tof_sino.restype = None
    lib_parallelproj_c.joseph3d_back_tof_sino.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ctypes.c_short,  # n_tofbins
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_fwd_tof_lm.restype = None
    lib_parallelproj_c.joseph3d_fwd_tof_lm.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ar_1d_short,  # tof bin
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

    lib_parallelproj_c.joseph3d_back_tof_lm.restype = None
    lib_parallelproj_c.joseph3d_back_tof_lm.argtypes = [
        ar_1d_single,  # xstart
        ar_1d_single,  # xend
        ar_1d_single,  # img
        ar_1d_single,  # img_origin
        ar_1d_single,  # voxsize
        ar_1d_single,  # p         
        ctypes.c_longlong,  # nlors
        ar_1d_int,  # img_dim
        ctypes.c_float,  # tofbin_width
        ar_1d_single,  # sigma tof
        ar_1d_single,  # tofcenter_offset
        ctypes.c_float,  # n_sigmas
        ar_1d_short,  # tof bin
        ctypes.c_ubyte,  # LOR dep. TOF sigma
        ctypes.c_ubyte  # LOR dep. TOF center offset
    ]

#---------------------------------------------------------------------------------------

num_visible_cuda_devices = 0

if cuda_present:
    if 'PARALLELPROJ_CUDA_LIB' in os.environ:
        lib_parallelproj_cuda_fname = os.environ['PARALLELPROJ_CUDA_LIB']
    else:
        lib_parallelproj_cuda_fname = find_library('parallelproj_cuda')

    if lib_parallelproj_cuda_fname is None:
        raise ImportError(
            'Cannot find parallelproj cuda lib. Consider settting the environment variable PARALLELPROJ_CUDA_LIB.'
        )
    else:
        lib_parallelproj_cuda = npct.load_library(
            os.path.basename(lib_parallelproj_cuda_fname),
            os.path.dirname(lib_parallelproj_cuda_fname))

        # get the number of visible cuda devices
        lib_parallelproj_cuda.get_cuda_device_count.restype = np.int32
        num_visible_cuda_devices = lib_parallelproj_cuda.get_cuda_device_count(
        )

        if (num_visible_cuda_devices == 0) and cupy_enabled:
            cupy_enabled = False

        lib_parallelproj_cuda.joseph3d_fwd_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_int  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_back_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_int  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_end
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ctypes.c_short,  # n_tofbins
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int  # threadsperblock
        ]

        lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_end
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ctypes.c_short,  # n_tofbins
            ctypes.c_ubyte,  # LOR dep.TOF sigma
            ctypes.c_ubyte,  # LOR dep.TOF center offset
            ctypes.c_int
        ]  # threads per block

        lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ar_1d_short,  # tof bin
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int
        ]  # threads per block

        lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.restype = None
        lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda.argtypes = [
            ar_1d_single,  # h_xstart
            ar_1d_single,  # h_xend
            POINTER(POINTER(ctypes.c_float)),  # d_img
            ar_1d_single,  # h_img_origin
            ar_1d_single,  # h_voxsize
            ar_1d_single,  # h_p
            ctypes.c_longlong,  # nlors
            ar_1d_int,  # h_img_dim
            ctypes.c_float,  # tofbin_width
            ar_1d_single,  # sigma tof
            ar_1d_single,  # tofcenter_offset
            ctypes.c_float,  # n_sigmas
            ar_1d_short,  # tof bin
            ctypes.c_ubyte,  # LOR dep. TOF sigma
            ctypes.c_ubyte,  # LOR dep. TOF center offset
            ctypes.c_int
        ]  # threads per block

        lib_parallelproj_cuda.copy_float_array_to_all_devices.restype = POINTER(
            POINTER(ctypes.c_float))
        lib_parallelproj_cuda.copy_float_array_to_all_devices.argtypes = [
            ar_1d_single,  # h_array
            ctypes.c_longlong  # n
        ]

        lib_parallelproj_cuda.free_float_array_on_all_devices.restype = None
        lib_parallelproj_cuda.free_float_array_on_all_devices.argtypes = [
            POINTER(POINTER(ctypes.c_float))  # d_array
        ]

        lib_parallelproj_cuda.sum_float_arrays_on_first_device.restype = None
        lib_parallelproj_cuda.sum_float_arrays_on_first_device.argtypes = [
            POINTER(POINTER(ctypes.c_float)),  # d_array
            ctypes.c_longlong  # n
        ]

        lib_parallelproj_cuda.get_float_array_from_device.restype = None
        lib_parallelproj_cuda.get_float_array_from_device.argtypes = [
            POINTER(POINTER(ctypes.c_float)),  # d_array
            ctypes.c_longlong,  # n
            ctypes.c_int,  # i_dev
            ar_1d_single  # h_array
        ]

    #---------------------------------------------------------------------------------------
    if cupy_enabled:
        # find all cuda kernel files installed with the parallelproj libs
        cuda_kernel_files = sorted(
            list((Path(lib_parallelproj_cuda_fname).parents[1] /
                  'lib').glob('projector_kernels.cu.*')))
        if len(cuda_kernel_files) == 1:
            cuda_kernel_file = cuda_kernel_files[0]
        elif len(cuda_kernel_files) > 1:
            cuda_kernel_file = cuda_kernel_files[-1]
            warn('More than one kernel file available.')
        else:
            raise ImportError('No kernel file found.')

        if cuda_kernel_file is not None:
            import cupy as cp

            # load a kernel defined in a external file
            with open(cuda_kernel_file, 'r', encoding='utf8') as f:
                lines = f.read()

            _joseph3d_fwd_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_fwd_cuda_kernel')
            _joseph3d_back_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_back_cuda_kernel')
            _joseph3d_fwd_tof_sino_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_fwd_tof_sino_cuda_kernel')
            _joseph3d_back_tof_sino_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_back_tof_sino_cuda_kernel')
            _joseph3d_fwd_tof_lm_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_fwd_tof_lm_cuda_kernel')
            _joseph3d_back_tof_lm_cuda_kernel = cp.RawKernel(
                lines, 'joseph3d_back_tof_lm_cuda_kernel')
        else:
            warn('cannot find cuda kernel file for cupy kernels')


def calc_chunks(nLORs: int | np.int64,
                num_chunks: int) -> list[int] | list[np.int64]:
    """ calculate indices to split an array of length nLORs into num_chunks chunks

        example: splitting an array of length 10 into 3 chunks returns [0,4,7,10]
    """
    rem = nLORs % num_chunks
    div = (nLORs // num_chunks)

    chunks = [0]

    for i in range(num_chunks):
        if i < rem:
            nLORs_chunck = div + 1
        else:
            nLORs_chunck = div

        chunks.append(chunks[i] + nLORs_chunck)

    return chunks


def is_cuda_array(x: npt.ArrayLike) -> bool:
    """test whether an array is a cuda array

    Parameters
    ----------
    x : Array (numpy, cupy, torch, ...)
        array to be tested

    Returns
    -------
    bool
    """    

    iscuda = False

    if 'cupy' in array_api_compat.get_namespace(x).__name__:
        iscuda = True
    elif 'torch' in array_api_compat.get_namespace(x).__name__:
        if array_api_compat.device(x).type == 'cuda':
            iscuda = True

    return iscuda


[docs]def joseph3d_fwd(xstart: npt.ArrayLike,
                 xend: npt.ArrayLike,
                 img: npt.ArrayLike,
                 img_origin: npt.ArrayLike,
                 voxsize: npt.ArrayLike,
                 threadsperblock: int = 32,
                 num_chunks: int = 1) -> npt.ArrayLike:
    """Non-TOF Joseph 3D forward projector

    Parameters
    ----------
    xstart : npt.ArrayLike (numpy/cupy array or torch tensor)
        start world coordinates of the LORs, shape (nLORs, 3)
    xend : npt.ArrayLike (numpy/cupy array or torch tensor)
        end world coordinates of the LORs, shape (nLORs, 3)
    img : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the 3D image to be projected
    img_origin : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the world coordinates of the image origin (voxel [0,0,0])
    voxsize : npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the voxel size
    threadsperblock : int, optional
        by default 32
    num_chunks : int, optional
        break down the projection in hybrid mode into chunks to
        save memory on the GPU, by default 1
    """
    nLORs = np.int64(array_api_compat.size(xstart) // 3)
    xp = array_api_compat.get_namespace(img)

    if is_cuda_array(img):
        # projection of GPU array (cupy to torch GPU array) using the cupy raw kernel
        img_fwd = cp.zeros(xstart.shape[:-1], dtype=cp.float32)

        _joseph3d_fwd_cuda_kernel(
            (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
            (cp.asarray(xstart, dtype=cp.float32).ravel(),
             cp.asarray(xend, dtype=cp.float32).ravel(),
             cp.asarray(img, dtype=cp.float32).ravel(),
             cp.asarray(img_origin, dtype=cp.float32),
             cp.asarray(voxsize, dtype=cp.float32), img_fwd.ravel(), nLORs,
             cp.asarray(img.shape, dtype=cp.int32)))
        cp.cuda.Device().synchronize()
    else:
        img_fwd = np.zeros(xstart.shape[:-1], dtype=np.float32)

        # projection of CPU array (numpy to torch CPU array)
        if num_visible_cuda_devices > 0:
            # projection of numpy array using the cuda parallelproj lib
            num_voxel = ctypes.c_longlong(array_api_compat.size(img))

            # send image to all devices
            d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
                np.asarray(img, dtype=np.float32).ravel(), num_voxel)

            # split call to GPU lib into chunks (useful for systems with limited memory)
            ic = calc_chunks(nLORs, num_chunks)

            for i in range(num_chunks):
                lib_parallelproj_cuda.joseph3d_fwd_cuda(
                    np.asarray(xstart, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(),
                    np.asarray(xend, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_img,
                    np.asarray(img_origin, dtype=np.float32),
                    np.asarray(voxsize, dtype=np.float32),
                    img_fwd.ravel()[ic[i]:ic[i + 1]], ic[i + 1] - ic[i],
                    np.asarray(img.shape, dtype=np.int32), threadsperblock)

            # free image device arrays
            lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
        else:
            # projection of numpy array using the openmp parallelproj lib
            lib_parallelproj_c.joseph3d_fwd(
                np.asarray(xstart, dtype=np.float32).ravel(),
                np.asarray(xend, dtype=np.float32).ravel(),
                np.asarray(img, dtype=np.float32).ravel(),
                np.asarray(img_origin, dtype=np.float32),
                np.asarray(voxsize, dtype=np.float32), img_fwd.ravel(), nLORs,
                np.asarray(img.shape, dtype=np.int32))

    return xp.asarray(img_fwd, device=array_api_compat.device(img))


[docs]def joseph3d_back(xstart: npt.ArrayLike,
                  xend: npt.ArrayLike,
                  img_shape: tuple[int, int, int],
                  img_origin: npt.ArrayLike,
                  voxsize: npt.ArrayLike,
                  img_fwd: npt.ArrayLike,
                  threadsperblock: int = 32,
                  num_chunks: int = 1) -> npt.ArrayLike:
    """Non-TOF Joseph 3D back projector

    Parameters
    ----------
    xstart : npt.ArrayLike (numpy/cupy array or torch tensor)
        start world coordinates of the LORs, shape (nLORs, 3)
    xend : npt.ArrayLike (numpy/cupy array or torch tensor)
        end world coordinates of the LORs, shape (nLORs, 3)
    img_shape : tuple[int, int, int]
        the shape of the back projected image
    img_origin : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the world coordinates of the image origin (voxel [0,0,0])
    voxsize : npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the voxel size
    img_fwd : npt.ArrayLike (numpy/cupy array or torch tensor)
        array of length nLORs containing the values to be back projected
    threadsperblock : int, optional
        by default 32
    num_chunks : int, optional
        break down the back projection in hybrid mode into chunks to
        save memory on the GPU, by default 1
    """
    nLORs = np.int64(array_api_compat.size(xstart) // 3)
    xp = array_api_compat.get_namespace(img_fwd)

    if (is_cuda_array(img_fwd)):
        # back projection of cupy or torch GPU array using the cupy raw kernel
        back_img = cp.zeros(img_shape, dtype=cp.float32)

        _joseph3d_back_cuda_kernel(
            (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
            (cp.asarray(xstart, dtype=cp.float32).ravel(),
             cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(),
             cp.asarray(img_origin, dtype=cp.float32),
             cp.asarray(voxsize, dtype=cp.float32),
             cp.asarray(img_fwd, dtype=cp.float32).ravel(), nLORs,
             cp.asarray(back_img.shape, dtype=cp.int32)))
        cp.cuda.Device().synchronize()
    else:
        # back projection of numpy or torch CPU array
        back_img = np.zeros(img_shape, dtype=np.float32)

        if num_visible_cuda_devices > 0:
            # back projection of numpy array using the cuda parallelproj lib
            num_voxel = ctypes.c_longlong(array_api_compat.size(back_img))
            # send image to all devices
            d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
                back_img.ravel(), num_voxel)

            # split call to GPU lib into chunks (useful for systems with limited memory)
            ic = calc_chunks(nLORs, num_chunks)

            for i in range(num_chunks):
                lib_parallelproj_cuda.joseph3d_back_cuda(
                    np.asarray(xstart, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(),
                    np.asarray(xend, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_back_img,
                    np.asarray(img_origin, dtype=np.float32),
                    np.asarray(voxsize, dtype=np.float32),
                    np.asarray(img_fwd,
                               dtype=np.float32).ravel()[ic[i]:ic[i + 1]],
                    ic[i + 1] - ic[i],
                    np.asarray(back_img.shape,
                               dtype=np.int32), threadsperblock)

            # sum all device arrays in the first device
            lib_parallelproj_cuda.sum_float_arrays_on_first_device(
                d_back_img, num_voxel)

            # copy summed image back from first device
            lib_parallelproj_cuda.get_float_array_from_device(
                d_back_img, num_voxel, 0, back_img.ravel())

            # free image device arrays
            lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
        else:
            # back projection of numpy array using the openmp parallelproj lib
            lib_parallelproj_c.joseph3d_back(
                np.asarray(xstart, dtype=np.float32).ravel(),
                np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(),
                np.asarray(img_origin, dtype=np.float32),
                np.asarray(voxsize, dtype=np.float32),
                np.asarray(img_fwd, dtype=np.float32).ravel(), nLORs,
                np.asarray(back_img.shape, dtype=np.int32))

    return xp.asarray(back_img, device=array_api_compat.device(img_fwd))


[docs]def joseph3d_fwd_tof_sino(xstart: npt.ArrayLike,
                          xend: npt.ArrayLike,
                          img: npt.ArrayLike,
                          img_origin: npt.ArrayLike,
                          voxsize: npt.ArrayLike,
                          tofbin_width: float,
                          sigma_tof: npt.ArrayLike,
                          tofcenter_offset: npt.ArrayLike,
                          nsigmas: float,
                          ntofbins: int,
                          threadsperblock: int = 32,
                          num_chunks: int = 1) -> npt.ArrayLike:
    """TOF Joseph 3D sinogram forward projector

    Parameters
    ----------
    xstart : npt.ArrayLike (numpy/cupy array or torch tensor)
        start world coordinates of the LORs, shape (nLORs, 3)
    xend : npt.ArrayLike (numpy/cupy array or torch tensor)
        end world coordinates of the LORs, shape (nLORs, 3)
    img : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the 3D image to be projected
    img_origin : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the world coordinates of the image origin (voxel [0,0,0])
    voxsize : npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the voxel size
    tofbin_width : float
        width of the TOF bin in spatial units (same units as xstart)
    sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor)
        sigma of Gaussian TOF kernel in spatial units (same units as xstart)
        can be an array of length 1 -> same sigma for all LORs
        or an array of length nLORs -> LOR dependent sigma
    tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor)
        center offset of the central TOF bin in spatial units (same units as xstart)
        can be an array of length 1 -> same offset for all LORs
        or an array of length nLORs -> LOR dependent offset
    nsigmas: float
        number of sigmas to consider when Gaussian kernel is evaluated (truncated)
    ntofbins: int
        total number of TOF bins
    threadsperblock : int, optional
        by default 32
    num_chunks : int, optional
        break down the projection in hybrid mode into chunks to
        save memory on the GPU, by default 1

    Returns
    -------
    npt.ArrayLike (numpy/cupy array or torch tensor)
    """

    nLORs = np.int64(array_api_compat.size(xstart) // 3)
    xp = array_api_compat.get_namespace(img)

    lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
    lor_dependent_tofcenter_offset = np.uint8(
        tofcenter_offset.shape[0] == nLORs)

    if (is_cuda_array(img)):
        # projection of cupy or torch GPU array using the cupy raw kernel
        img_fwd = cp.zeros(xstart.shape[:-1] + (ntofbins, ), dtype=cp.float32)

        _joseph3d_fwd_tof_sino_cuda_kernel(
            (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
            (cp.asarray(xstart, dtype=cp.float32).ravel(),
             cp.asarray(xend, dtype=cp.float32).ravel(),
             cp.asarray(img, dtype=cp.float32).ravel(),
             cp.asarray(img_origin, dtype=cp.float32),
             cp.asarray(voxsize, dtype=cp.float32), img_fwd.ravel(), nLORs,
             cp.asarray(img.shape, dtype=cp.int32), np.int16(ntofbins),
             np.float32(tofbin_width), cp.asarray(sigma_tof,
                                                  dtype=cp.float32).ravel(),
             cp.asarray(tofcenter_offset,
                        dtype=cp.float32).ravel(), np.float32(nsigmas),
             lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
        cp.cuda.Device().synchronize()
    else:
        # projection of numpy or torch CPU array
        img_fwd = np.zeros(xstart.shape[:-1] + (ntofbins, ), dtype=np.float32)

        if num_visible_cuda_devices > 0:
            # projection using libparallelproj_cuda
            num_voxel = ctypes.c_longlong(array_api_compat.size(img))

            # send image to all devices
            d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
                np.asarray(img, dtype=np.float32).ravel(), num_voxel)

            # split call to GPU lib into chunks (useful for systems with limited memory)
            ic = calc_chunks(nLORs, num_chunks)

            for i in range(num_chunks):
                if lor_dependent_sigma_tof:
                    isig0 = ic[i]
                    isig1 = ic[i + 1]
                else:
                    isig0 = 0
                    isig1 = 1

                if lor_dependent_tofcenter_offset:
                    ioff0 = ic[i]
                    ioff1 = ic[i + 1]
                else:
                    ioff0 = 0
                    ioff1 = 1

                lib_parallelproj_cuda.joseph3d_fwd_tof_sino_cuda(
                    np.asarray(xstart, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(),
                    np.asarray(xend, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_img,
                    np.asarray(img_origin, dtype=np.float32),
                    np.asarray(voxsize, dtype=np.float32),
                    img_fwd.ravel()[ic[i]:(ic[i + 1])], ic[i + 1] - ic[i],
                    np.asarray(img.shape, dtype=np.int32),
                    np.float32(tofbin_width),
                    np.asarray(sigma_tof,
                               dtype=np.float32).ravel()[isig0:isig1],
                    np.asarray(tofcenter_offset,
                               dtype=np.float32).ravel()[ioff0:ioff1],
                    np.float32(nsigmas), np.int16(ntofbins),
                    lor_dependent_sigma_tof, lor_dependent_tofcenter_offset,
                    threadsperblock)

            # free image device arrays
            lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
        else:
            # projection the openmp libparallelproj_c
            lib_parallelproj_c.joseph3d_fwd_tof_sino(
                np.asarray(xstart, dtype=np.float32).ravel(),
                np.asarray(xend, dtype=np.float32).ravel(),
                np.asarray(img, dtype=np.float32).ravel(),
                np.asarray(img_origin, dtype=np.float32),
                np.asarray(voxsize, dtype=np.float32), img_fwd.ravel(),
                np.int64(nLORs), np.asarray(img.shape, dtype=np.int32),
                np.float32(tofbin_width),
                np.asarray(sigma_tof, dtype=np.float32).ravel(),
                np.asarray(tofcenter_offset, dtype=np.float32).ravel(),
                np.float32(nsigmas), np.int16(ntofbins),
                lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)

    return xp.asarray(img_fwd, device=array_api_compat.device(img))


[docs]def joseph3d_back_tof_sino(xstart: npt.ArrayLike,
                           xend: npt.ArrayLike,
                           img_shape: tuple[int, int, int],
                           img_origin: npt.ArrayLike,
                           voxsize: npt.ArrayLike,
                           img_fwd: npt.ArrayLike,
                           tofbin_width: float,
                           sigma_tof: npt.ArrayLike,
                           tofcenter_offset: npt.ArrayLike,
                           nsigmas: float,
                           ntofbins: int,
                           threadsperblock: int = 32,
                           num_chunks: int = 1) -> npt.ArrayLike:
    """TOF Joseph 3D sinogram back projector

    Parameters
    ----------
    xstart : npt.ArrayLike (numpy/cupy array or torch tensor)
        start world coordinates of the LORs, shape (nLORs, 3)
    xend : npt.ArrayLike (numpy/cupy array or torch tensor)
        end world coordinates of the LORs, shape (nLORs, 3)
    img_shape : tuple[int, int, int]
        the shape of the back projected image
    img_origin : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the world coordinates of the image origin (voxel [0,0,0])
    voxsize : npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the voxel size
    img_fwd : npt.ArrayLike (numpy/cupy array or torch tensor)
        array of size nLOR*ntofbins containing the values to be back projected
    tofbin_width : float
        width of the TOF bin in spatial units (same units as xstart)
    sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor)
        sigma of Gaussian TOF kernel in spatial units (same units as xstart)
        can be an array of length 1 -> same sigma for all LORs
        or an array of length nLORs -> LOR dependent sigma
    tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor)
        center offset of the central TOF bin in spatial units (same units as xstart)
        can be an array of length 1 -> same offset for all LORs
        or an array of length nLORs -> LOR dependent offset
    nsigmas: float
        number of sigmas to consider when Gaussian kernel is evaluated (truncated)
    ntofbins: int
        total number of TOF bins
    threadsperblock : int, optional
        by default 32
    num_chunks : int, optional
        break down the projection in hybrid mode into chunks to
        save memory on the GPU, by default 1

    Returns
    -------
    npt.ArrayLike (numpy/cupy array or torch tensor)
    """

    nLORs = np.int64(array_api_compat.size(xstart) // 3)
    xp = array_api_compat.get_namespace(img_fwd)

    lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
    lor_dependent_tofcenter_offset = np.uint8(
        tofcenter_offset.shape[0] == nLORs)

    if (is_cuda_array(img_fwd)):
        # back projection of cupy or torch GPU array using the cupy raw kernel
        back_img = cp.zeros(img_shape, dtype=cp.float32)

        _joseph3d_back_tof_sino_cuda_kernel(
            (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
            (cp.asarray(xstart, dtype=cp.float32).ravel(),
             cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(),
             cp.asarray(img_origin, dtype=cp.float32),
             cp.asarray(voxsize, dtype=cp.float32),
             cp.asarray(img_fwd, dtype=cp.float32).ravel(), nLORs,
             cp.asarray(back_img.shape, dtype=cp.int32), np.int16(ntofbins),
             np.float32(tofbin_width), cp.asarray(sigma_tof,
                                                  dtype=cp.float32).ravel(),
             cp.asarray(tofcenter_offset,
                        dtype=cp.float32).ravel(), np.float32(nsigmas),
             lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
        cp.cuda.Device().synchronize()
    else:
        # back projection of numpy or torch CPU array
        back_img = np.zeros(img_shape, dtype=np.float32)

        if num_visible_cuda_devices > 0:
            # back projection of numpy array using the cuda parallelproj lib
            num_voxel = ctypes.c_longlong(array_api_compat.size(back_img))
            # send image to all devices
            d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
                back_img.ravel(), num_voxel)

            # split call to GPU lib into chunks (useful for systems with limited memory)
            ic = calc_chunks(nLORs, num_chunks)

            for i in range(num_chunks):
                if lor_dependent_sigma_tof:
                    isig0 = ic[i]
                    isig1 = ic[i + 1]
                else:
                    isig0 = 0
                    isig1 = 1

                if lor_dependent_tofcenter_offset:
                    ioff0 = ic[i]
                    ioff1 = ic[i + 1]
                else:
                    ioff0 = 0
                    ioff1 = 1

                lib_parallelproj_cuda.joseph3d_back_tof_sino_cuda(
                    np.asarray(xstart, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(),
                    np.asarray(xend, dtype=np.float32).reshape(
                        -1, 3)[ic[i]:(ic[i + 1]), :].ravel(), d_back_img,
                    np.asarray(img_origin, dtype=np.float32),
                    np.asarray(voxsize, dtype=np.float32),
                    np.asarray(img_fwd,
                               dtype=np.float32).ravel()[ic[i]:ic[i + 1]],
                    ic[i + 1] - ic[i],
                    np.asarray(back_img.shape, dtype=np.int32),
                    np.float32(tofbin_width),
                    np.asarray(sigma_tof,
                               dtype=np.float32).ravel()[isig0:isig1],
                    np.asarray(tofcenter_offset,
                               dtype=np.float32).ravel()[ioff0:ioff1],
                    np.float32(nsigmas), np.int16(ntofbins),
                    lor_dependent_sigma_tof, lor_dependent_tofcenter_offset,
                    threadsperblock)

            # sum all device arrays in the first device
            lib_parallelproj_cuda.sum_float_arrays_on_first_device(
                d_back_img, num_voxel)

            # copy summed image back from first device
            lib_parallelproj_cuda.get_float_array_from_device(
                d_back_img, num_voxel, 0, back_img.ravel())

            # free image device arrays
            lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
        else:
            # back projection of numpy array using the openmp parallelproj lib
            lib_parallelproj_c.joseph3d_back_tof_sino(
                np.asarray(xstart, dtype=np.float32).ravel(),
                np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(),
                np.asarray(img_origin, dtype=np.float32),
                np.asarray(voxsize, dtype=np.float32),
                np.asarray(img_fwd, dtype=np.float32).ravel(), nLORs,
                np.asarray(back_img.shape, dtype=np.int32),
                np.float32(tofbin_width),
                np.asarray(sigma_tof, dtype=np.float32).ravel(),
                np.asarray(tofcenter_offset, dtype=np.float32).ravel(),
                np.float32(nsigmas), np.int16(ntofbins),
                lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)

    return xp.asarray(back_img, device=array_api_compat.device(img_fwd))


[docs]def joseph3d_fwd_tof_lm(xstart: npt.ArrayLike,
                        xend: npt.ArrayLike,
                        img: npt.ArrayLike,
                        img_origin: npt.ArrayLike,
                        voxsize: npt.ArrayLike,
                        tofbin_width: float,
                        sigma_tof: npt.ArrayLike,
                        tofcenter_offset: npt.ArrayLike,
                        nsigmas: float,
                        tofbin: npt.ArrayLike,
                        threadsperblock: int = 32,
                        num_chunks: int = 1) -> npt.ArrayLike:
    """TOF Joseph 3D listmode forward projector

    Parameters
    ----------
    xstart : npt.ArrayLike (numpy/cupy array or torch tensor)
        start world coordinates of the event LORs, shape (num_events, 3)
    xend : npt.ArrayLike (numpy/cupy array or torch tensor)
        end world coordinates of the event LORs, shape (num_events, 3)
    img : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the 3D image to be projected
    img_origin : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the world coordinates of the image origin (voxel [0,0,0])
    voxsize : npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the voxel size
    tofbin_width : float
        width of the TOF bin in spatial units (same units as xstart)
    sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor)
        sigma of Gaussian TOF kernel in spatial units (same units as xstart)
        can be an array of length 1 -> same sigma for all LORs
        or an array of length nLORs -> LOR dependent sigma
    tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor)
        center offset of the central TOF bin in spatial units (same units as xstart)
        can be an array of length 1 -> same offset for all events
        or an array of length num_events -> event dependent offset
    nsigmas: float
        number of sigmas to consider when Gaussian kernel is evaluated (truncated)
    tofbin: npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the tof bin of the events
    threadsperblock : int, optional
        by default 32
    num_chunks : int, optional
        break down the projection in hybrid mode into chunks to
        save memory on the GPU, by default 1

    Returns
    -------
    npt.ArrayLike (numpy/cupy array or torch tensor)
    """

    nLORs = np.int64(xstart.shape[0])
    xp = array_api_compat.get_namespace(img)

    lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
    lor_dependent_tofcenter_offset = np.uint8(
        tofcenter_offset.shape[0] == nLORs)

    if (is_cuda_array(img)):
        # projection of cupy or torch GPU array using the cupy raw kernel
        img_fwd = cp.zeros(nLORs, dtype=cp.float32)

        _joseph3d_fwd_tof_lm_cuda_kernel(
            (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
            (cp.asarray(xstart, dtype=cp.float32).ravel(),
             cp.asarray(xend, dtype=cp.float32).ravel(),
             cp.asarray(img, dtype=cp.float32).ravel(),
             cp.asarray(img_origin, dtype=cp.float32),
             cp.asarray(voxsize, dtype=cp.float32), img_fwd, nLORs,
             cp.asarray(img.shape, dtype=cp.int32), np.float32(tofbin_width),
             cp.asarray(sigma_tof, dtype=cp.float32),
             cp.asarray(tofcenter_offset, dtype=cp.float32),
             np.float32(nsigmas), cp.asarray(tofbin, dtype=cp.int16),
             lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
        cp.cuda.Device().synchronize()
    else:
        # projection of numpy or torch CPU array
        img_fwd = np.zeros(nLORs, dtype=np.float32)

        if num_visible_cuda_devices > 0:
            # projection using libparallelproj_cuda
            num_voxel = ctypes.c_longlong(array_api_compat.size(img))

            # send image to all devices
            d_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
                np.asarray(img, dtype=np.float32).ravel(), num_voxel)

            # split call to GPU lib into chunks (useful for systems with limited memory)
            ic = calc_chunks(nLORs, num_chunks)

            for i in range(num_chunks):
                if lor_dependent_sigma_tof:
                    isig0 = ic[i]
                    isig1 = ic[i + 1]
                else:
                    isig0 = 0
                    isig1 = 1

                if lor_dependent_tofcenter_offset:
                    ioff0 = ic[i]
                    ioff1 = ic[i + 1]
                else:
                    ioff0 = 0
                    ioff1 = 1

                lib_parallelproj_cuda.joseph3d_fwd_tof_lm_cuda(
                    np.asarray(xstart,
                               dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(),
                    np.asarray(xend,
                               dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(),
                    d_img, np.asarray(img_origin, dtype=np.float32),
                    np.asarray(voxsize, dtype=np.float32),
                    img_fwd[ic[i]:(ic[i + 1])], ic[i + 1] - ic[i],
                    np.asarray(img.shape, dtype=np.int32),
                    np.float32(tofbin_width),
                    np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1],
                    np.asarray(tofcenter_offset,
                               dtype=np.float32)[ioff0:ioff1],
                    np.float32(nsigmas),
                    np.asarray(tofbin, dtype=np.int16)[ic[i]:(ic[i + 1])],
                    lor_dependent_sigma_tof, lor_dependent_tofcenter_offset,
                    threadsperblock)

            # free image device arrays
            lib_parallelproj_cuda.free_float_array_on_all_devices(d_img)
        else:
            # projection the openmp libparallelproj_c
            lib_parallelproj_c.joseph3d_fwd_tof_lm(
                np.asarray(xstart, dtype=np.float32).ravel(),
                np.asarray(xend, dtype=np.float32).ravel(),
                np.asarray(img, dtype=np.float32).ravel(),
                np.asarray(img_origin, dtype=np.float32),
                np.asarray(voxsize, dtype=np.float32), img_fwd,
                np.int64(nLORs), np.asarray(img.shape, dtype=np.int32),
                np.float32(tofbin_width),
                np.asarray(sigma_tof, dtype=np.float32),
                np.asarray(tofcenter_offset, dtype=np.float32),
                np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16),
                lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)

    return xp.asarray(img_fwd, device=array_api_compat.device(img))


[docs]def joseph3d_back_tof_lm(xstart: npt.ArrayLike,
                         xend: npt.ArrayLike,
                         img_shape: tuple[int, int, int],
                         img_origin: npt.ArrayLike,
                         voxsize: npt.ArrayLike,
                         img_fwd: npt.ArrayLike,
                         tofbin_width: float,
                         sigma_tof: npt.ArrayLike,
                         tofcenter_offset: npt.ArrayLike,
                         nsigmas: float,
                         tofbin: npt.ArrayLike,
                         threadsperblock: int = 32,
                         num_chunks: int = 1) -> npt.ArrayLike:
    """TOF Joseph 3D listmode back projector

    Parameters
    ----------
    xstart : npt.ArrayLike (numpy/cupy array or torch tensor)
        start world coordinates of the event LORs, shape (nLORs, 3)
    xend : npt.ArrayLike (numpy/cupy array or torch tensor)
        end world coordinates of the event LORs, shape (nLORs, 3)
    img_shape : tuple[int, int, int]
        the shape of the back projected image
    img_origin : npt.ArrayLike (numpy/cupy array or torch tensor)
        containing the world coordinates of the image origin (voxel [0,0,0])
    voxsize : npt.ArrayLike (numpy/cupy array or torch tensor)
        array containing the voxel size
    img_fwd : npt.ArrayLike (numpy/cupy array or torch tensor)
        array of size num_events containing the values to be back projected
    tofbin_width : float
        width of the TOF bin in spatial units (same units as xstart)
    sigma_tof : npt.ArrayLike (numpy/cupy array or torch tensor)
        sigma of Gaussian TOF kernel in spatial units (same units as xstart)
        can be an array of length 1 -> same sigma for all LORs
        or an array of length num_events -> event dependent sigma
    tofcenter_offset: npt.ArrayLike (numpy/cupy array or torch tensor)
        center offset of the central TOF bin in spatial units (same units as xstart)
        can be an array of length 1 -> same offset for all LORs
        or an array of length num_events -> event dependent offset
    nsigmas: float
        number of sigmas to consider when Gaussian kernel is evaluated (truncated)
    tofbin: npt.ArrayLike
        array with the tofbin of the events
    threadsperblock : int, optional
        by default 32
    num_chunks : int, optional
        break down the projection in hybrid mode into chunks to
        save memory on the GPU, by default 1

    Returns
    -------
    npt.ArrayLike (numpy/cupy array or torch tensor)
    """

    nLORs = np.int64(xstart.shape[0])
    xp = array_api_compat.get_namespace(img_fwd)

    lor_dependent_sigma_tof = np.uint8(sigma_tof.shape[0] == nLORs)
    lor_dependent_tofcenter_offset = np.uint8(
        tofcenter_offset.shape[0] == nLORs)

    if (is_cuda_array(img_fwd)):
        # back projection of cupy or torch GPU array using the cupy raw kernel
        back_img = cp.zeros(img_shape, dtype=cp.float32)

        _joseph3d_back_tof_lm_cuda_kernel(
            (math.ceil(nLORs / threadsperblock), ), (threadsperblock, ),
            (cp.asarray(xstart, dtype=cp.float32).ravel(),
             cp.asarray(xend, dtype=cp.float32).ravel(), back_img.ravel(),
             cp.asarray(img_origin, dtype=cp.float32),
             cp.asarray(voxsize, dtype=cp.float32),
             cp.asarray(img_fwd, dtype=cp.float32), nLORs,
             cp.asarray(back_img.shape, dtype=cp.int32),
             np.float32(tofbin_width), cp.asarray(sigma_tof, dtype=cp.float32),
             cp.asarray(tofcenter_offset, dtype=cp.float32),
             np.float32(nsigmas), cp.asarray(tofbin, dtype=cp.int16),
             lor_dependent_sigma_tof, lor_dependent_tofcenter_offset))
        cp.cuda.Device().synchronize()
    else:
        # back projection of numpy or torch CPU array
        back_img = np.zeros(img_shape, dtype=np.float32)

        if num_visible_cuda_devices > 0:
            # back projection of numpy array using the cuda parallelproj lib
            num_voxel = ctypes.c_longlong(array_api_compat.size(back_img))
            # send image to all devices
            d_back_img = lib_parallelproj_cuda.copy_float_array_to_all_devices(
                back_img.ravel(), num_voxel)

            # split call to GPU lib into chunks (useful for systems with limited memory)
            ic = calc_chunks(nLORs, num_chunks)

            for i in range(num_chunks):
                if lor_dependent_sigma_tof:
                    isig0 = ic[i]
                    isig1 = ic[i + 1]
                else:
                    isig0 = 0
                    isig1 = 1

                if lor_dependent_tofcenter_offset:
                    ioff0 = ic[i]
                    ioff1 = ic[i + 1]
                else:
                    ioff0 = 0
                    ioff1 = 1

                lib_parallelproj_cuda.joseph3d_back_tof_lm_cuda(
                    np.asarray(xstart,
                               dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(),
                    np.asarray(xend,
                               dtype=np.float32)[ic[i]:(ic[i + 1]), :].ravel(),
                    d_back_img, np.asarray(img_origin, dtype=np.float32),
                    np.asarray(voxsize, dtype=np.float32),
                    np.asarray(img_fwd, dtype=np.float32)[ic[i]:ic[i + 1]],
                    ic[i + 1] - ic[i],
                    np.asarray(back_img.shape, dtype=np.int32),
                    np.float32(tofbin_width),
                    np.asarray(sigma_tof, dtype=np.float32)[isig0:isig1],
                    np.asarray(tofcenter_offset,
                               dtype=np.float32)[ioff0:ioff1],
                    np.float32(nsigmas),
                    np.asarray(tofbin, dtype=np.int16)[ic[i]:(ic[i + 1])],
                    lor_dependent_sigma_tof, lor_dependent_tofcenter_offset,
                    threadsperblock)

            # sum all device arrays in the first device
            lib_parallelproj_cuda.sum_float_arrays_on_first_device(
                d_back_img, num_voxel)

            # copy summed image back from first device
            lib_parallelproj_cuda.get_float_array_from_device(
                d_back_img, num_voxel, 0, back_img.ravel())

            # free image device arrays
            lib_parallelproj_cuda.free_float_array_on_all_devices(d_back_img)
        else:
            # back projection of numpy array using the openmp parallelproj lib
            lib_parallelproj_c.joseph3d_back_tof_lm(
                np.asarray(xstart, dtype=np.float32).ravel(),
                np.asarray(xend, dtype=np.float32).ravel(), back_img.ravel(),
                np.asarray(img_origin, dtype=np.float32),
                np.asarray(voxsize, dtype=np.float32),
                np.asarray(img_fwd, dtype=np.float32), nLORs,
                np.asarray(back_img.shape, dtype=np.int32),
                np.float32(tofbin_width),
                np.asarray(sigma_tof, dtype=np.float32),
                np.asarray(tofcenter_offset, dtype=np.float32),
                np.float32(nsigmas), np.asarray(tofbin, dtype=np.int16),
                lor_dependent_sigma_tof, lor_dependent_tofcenter_offset)

    return xp.asarray(back_img, device=array_api_compat.device(img_fwd))