
This is a follow-up question for Image Stitching using SIFT Keypoint Descriptor in C++ and SIFT Keypoint Detection for Image in C++. In the previous post, the image stitching algorithm was implemented; however, it takes a long time to compute warp_perspective. I am trying to implement a warp_perspective_cuda template function that performs the perspective transformation on the GPU.

The experimental implementation

  • warp_perspective_cuda template function implementation (in file image_operations_cuda.h)

    /**
    * @brief GPU-accelerated perspective transformation using CUDA.
    *
    * This function is a wrapper that manages memory transfers to and from the GPU
    * and launches the CUDA kernel to perform the image warping.
    *
    * @tparam ElementT The pixel element type of the image. Must be an arithmetic type.
    * @tparam FloatingType The floating-point type for matrix calculations (e.g., double, float).
    * @param src The source image to be warped.
    * @param H The 3x3 homography matrix.
    * @param out_width The desired width of the output image.
    * @param out_height The desired height of the output image.
    * @return The warped image.
    */
    template<
        arithmetic ElementT,
        std::floating_point FloatingType = double,
        typename InterpolatorType = BicubicInterpolatorDevice<ElementT, FloatingType>
    >
    Image<ElementT> warp_perspective_cuda(
        const Image<ElementT>& src,
        const linalg::Matrix<FloatingType>& H,
        const std::size_t out_width,
        const std::size_t out_height
    );
    
    //  warp_perspective_cuda overload for multi-channel images
    template<
        typename ElementT,
        std::floating_point FloatingType = double
    >
    requires((std::same_as<ElementT, RGB>) || (std::same_as<ElementT, RGB_DOUBLE>) || (std::same_as<ElementT, HSV>))
    auto warp_perspective_cuda(
        const Image<ElementT>& src,
        const linalg::Matrix<FloatingType>& H,
        const std::size_t out_width,
        const std::size_t out_height
    )
    {
        return apply_each(src, [&](auto&& planes) { return warp_perspective_cuda(planes, H, out_width, out_height); });
    }
    
  • warp_perspective_cuda template function implementation (in file image_operations_cuda.cu)

    // Explicit template instantiation for float and double types with GrayScale.
    // This is required for the linker to find the implementation across translation units.
    template Image<unsigned char> warp_perspective_cuda<unsigned char, float>(
        const Image<unsigned char>& src, const linalg::Matrix<float>& H,
        const std::size_t out_width, const std::size_t out_height);
    
    template Image<unsigned char> warp_perspective_cuda<unsigned char, double>(
        const Image<unsigned char>& src, const linalg::Matrix<double>& H,
        const std::size_t out_width, const std::size_t out_height);
    
    
    /**
    * warp_perspective_cuda template function implementation
    * @brief The C++ wrapper function implementation.
    * This is the function that is exposed to the rest of the C++ code.
    */
    template<
        arithmetic ElementT,
        std::floating_point FloatingType,
        typename InterpolatorType
    >
    Image<ElementT> warp_perspective_cuda(
        const Image<ElementT>& src,
        const linalg::Matrix<FloatingType>& H_inv,
        const std::size_t out_width,
        const std::size_t out_height
    )
    {
        // 1. Allocate memory on the GPU (device)
        ElementT* d_src_data = nullptr;
        ElementT* d_warped_data = nullptr;
        const size_t src_bytes = src.count() * sizeof(ElementT);
        const size_t warped_bytes = out_width * out_height * sizeof(ElementT);
    
        // Wrap the CUDA operations in a try-catch block to ensure cleanup
        // (cudaFree) happens even if a CUDA call fails.
        try
        {
            CUDA_CHECK(cudaMalloc(&d_src_data, src_bytes));
            CUDA_CHECK(cudaMalloc(&d_warped_data, warped_bytes));
    
            // 2. Copy source image data from CPU (host) to GPU (device)
            CUDA_CHECK(cudaMemcpy(d_src_data, src.getImageData().data(), src_bytes, cudaMemcpyHostToDevice));
            // Initialize warped image memory to 0
            CUDA_CHECK(cudaMemset(d_warped_data, 0, warped_bytes));
    
            // 3. Configure and launch the kernel
            // Threads per block (a common choice is 16x16 or 32x32)
            dim3 threads_per_block(16, 16);
            // Number of blocks in the grid
            dim3 num_blocks(
                (out_width + threads_per_block.x - 1) / threads_per_block.x,
                (out_height + threads_per_block.y - 1) / threads_per_block.y
            );
    
            warp_perspective_kernel<ElementT, FloatingType, InterpolatorType><<<num_blocks, threads_per_block>>>(
                d_warped_data,
                d_src_data,
                src.getWidth(),
                src.getHeight(),
                out_width,
                out_height,
                H_inv.at(0,0), H_inv.at(0,1), H_inv.at(0,2),
                H_inv.at(1,0), H_inv.at(1,1), H_inv.at(1,2),
                H_inv.at(2,0), H_inv.at(2,1), H_inv.at(2,2),
                InterpolatorType{} // Pass a default-constructed functor
            );
    
            // Check for any errors during kernel launch
            CUDA_CHECK(cudaGetLastError());
            // Synchronize to ensure the kernel has finished before copying back data
            CUDA_CHECK(cudaDeviceSynchronize());
    
            // 4. Copy the result from GPU (device) back to CPU (host)
            Image<ElementT> warped_image(out_width, out_height);
            CUDA_CHECK(cudaMemcpy((void*)warped_image.getImageData().data(), d_warped_data, warped_bytes, cudaMemcpyDeviceToHost));
    
            // 5. Free GPU memory
            CUDA_CHECK(cudaFree(d_src_data));
            CUDA_CHECK(cudaFree(d_warped_data));
    
            return warped_image;
        }
        catch (...)
        {
            // If an exception occurred, try to free memory before re-throwing
            if (d_src_data)
            {
                cudaFree(d_src_data);
            }
            if (d_warped_data)
            {
                cudaFree(d_warped_data);
            }
            // Re-throw the exception to be handled by the caller
            throw;
        }
    }
    
  • BicubicInterpolatorDevice struct implementation (in file image_operations_cuda.cu)

    /**
    * BicubicInterpolatorDevice struct implementation
    * @brief Functor for Bicubic Interpolation (Device-Side).
    */
    template<arithmetic ElementT, arithmetic FloatingType>
    struct BicubicInterpolatorDevice
    {
        __device__ auto operator()(
            const ElementT* src_data,
            const int src_width,
            const int src_height,
            const FloatingType x,
            const FloatingType y) const
        {
            // Fallback to a simple interpolation (nearest neighbor) for edges for simplicity in CUDA
            if (x < 1 || x >= src_width - 2 || y < 1 || y >= src_height - 2)
            {
                int ix = static_cast<int>(roundf(x));
                int iy = static_cast<int>(roundf(y));
                ix = max(0, min(src_width - 1, ix));
                iy = max(0, min(src_height - 1, iy));
                return src_data[iy * src_width + ix];
            }
    
            int x_floor = static_cast<int>(floorf(x));
            int y_floor = static_cast<int>(floorf(y));
    
            FloatingType total_value{};
    
            for (int j = 0; j <= 3; ++j)
            {
                int v = y_floor - 1 + j;
                FloatingType row_value{};
                for (int i = 0; i <= 3; ++i)
                {
                    int u = x_floor - 1 + i;
                    row_value += static_cast<FloatingType>(src_data[v * src_width + u]) * cubic_kernel_device(x - u);
                }
                total_value += row_value * cubic_kernel_device(y - v);
            }
    
            if constexpr (std::is_integral_v<ElementT>)
            {
                if (total_value > 255.0)
                {
                    return static_cast<ElementT>(255); // Simplified clamping for byte images
                }
                if (total_value < 0.0)
                {
                    return static_cast<ElementT>(0);
                }
            }
    
            return static_cast<ElementT>(total_value);
        }
    };
    
  • warp_perspective_kernel template function implementation (in file image_operations_cuda.cu)

    /**
    * warp_perspective_kernel template function implementation
    * @brief The CUDA kernel that performs the perspective warp.
    * Each thread in the grid computes one pixel of the output image.
    */
    template<arithmetic ElementT, std::floating_point FloatingType, typename InterpolatorFunc>
    __global__ void warp_perspective_kernel(
        ElementT* warped_data,
        const ElementT* src_data,
        const int src_width,
        const int src_height,
        const int out_width,
        const int out_height,
        const FloatingType h11, const FloatingType h12, const FloatingType h13,
        const FloatingType h21, const FloatingType h22, const FloatingType h23,
        const FloatingType h31, const FloatingType h32, const FloatingType h33,
        InterpolatorFunc interpolator = {})
    {
        // Calculate the unique global x and y coordinates for this thread
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
    
        // Ensure the thread is within the bounds of the output image
        if (x < out_width && y < out_height)
        {
            // Apply inverse homography to find the corresponding source coordinate
            const FloatingType w = h31 * x + h32 * y + h33;
    
            if (fabsf(w) > 1e-6) // Avoid division by zero
            {
                const FloatingType src_x = (h11 * x + h12 * y + h13) / w;
                const FloatingType src_y = (h21 * x + h22 * y + h23) / w;
    
                // If the source coordinate is within the source image bounds, interpolate and write the pixel
                if (src_x >= 0 && src_x < src_width && src_y >= 0 && src_y < src_height)
                {
                    warped_data[y * out_width + x] = interpolator(
                        src_data, src_width, src_height, src_x, src_y);
                }
            }
        }
    }
    

TinyDIP on GitHub

All suggestions are welcome.


2 Answers


Naming

  • H is a variable name from a formula, but you are not referencing any formula in your source code. Since it's the homography matrix, I would rename this to homography.
  • Why is it H in the header file but H_inv in the source file? Make sure variable names are the same in both; it avoids confusion and makes searching easier.
  • d_src_data should be renamed device_src or perhaps gpu_src.

Rounding to integers

When rounding to an integer, consider that static_cast<int> will already do the equivalent of std::floor() for you (well, at least for positive values). So to calculate the floor, write:

int x_floor = static_cast<int>(x);

If you want to round positive values, then you can write this:

int ix = static_cast<int>(x + 0.5);

Avoid branches

GPUs can suffer a performance penalty when there are branches in the code. It is best to avoid them, unless you have some insight into how often branches diverge within a warp.

Instead of making the fallback for edges a special case, consider just skipping it and instead clamping x_floor and y_floor so that the 4×4 neighborhood always stays inside the image:

int x_floor = std::clamp(static_cast<int>(x), 1, src_width - 3);
int y_floor = std::clamp(static_cast<int>(y), 1, src_height - 3);

Avoid hardcoding assumptions

What if ElementT is std::uint16_t? Then you are clamping values to 255, even though the elements can go as high as 65535. Use std::numeric_limits<> to find out the lowest and highest value a given type can have. In fact, I think you can avoid the check for std::is_integral_v<ElementT>, and just write:

total_value = std::clamp(total_value,
                         static_cast<FloatingType>(std::numeric_limits<ElementT>::lowest()),
                         static_cast<FloatingType>(std::numeric_limits<ElementT>::max()));
return static_cast<ElementT>(total_value);

The compiler should be able to optimize away the clamping when ElementT is a floating-point type, although it's best to verify this.
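
If you would rather make that explicit than rely on the optimizer, one option (a sketch, reusing the ElementT/FloatingType parameters of the original functor and <algorithm>, <limits>, <type_traits>) is to keep the clamp only for integral element types:

// Sketch: clamp only for integral element types; floating-point ElementT passes through.
if constexpr (std::is_integral_v<ElementT>)
{
    total_value = std::clamp(total_value,
                             static_cast<FloatingType>(std::numeric_limits<ElementT>::lowest()),
                             static_cast<FloatingType>(std::numeric_limits<ElementT>::max()));
}
return static_cast<ElementT>(total_value);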

Use std:: math functions

Functions like roundf() and floorf() will always return a float. But if FloatingType is double, you are losing precision. Use std::round(), std::floor() and so on instead.

Pass many parameters in a single struct

Your warp_perspective_kernel() takes 16 parameters. That's a lot! I would just pass the homography matrix as a linalg::Matrix<FloatingType>, but you can go further and create a struct that contains all the parameters, and pass that.
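
As an illustration, a sketch could look like this; WarpParams is a made-up name, and the arithmetic / std::floating_point concepts are the ones already used in the question's code:

// Sketch: bundle the per-launch scalars into a single aggregate passed by value.
// WarpParams is an illustrative name, not an existing TinyDIP type.
template<std::floating_point FloatingType>
struct WarpParams
{
    int src_width;
    int src_height;
    int out_width;
    int out_height;
    FloatingType h[9];  // row-major 3x3 inverse homography
};

template<arithmetic ElementT, std::floating_point FloatingType, typename InterpolatorFunc>
__global__ void warp_perspective_kernel(
    ElementT* warped_data,
    const ElementT* src_data,
    const WarpParams<FloatingType> params,
    InterpolatorFunc interpolator)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= params.out_width || y >= params.out_height)
    {
        return;
    }

    const FloatingType w     = params.h[6] * x + params.h[7] * y + params.h[8];
    const FloatingType src_x = (params.h[0] * x + params.h[1] * y + params.h[2]) / w;
    const FloatingType src_y = (params.h[3] * x + params.h[4] * y + params.h[5]) / w;

    // A degenerate w produces inf/NaN coordinates, which the bounds check below rejects.
    if (src_x >= 0 && src_x < params.src_width && src_y >= 0 && src_y < params.src_height)
    {
        warped_data[y * params.out_width + x] =
            interpolator(src_data, params.src_width, params.src_height, src_x, src_y);
    }
}

The host-side wrapper would then fill one WarpParams instance from H_inv and the image dimensions and pass it as a single kernel argument.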

Unnecessary check for division by zero?

There is a check for w being very small. Do you really need that? Even if w is exactly zero, dividing by it will produce +inf or -inf (or NaN when the numerator is also zero), and the subsequent check for if (src_x >= 0 && src_x < src_width && …) will reject those values anyway: ±inf falls outside the bounds, and comparisons involving NaN are always false.


A bit surprised at your use of a try/catch block:

ElementT* d_src_data = nullptr;
ElementT* d_warped_data = nullptr;

<<STUFF 1>>

try
{
    CUDA_CHECK(cudaMalloc(&d_src_data, src_bytes));
    CUDA_CHECK(cudaMalloc(&d_warped_data, warped_bytes));

    <<STUFF 2>>

    // 5. Free GPU memory
    CUDA_CHECK(cudaFree(d_src_data));
    CUDA_CHECK(cudaFree(d_warped_data));

    return warped_image;
}
catch (...)
{
    // If an exception occurred, try to free memory before re-throwing
    if (d_src_data)
    {
        cudaFree(d_src_data);
    }
    if (d_warped_data)
    {
        cudaFree(d_warped_data);
    }
    // Re-throw the exception to be handled by the caller
    throw;
}

This would be handled more logically via RAII.

void freeMemory(ElementT* block)
{
    CUDA_CHECK(cudaFree(block));
}
using CudaFreeType   = decltype(&freeMemory);
using UniqueElementT = std::unique_ptr<ElementT, CudaFreeType>;
UniqueElementT allocMemory(std::size_t size)
{
    ElementT*      block = nullptr;
    CUDA_CHECK(cudaMalloc(&block, size));
    return UniqueElementT(block, freeMemory);
}

Then your code is simplified to:

<<STUFF 1>>

UniqueElementT d_src_data     = allocMemory(src_bytes);
UniqueElementT d_warped_data  = allocMemory(warped_bytes);

<<STUFF 2>>

return warped_image;
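
Since functions cannot be defined inside another function in C++, these helpers would need to live at namespace scope and be templated on the element type. A sketch of that variant; cuda_unique_ptr and make_device_buffer are illustrative names, not TinyDIP API:

#include <cstddef>
#include <memory>

// Deleter that releases device memory. Deliberately no CUDA_CHECK here:
// a deleter may run during stack unwinding and must not throw.
struct CudaFreeDeleter
{
    void operator()(void* block) const noexcept { cudaFree(block); }
};

template<typename ElementT>
using cuda_unique_ptr = std::unique_ptr<ElementT[], CudaFreeDeleter>;

template<typename ElementT>
cuda_unique_ptr<ElementT> make_device_buffer(std::size_t count)
{
    ElementT* block = nullptr;
    CUDA_CHECK(cudaMalloc(&block, count * sizeof(ElementT)));
    return cuda_unique_ptr<ElementT>(block);
}

The wrapper would then hold auto device_src = make_device_buffer<ElementT>(src.count()); and pass device_src.get() to cudaMemcpy and the kernel launch.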
