
This is a follow-up question for Image Stitching using SIFT Keypoint Descriptor in C++ and SIFT Keypoint Detection for Image in C++. In the previous post, the image stitching algorithm was implemented; however, it takes a long time to compute warp_perspective. I am trying to implement a warp_perspective_cuda template function that performs the perspective transformation on the GPU.

The experimental implementation

  • warp_perspective_cuda template function implementation (in file image_operations_cuda.h)

    /**
    * @brief GPU-accelerated perspective transformation using CUDA.
    *
    * This function is a wrapper that manages memory transfers to and from the GPU
    * and launches the CUDA kernel to perform the image warping.
    *
    * @tparam ElementT The pixel element type of the image. Must be an arithmetic type.
    * @tparam FloatingType The floating-point type for matrix calculations (e.g., double, float).
    * @param src The source image to be warped.
    * @param H The 3x3 homography matrix.
    * @param out_width The desired width of the output image.
    * @param out_height The desired height of the output image.
    * @return The warped image.
    */
    template<
        arithmetic ElementT,
        std::floating_point FloatingType = double,
        typename InterpolatorType = BicubicInterpolatorDevice<ElementT, FloatingType>
    >
    Image<ElementT> warp_perspective_cuda(
        const Image<ElementT>& src,
        const linalg::Matrix<FloatingType>& H,
        const std::size_t out_width,
        const std::size_t out_height
    );
    
    //  warp_perspective_cuda overload for multi-channel images
    template<
        typename ElementT,
        std::floating_point FloatingType = double
    >
    requires((std::same_as<ElementT, RGB>) || (std::same_as<ElementT, RGB_DOUBLE>) || (std::same_as<ElementT, HSV>))
    auto warp_perspective_cuda(
        const Image<ElementT>& src,
        const linalg::Matrix<FloatingType>& H,
        const std::size_t out_width,
        const std::size_t out_height
    )
    {
        return apply_each(src, [&](auto&& planes) { return warp_perspective_cuda(planes, H, out_width, out_height); });
    }
    
  • warp_perspective_cuda template function implementation (in file image_operations_cuda.cu)

    // Explicit template instantiation for float and double types with GrayScale.
    // This is required for the linker to find the implementation across translation units.
    template Image<unsigned char> warp_perspective_cuda<unsigned char, float>(
        const Image<unsigned char>& src, const linalg::Matrix<float>& H,
        const std::size_t out_width, const std::size_t out_height);
    
    template Image<unsigned char> warp_perspective_cuda<unsigned char, double>(
        const Image<unsigned char>& src, const linalg::Matrix<double>& H,
        const std::size_t out_width, const std::size_t out_height);
    
    
    /**
    * warp_perspective_cuda template function implementation
    * @brief The C++ wrapper function implementation.
    * This is the function that is exposed to the rest of the C++ code.
    */
    template<
        arithmetic ElementT,
        std::floating_point FloatingType,
        typename InterpolatorType
    >
    Image<ElementT> warp_perspective_cuda(
        const Image<ElementT>& src,
        const linalg::Matrix<FloatingType>& H_inv,
        const std::size_t out_width,
        const std::size_t out_height
    )
    {
        // 1. Allocate memory on the GPU (device)
        ElementT* d_src_data = nullptr;
        ElementT* d_warped_data = nullptr;
        const size_t src_bytes = src.count() * sizeof(ElementT);
        const size_t warped_bytes = out_width * out_height * sizeof(ElementT);
    
        // Wrap the CUDA operations in a try-catch block to ensure cleanup
        // (cudaFree) happens even if a CUDA call fails.
        try
        {
            CUDA_CHECK(cudaMalloc(&d_src_data, src_bytes));
            CUDA_CHECK(cudaMalloc(&d_warped_data, warped_bytes));
    
            // 2. Copy source image data from CPU (host) to GPU (device)
            CUDA_CHECK(cudaMemcpy(d_src_data, src.getImageData().data(), src_bytes, cudaMemcpyHostToDevice));
            // Initialize warped image memory to 0
            CUDA_CHECK(cudaMemset(d_warped_data, 0, warped_bytes));
    
            // 3. Configure and launch the kernel
            // Threads per block (a common choice is 16x16 or 32x32)
            dim3 threads_per_block(16, 16);
            // Number of blocks in the grid
            dim3 num_blocks(
                (out_width + threads_per_block.x - 1) / threads_per_block.x,
                (out_height + threads_per_block.y - 1) / threads_per_block.y
            );
    
            warp_perspective_kernel<ElementT, FloatingType, InterpolatorType><<<num_blocks, threads_per_block>>>(
                d_warped_data,
                d_src_data,
                src.getWidth(),
                src.getHeight(),
                out_width,
                out_height,
                H_inv.at(0,0), H_inv.at(0,1), H_inv.at(0,2),
                H_inv.at(1,0), H_inv.at(1,1), H_inv.at(1,2),
                H_inv.at(2,0), H_inv.at(2,1), H_inv.at(2,2),
                InterpolatorType{} // Pass a default-constructed functor
            );
    
            // Check for any errors during kernel launch
            CUDA_CHECK(cudaGetLastError());
            // Synchronize to ensure the kernel has finished before copying back data
            CUDA_CHECK(cudaDeviceSynchronize());
    
            // 4. Copy the result from GPU (device) back to CPU (host)
            Image<ElementT> warped_image(out_width, out_height);
            CUDA_CHECK(cudaMemcpy((void*)warped_image.getImageData().data(), d_warped_data, warped_bytes, cudaMemcpyDeviceToHost));
    
            // 5. Free GPU memory
            CUDA_CHECK(cudaFree(d_src_data));
            CUDA_CHECK(cudaFree(d_warped_data));
    
            return warped_image;
        }
        catch (...)
        {
            // If an exception occurred, try to free memory before re-throwing
            if (d_src_data)
            {
                cudaFree(d_src_data);
            }
            if (d_warped_data)
            {
                cudaFree(d_warped_data);
            }
            // Re-throw the exception to be handled by the caller
            throw;
        }
    }
    
  • BicubicInterpolatorDevice struct implementation (in file image_operations_cuda.cu)

    /**
    * BicubicInterpolatorDevice struct implementation
    * @brief Functor for Bicubic Interpolation (Device-Side).
    */
    template<arithmetic ElementT, arithmetic FloatingType>
    struct BicubicInterpolatorDevice
    {
        __device__ auto operator()(
            const ElementT* src_data,
            const int src_width,
            const int src_height,
            const FloatingType x,
            const FloatingType y) const
        {
            // Fallback to a simple interpolation (nearest neighbor) for edges for simplicity in CUDA
            if (x < 1 || x >= src_width - 2 || y < 1 || y >= src_height - 2)
            {
                int ix = static_cast<int>(roundf(x));
                int iy = static_cast<int>(roundf(y));
                ix = max(0, min(src_width - 1, ix));
                iy = max(0, min(src_height - 1, iy));
                return src_data[iy * src_width + ix];
            }
    
            int x_floor = static_cast<int>(floorf(x));
            int y_floor = static_cast<int>(floorf(y));
    
            FloatingType total_value{};
    
            for (int j = 0; j <= 3; ++j)
            {
                int v = y_floor - 1 + j;
                FloatingType row_value{};
                for (int i = 0; i <= 3; ++i)
                {
                    int u = x_floor - 1 + i;
                    row_value += static_cast<FloatingType>(src_data[v * src_width + u]) * cubic_kernel_device(x - u);
                }
                total_value += row_value * cubic_kernel_device(y - v);
            }
    
            if constexpr (std::is_integral_v<ElementT>)
            {
                if (total_value > 255.0)
                {
                    return static_cast<ElementT>(255); // Simplified clamping for byte images
                }
                if (total_value < 0.0)
                {
                    return static_cast<ElementT>(0);
                }
            }
    
            return static_cast<ElementT>(total_value);
        }
    };
    
  • warp_perspective_kernel template function implementation (in file image_operations_cuda.cu)

    /**
    * warp_perspective_kernel template function implementation
    * @brief The CUDA kernel that performs the perspective warp.
    * Each thread in the grid computes one pixel of the output image.
    */
    template<arithmetic ElementT, std::floating_point FloatingType, typename InterpolatorFunc>
    __global__ void warp_perspective_kernel(
        ElementT* warped_data,
        const ElementT* src_data,
        const int src_width,
        const int src_height,
        const int out_width,
        const int out_height,
        const FloatingType h11, const FloatingType h12, const FloatingType h13,
        const FloatingType h21, const FloatingType h22, const FloatingType h23,
        const FloatingType h31, const FloatingType h32, const FloatingType h33,
        InterpolatorFunc interpolator = {})
    {
        // Calculate the unique global x and y coordinates for this thread
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
    
        // Ensure the thread is within the bounds of the output image
        if (x < out_width && y < out_height)
        {
            // Apply inverse homography to find the corresponding source coordinate
            const FloatingType w = h31 * x + h32 * y + h33;
    
            if (fabsf(w) > 1e-6) // Avoid division by zero
            {
                const FloatingType src_x = (h11 * x + h12 * y + h13) / w;
                const FloatingType src_y = (h21 * x + h22 * y + h23) / w;
    
                // If the source coordinate is within the source image bounds, interpolate and write the pixel
                if (src_x >= 0 && src_x < src_width && src_y >= 0 && src_y < src_height)
                {
                    warped_data[y * out_width + x] = interpolator(
                        src_data, src_width, src_height, src_x, src_y);
                }
            }
        }
    }
    

TinyDIP on GitHub

All suggestions are welcome.


2 Answers


Naming

  • H is a variable name from a formula, but you are not referencing any formula in your source code. Since it's the homography matrix, I would rename this to homography.
  • Why is it H in the header file but H_inv in the source file? Make sure variable names are the same in both; it avoids confusion and makes searching easier.
  • d_src_data should be renamed device_src or perhaps gpu_src.

Rounding to integers

When rounding to an integer, consider that static_cast<int> will already do the equivalent of std::floor() for you (well, at least for positive values). So to calculate the floor, write:

int x_floor = static_cast<int>(x);

If you want to round positive values, then you can write this:

int ix = static_cast<int>(x + 0.5);

Avoid branches

GPUs can suffer a performance penalty when there are branches in the code. It is best to avoid them, unless you have some insight into how often branches diverge within a warp.

Instead of making the fallback for edges a special case, consider just skipping it and instead clamping x_floor and y_floor so that the 4×4 neighborhood always stays inside the image:

int x_floor = std::clamp(static_cast<int>(x), 1, src_width - 3);
int y_floor = std::clamp(static_cast<int>(y), 1, src_height - 3);

Avoid hardcoding assumptions

What if ElementT is std::uint16_t? Then you are clamping values to 255, even though the elements can go as high as 65535. Use std::numeric_limits<> to find out the lowest and highest value a given type can have. In fact, I think you can avoid the check for std::is_integral_v<ElementT>, and just write:

total_value = std::clamp(total_value,
                         static_cast<FloatingType>(std::numeric_limits<ElementT>::lowest()),
                         static_cast<FloatingType>(std::numeric_limits<ElementT>::max()));
return static_cast<ElementT>(total_value);

The compiler should be able to optimize away the clamping when ElementT is a floating-point type, although it's best to verify this.
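
If you would rather make that explicit than rely on the optimizer, one option (a sketch, reusing the ElementT/FloatingType parameters of the original functor and <algorithm>, <limits>, <type_traits>) is to keep the clamp only for integral element types:

// Sketch: clamp only for integral element types; floating-point ElementT passes through.
if constexpr (std::is_integral_v<ElementT>)
{
    total_value = std::clamp(total_value,
                             static_cast<FloatingType>(std::numeric_limits<ElementT>::lowest()),
                             static_cast<FloatingType>(std::numeric_limits<ElementT>::max()));
}
return static_cast<ElementT>(total_value);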

Use std:: math functions

Functions like roundf() and floorf() will always return a float. But if FloatingType is double, you are losing precision. Use std::round(), std::floor() and so on instead.

Pass many parameters in a single struct

Your warp_perspective_kernel() takes 16 parameters. That's a lot! I would just pass the homography matrix as a linalg::Matrix<FloatingType>, but you can go further and create a struct that contains all the parameters, and pass that.
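
As an illustration, a sketch could look like this; WarpParams is a made-up name, and the arithmetic / std::floating_point concepts are the ones already used in the question's code:

// Sketch: bundle the per-launch scalars into a single aggregate passed by value.
// WarpParams is an illustrative name, not an existing TinyDIP type.
template<std::floating_point FloatingType>
struct WarpParams
{
    int src_width;
    int src_height;
    int out_width;
    int out_height;
    FloatingType h[9];  // row-major 3x3 inverse homography
};

template<arithmetic ElementT, std::floating_point FloatingType, typename InterpolatorFunc>
__global__ void warp_perspective_kernel(
    ElementT* warped_data,
    const ElementT* src_data,
    const WarpParams<FloatingType> params,
    InterpolatorFunc interpolator)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= params.out_width || y >= params.out_height)
    {
        return;
    }

    const FloatingType w     = params.h[6] * x + params.h[7] * y + params.h[8];
    const FloatingType src_x = (params.h[0] * x + params.h[1] * y + params.h[2]) / w;
    const FloatingType src_y = (params.h[3] * x + params.h[4] * y + params.h[5]) / w;

    // A degenerate w produces inf/NaN coordinates, which the bounds check below rejects.
    if (src_x >= 0 && src_x < params.src_width && src_y >= 0 && src_y < params.src_height)
    {
        warped_data[y * params.out_width + x] =
            interpolator(src_data, params.src_width, params.src_height, src_x, src_y);
    }
}

The host-side wrapper would then fill one WarpParams instance from H_inv and the image dimensions and pass it as a single kernel argument.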

Unnecessary check for division by zero?

There is a check for w being very small. Do you really need that? Even if w is exactly zero, dividing by it will produce +inf or -inf (or NaN when the numerator is also zero), and the subsequent check for if (src_x >= 0 && src_x < src_width && …) will reject those values anyway: ±inf falls outside the bounds, and comparisons involving NaN are always false.


A bit surprised at your use of a try/catch block:

ElementT* d_src_data = nullptr;
ElementT* d_warped_data = nullptr;

<<STUFF 1>>

try
{
    CUDA_CHECK(cudaMalloc(&d_src_data, src_bytes));
    CUDA_CHECK(cudaMalloc(&d_warped_data, warped_bytes));

    <<STUFF 2>>

    // 5. Free GPU memory
    CUDA_CHECK(cudaFree(d_src_data));
    CUDA_CHECK(cudaFree(d_warped_data));

    return warped_image;
}
catch (...)
{
    // If an exception occurred, try to free memory before re-throwing
    if (d_src_data)
    {
        cudaFree(d_src_data);
    }
    if (d_warped_data)
    {
        cudaFree(d_warped_data);
    }
    // Re-throw the exception to be handled by the caller
    throw;
}

This would be handled more logically via RAII.

void freeMemory(ElementT* block)
{
    CUDA_CHECK(cudaFree(block));
}
using CudaFreeType   = decltype(&freeMemory);
using UniqueElementT = std::unique_ptr<ElementT, CudaFreeType>;
UniqueElementT allocMemory(std::size_t size)
{
    ElementT*      block = nullptr;
    CUDA_CHECK(cudaMalloc(&block, size));
    return UniqueElementT(block, freeMemory);
}

Then your code is simplified to:

<<STUFF 1>>

UniqueElementT d_src_data     = allocMemory(src_bytes);
UniqueElementT d_warped_data  = allocMemory(warped_bytes);

<<STUFF 2>>

return warped_image;
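
Since functions cannot be defined inside another function in C++, these helpers would need to live at namespace scope and be templated on the element type. A sketch of that variant; cuda_unique_ptr and make_device_buffer are illustrative names, not TinyDIP API:

#include <cstddef>
#include <memory>

// Deleter that releases device memory. Deliberately no CUDA_CHECK here:
// a deleter may run during stack unwinding and must not throw.
struct CudaFreeDeleter
{
    void operator()(void* block) const noexcept { cudaFree(block); }
};

template<typename ElementT>
using cuda_unique_ptr = std::unique_ptr<ElementT[], CudaFreeDeleter>;

template<typename ElementT>
cuda_unique_ptr<ElementT> make_device_buffer(std::size_t count)
{
    ElementT* block = nullptr;
    CUDA_CHECK(cudaMalloc(&block, count * sizeof(ElementT)));
    return cuda_unique_ptr<ElementT>(block);
}

The wrapper would then hold auto device_src = make_device_buffer<ElementT>(src.count()); and pass device_src.get() to cudaMemcpy and the kernel launch.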
