Merge pull request #28118 from sinkboy-chen:bugfix/cuda-race-condition

stitching: pass warp params by value to avoid CUDA constant races #28118 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under the Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV. - [x] The PR is proposed to the proper branch. - [x] There is a reference to the original bug report and related work. - [ ] There are accuracy tests, performance tests, and test data in the opencv_extra repository, if applicable. The patch to opencv_extra uses the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake configuration. Fixes #26870. In `modules/stitching/src/cuda/build_warp_maps.cu`, the original implementation copied parameters into global GPU constant symbols: ```cpp cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9 * sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9 * sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float))); ``` As discussed in the issue, this can cause race conditions when multiple warps are built concurrently. This patch removes the use of these global constant symbols and instead passes the required data as kernel parameters (a total of 11 floats encapsulated in `WarpParams`). One potential concern is increased register pressure due to additional kernel arguments. However, based on experiments using the test case from issue #26870, there is no significant performance regression; in fact, a small speed‑up was observed. Testing was performed on an NVIDIA GeForce RTX 4090 (single GPU). Note: ./bin/opencv_perf_stitching did not run successfully on my system even with an unmodified git checkout, so performance was evaluated using the test case from issue #26870 instead.
2026-01-15 12:15:17 +00:00 · 2025-12-03 13:51:59 +08:00
parent a08565253e
commit 4c7fd071f0
1 changed files with 58 additions and 32 deletions
--- a/modules/stitching/src/cuda/build_warp_maps.cu
+++ b/modules/stitching/src/cuda/build_warp_maps.cu
@@ -54,23 +54,23 @@ namespace cv { namespace cuda { namespace device
    {
        // TODO use intrinsics like __sinf and so on

-        namespace build_warp_maps
+        struct WarpParams
        {
-
-            __constant__ float ck_rinv[9];
-            __constant__ float cr_kinv[9];
-            __constant__ float ct[3];
-            __constant__ float cscale;
-        }
-
+            float k_rinv[9];
+            float r_kinv[9];
+            float t[3];
+            float scale;
+        };

        class PlaneMapper
        {
        public:
-            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y,
+                                                               const WarpParams& params)
            {
-                using namespace build_warp_maps;
-
+                const float *ck_rinv = params.k_rinv;
+                const float *ct = params.t;
+                const float cscale = params.scale;
                float x_ = u / cscale - ct[0];
                float y_ = v / cscale - ct[1];

@@ -88,10 +88,11 @@ namespace cv { namespace cuda { namespace device
        class CylindricalMapper
        {
        public:
-            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y,
+                                                               const WarpParams& params)
            {
-                using namespace build_warp_maps;
-
+                const float *ck_rinv = params.k_rinv;
+                const float cscale = params.scale;
                u /= cscale;
                float x_ = ::sinf(u);
                float y_ = v / cscale;
@@ -111,10 +112,11 @@ namespace cv { namespace cuda { namespace device
        class SphericalMapper
        {
        public:
-            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y,
+                                                               const WarpParams& params)
            {
-                using namespace build_warp_maps;
-
+                const float *ck_rinv = params.k_rinv;
+                const float cscale = params.scale;
                v /= cscale;
                u /= cscale;

@@ -136,7 +138,8 @@ namespace cv { namespace cuda { namespace device

        template <typename Mapper>
        __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
-                                            PtrStepf map_x, PtrStepf map_y)
+                                            PtrStepf map_x, PtrStepf map_y,
+                                            const WarpParams params)
        {
            int du = blockIdx.x * blockDim.x + threadIdx.x;
            int dv = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,7 +148,7 @@ namespace cv { namespace cuda { namespace device
                float u = tl_u + du;
                float v = tl_v + dv;
                float x, y;
-                Mapper::mapBackward(u, v, x, y);
+                Mapper::mapBackward(u, v, x, y, params);
                map_x.ptr(dv)[du] = x;
                map_y.ptr(dv)[du] = y;
            }
@@ -156,10 +159,16 @@ namespace cv { namespace cuda { namespace device
                                const float k_rinv[9], const float r_kinv[9], const float t[3],
                                float scale, cudaStream_t stream)
        {
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
+            WarpParams params;
+            for (int i = 0; i < 9; ++i)
+            {
+                params.k_rinv[i] = k_rinv[i];
+                params.r_kinv[i] = r_kinv[i];
+            }
+            params.t[0] = t[0];
+            params.t[1] = t[1];
+            params.t[2] = t[2];
+            params.scale = scale;

            int cols = map_x.cols;
            int rows = map_x.rows;
@@ -167,7 +176,8 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(32, 8);
            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

-            buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows,
+                                                               map_x, map_y, params);
            cudaSafeCall(cudaGetLastError());
            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
@@ -178,9 +188,16 @@ namespace cv { namespace cuda { namespace device
                                      const float k_rinv[9], const float r_kinv[9], float scale,
                                      cudaStream_t stream)
        {
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
+            WarpParams params;
+            for (int i = 0; i < 9; ++i)
+            {
+                params.k_rinv[i] = k_rinv[i];
+                params.r_kinv[i] = r_kinv[i];
+            }
+            params.t[0] = 0.f;
+            params.t[1] = 0.f;
+            params.t[2] = 0.f;
+            params.scale = scale;

            int cols = map_x.cols;
            int rows = map_x.rows;
@@ -188,7 +205,8 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(32, 8);
            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

-            buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows,
+                                                                     map_x, map_y, params);
            cudaSafeCall(cudaGetLastError());
            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
@@ -199,9 +217,16 @@ namespace cv { namespace cuda { namespace device
                                    const float k_rinv[9], const float r_kinv[9], float scale,
                                    cudaStream_t stream)
        {
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
+            WarpParams params;
+            for (int i = 0; i < 9; ++i)
+            {
+                params.k_rinv[i] = k_rinv[i];
+                params.r_kinv[i] = r_kinv[i];
+            }
+            params.t[0] = 0.f;
+            params.t[1] = 0.f;
+            params.t[2] = 0.f;
+            params.scale = scale;

            int cols = map_x.cols;
            int rows = map_x.rows;
@@ -209,7 +234,8 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(32, 8);
            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

-            buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows,
+                                                                   map_x, map_y, params);
            cudaSafeCall(cudaGetLastError());
            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());