Merge pull request #3081 from asmorkalov:as/stereo_bm_uniqueness_ratio

Uniqueness ratio support for cuda::StereoBM * Naive implementation of uniqueness ratio option for cuda::StereoBM. * Optimized memory consumption in cuda::stereoBM with uniqueness check. * Got rid of line_disps array. * Reduced line_ssds. * Apply streams for all parts of cuda::stereoBM::compute. * Added perf test for CUDA stereoBM with uniqueness check. * Optimized global memory transactions. * Restored sync data transfers as they use stack variables. * Do not use constant memory in stereoBM to exclude stream races. * Code review fixes.
2025-10-19 11:21:39 +08:00 · 2021-12-20 13:51:03 +03:00
parent 80ff29a653
commit ab006c9201
4 changed files with 255 additions and 66 deletions
--- a/modules/cudastereo/perf/perf_stereo.cpp
+++ b/modules/cudastereo/perf/perf_stereo.cpp
@@ -87,6 +87,45 @@ PERF_TEST_P(ImagePair, StereoBM,
    }
 }

+PERF_TEST_P(ImagePair, StereoBMwithUniqueness,
+            Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
+{
+    declare.time(300.0);
+
+    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgLeft.empty());
+
+    const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgRight.empty());
+
+    const int ndisp = 256;
+
+    if (PERF_RUN_CUDA())
+    {
+        cv::Ptr<cv::StereoBM> d_bm = cv::cuda::createStereoBM(ndisp);
+        d_bm->setUniquenessRatio(10);
+
+        const cv::cuda::GpuMat d_imgLeft(imgLeft);
+        const cv::cuda::GpuMat d_imgRight(imgRight);
+        cv::cuda::GpuMat dst;
+
+        TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst);
+
+        CUDA_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Ptr<cv::StereoBM> bm = cv::StereoBM::create(ndisp);
+        bm->setUniquenessRatio(10);
+
+        cv::Mat dst;
+
+        TEST_CYCLE() bm->compute(imgLeft, imgRight, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // StereoBeliefPropagation

--- a/modules/cudastereo/src/cuda/stereobm.cu
+++ b/modules/cudastereo/src/cuda/stereobm.cu
@@ -43,6 +43,7 @@
 #if !defined CUDA_DISABLER

 #include "opencv2/core/cuda/common.hpp"
+#include <limits.h>

 namespace cv { namespace cuda { namespace device
 {
@@ -60,18 +61,13 @@ namespace cv { namespace cuda { namespace device
        #define STEREO_MIND 0                    // The minimum d range to check
        #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing

-        __constant__ unsigned int* cminSSDImage;
-        __constant__ size_t cminSSD_step;
-        __constant__ int cwidth;
-        __constant__ int cheight;
-
        __device__ __forceinline__ int SQ(int a)
        {
            return a * a;
        }

        template<int RADIUS>
-        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
+        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth)
        {
            unsigned int cache = 0;
            unsigned int cache2 = 0;
@@ -99,26 +95,24 @@ namespace cv { namespace cuda { namespace device
        }

        template<int RADIUS>
-        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
+        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd)
        {
-            unsigned int ssd[N_DISPARITIES];
-
            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
            __syncthreads();
-            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X);
+            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);

            int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));

@@ -235,26 +229,27 @@ namespace cv { namespace cuda { namespace device
        }

        template<int RADIUS>
-        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
+        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp,
+                                     int uniquenessRatio, unsigned int* cminSSDImage, size_t cminSSD_step, int cwidth, int cheight)
        {
            extern __shared__ unsigned int col_ssd_cache[];
-            volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
-            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)
+            uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check
+            uint* batch_ssds = line_ssds + 2;

-            //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
-            int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
-            //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
-            #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
-            //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
+            uint line_ssd_tails[3*ROWSperTHREAD];
+            uchar uniqueness_approved[ROWSperTHREAD];
+            uchar local_disparity[ROWSperTHREAD];
+
+            volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
+            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;
+
+            const int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
+            const int Y = (blockIdx.y * ROWSperTHREAD + RADIUS);

            unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
            unsigned char* disparImage = disp.data + X + Y * disp.step;
-            //if (X < cwidth)
-            //{
-            //    unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-            //    for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-            //        *ptr = 0xFFFFFFFF;
-            //}
+            float thresh_scale;
+
            int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
            int y_tex;
            int x_tex = X - RADIUS;
@@ -262,6 +257,25 @@ namespace cv { namespace cuda { namespace device
            if (x_tex >= cwidth)
                return;

+            for(int i = 0; i < ROWSperTHREAD; i++)
+                local_disparity[i] = 0;
+
+            for(int i = 0; i < 3*ROWSperTHREAD; i++)
+            {
+                line_ssd_tails[i] = UINT_MAX;
+            }
+
+            if (uniquenessRatio > 0)
+            {
+                batch_ssds[6] = UINT_MAX;
+                batch_ssds[7] = UINT_MAX;
+                thresh_scale = (1.0 + uniquenessRatio / 100.0f);
+                for(int i = 0; i < ROWSperTHREAD; i++)
+                {
+                    uniqueness_approved[i] = 1;
+                }
+            }
+
            for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
            {
                y_tex = Y - RADIUS;
@@ -276,10 +290,10 @@ namespace cv { namespace cuda { namespace device

                if (Y < cheight - RADIUS)
                {
-                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
+                    uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);

                    // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
-                    // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                    // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
                    //
                    // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
                    // must also call "MinSSD" to avoid deadlock. (#13850)
@@ -290,10 +304,50 @@ namespace cv { namespace cuda { namespace device

                    if (X < cwidth - RADIUS)
                    {
-                        if (minSSD.x < minSSDImage[0])
+                        unsigned int last_opt = line_ssd_tails[3*0 + 0];
+                        unsigned int opt = ::min(last_opt, batch_opt.x);
+
+                        if (uniquenessRatio > 0)
                        {
-                            disparImage[0] = (unsigned char)(d + minSSD.y);
-                            minSSDImage[0] = minSSD.x;
+                            line_ssds[0] = line_ssd_tails[3*0 + 1];
+                            line_ssds[1] = line_ssd_tails[3*0 + 2];
+
+                            float thresh = thresh_scale * opt;
+                            int dtest = local_disparity[0];
+
+                            if(batch_opt.x < last_opt)
+                            {
+                                uniqueness_approved[0] = 1;
+                                dtest = d + batch_opt.y;
+                                if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh))
+                                {
+                                    uniqueness_approved[0] = 0;
+                                }
+                            }
+
+                            if(uniqueness_approved[0])
+                            {
+                                // the trial to decompose the code on 2 loops without ld vs dtest makes
+                                // uniqueness check dramatically slow. at least on gf 1080
+                                for (int ld = d-2; ld < d + N_DISPARITIES; ld++)
+                                {
+                                    if ((ld < dtest-1 || ld > dtest+1) && (line_ssds[ld-d+2] <= thresh))
+                                    {
+                                        uniqueness_approved[0] = 0;
+                                        break;
+                                    }
+                                }
+                            }
+
+
+                            line_ssd_tails[3*0 + 1] = batch_ssds[6];
+                            line_ssd_tails[3*0 + 2] = batch_ssds[7];
+                        }
+
+                        line_ssd_tails[3*0 + 0] = opt;
+                        if (batch_opt.x < last_opt)
+                        {
+                            local_disparity[0] = (unsigned char)(d + batch_opt.y);
                        }
                    }
                }
@@ -313,14 +367,13 @@ namespace cv { namespace cuda { namespace device

                    y_tex += 1;

-                    __syncthreads(); //before MinSSD function
+                    __syncthreads();

                    if (row < cheight - RADIUS - Y)
                    {
-                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
-
+                        uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
                        // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
-                        // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                        // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
                        //
                        // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
                        // must also call "MinSSD" to avoid deadlock. (#13850)
@@ -331,11 +384,47 @@ namespace cv { namespace cuda { namespace device

                        if (X < cwidth - RADIUS)
                        {
-                            int idx = row * cminSSD_step;
-                            if (minSSD.x < minSSDImage[idx])
+                            unsigned int last_opt = line_ssd_tails[3*row + 0];
+                            unsigned int opt = ::min(last_opt, batch_opt.x);
+                            if (uniquenessRatio > 0)
                            {
-                                disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                                minSSDImage[idx] = minSSD.x;
+                                line_ssds[0] = line_ssd_tails[3*row + 1];
+                                line_ssds[1] = line_ssd_tails[3*row + 2];
+
+                                float thresh = thresh_scale * opt;
+                                int dtest = local_disparity[row];
+
+                                if(batch_opt.x < last_opt)
+                                {
+                                    uniqueness_approved[row] = 1;
+                                    dtest = d + batch_opt.y;
+                                    if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh))
+                                    {
+                                        uniqueness_approved[row] = 0;
+                                    }
+                                }
+
+                                if(uniqueness_approved[row])
+                                {
+                                    for (int ld = 0; ld < N_DISPARITIES + 2; ld++)
+                                    {
+                                        if (((d+ld-2 < dtest-1) || (d+ld-2 > dtest+1)) && (line_ssds[ld] <= thresh))
+                                        {
+                                            uniqueness_approved[row] = 0;
+                                            break;
+                                        }
+                                    }
+                                }
+
+                                line_ssd_tails[3*row + 1] = batch_ssds[6];
+                                line_ssd_tails[3*row + 2] = batch_ssds[7];
+                            }
+
+                            line_ssd_tails[3*row + 0] = opt;
+
+                            if (batch_opt.x < last_opt)
+                            {
+                                local_disparity[row] = (unsigned char)(d + batch_opt.y);
                            }
                        }
                    }
@@ -344,10 +433,32 @@ namespace cv { namespace cuda { namespace device
                __syncthreads(); // before initializing shared memory at the beginning of next loop

            } // for d loop
+
+            for (int row = 0; row < end_row; row++)
+            {
+                minSSDImage[row * cminSSD_step] = line_ssd_tails[3*row + 0];
+            }
+
+            if (uniquenessRatio > 0)
+            {
+                for (int row = 0; row < end_row; row++)
+                {
+                    // drop disparity for pixel where uniqueness requirement was not satisfied (zero value)
+                    disparImage[disp.step * row] = local_disparity[row] * uniqueness_approved[row];
+                }
+            }
+            else
+            {
+                for (int row = 0; row < end_row; row++)
+                {
+                    disparImage[disp.step * row] = local_disparity[row];
+                }
+            }
        }

-
-        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
+        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
+                                                int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
+                                                size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream)
        {
            dim3 grid(1,1,1);
            dim3 threads(BLOCK_W, 1, 1);
@@ -358,14 +469,17 @@ namespace cv { namespace cuda { namespace device
            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
            size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);

-            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
+            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio,
+                                                                       missd_buffer, minssd_step, cwidth, cheight);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        };

-        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
+        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
+                                        int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
+                                        size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream);

        const static kernel_caller_t callers[] =
        {
@@ -380,27 +494,19 @@ namespace cv { namespace cuda { namespace device
        };
        const int calles_num = sizeof(callers)/sizeof(callers[0]);

-        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
+        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp,
+                           int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
        {
            int winsz2 = winsz >> 1;

            if (winsz2 == 0 || winsz2 >= calles_num)
                CV_Error(cv::Error::StsBadArg, "Unsupported window size");

-            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
-            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
-
-            cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
-            cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
-
-            cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
+            cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) );
+            cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) );

            size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &minssd_step, sizeof(minssd_step) ) );
-
-            callers[winsz2](left, right, disp, maxdisp, stream);
+            callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream);
        }

        __device__ inline int clamp(int x, int a, int b)
--- a/modules/cudastereo/src/stereobm.cpp
+++ b/modules/cudastereo/src/stereobm.cpp
@@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
 {
    namespace stereobm
    {
-        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
+        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
        void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream);
        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
@@ -102,8 +102,8 @@ namespace
        int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); }
        void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); }

-        int getUniquenessRatio() const { return 0; }
-        void setUniquenessRatio(int /*uniquenessRatio*/) {}
+        int getUniquenessRatio() const { return uniquenessRatio_; }
+        void setUniquenessRatio(int uniquenessRatio) { uniquenessRatio_ = uniquenessRatio; }

        int getSmallerBlockSize() const { return 0; }
        void setSmallerBlockSize(int /*blockSize*/){}
@@ -121,12 +121,13 @@ namespace
        int preFilterCap_;
        float avergeTexThreshold_;
        int preFilterSize_;
+        int uniquenessRatio_;

        GpuMat minSSD_, leBuf_, riBuf_;
    };

    StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
-        : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9)
+        : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9), uniquenessRatio_(0)
    {
    }

@@ -183,7 +184,7 @@ namespace
            ri_for_bm = riBuf_;
        }

-        stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
+        stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, uniquenessRatio_, minSSD_, stream);

        if (avergeTexThreshold_ > 0)
            postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
--- a/modules/cudastereo/test/test_stereo.cpp
+++ b/modules/cudastereo/test/test_stereo.cpp
@@ -122,6 +122,49 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression)
    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
 }

+CUDA_TEST_P(StereoBM, Streams)
+{
+    cv::cuda::Stream stream;
+    cv::Mat left_image  = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat disp_gold   = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
+
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+
+    cv::Ptr<cv::cuda::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
+    cv::cuda::GpuMat disp;
+
+    bm->compute(loadMat(left_image), loadMat(right_image), disp, stream);
+    stream.waitForCompletion();
+
+    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
+}
+
+CUDA_TEST_P(StereoBM, Uniqueness_Regression)
+{
+    cv::Mat left_image  = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat disp_gold   = readImage("stereobm/aloe-disp-uniqueness15.png", cv::IMREAD_GRAYSCALE);
+
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+
+    cv::Ptr<cv::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
+    cv::cuda::GpuMat disp;
+
+    bm->setUniquenessRatio(15);
+    bm->compute(loadMat(left_image), loadMat(right_image), disp);
+
+    cv::Mat disp_cpu;
+    disp.download(disp_cpu);
+    cv::imwrite("disp_inq15.png", disp_cpu);
+
+    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES);

 //////////////////////////////////////////////////////////////////////////