1
0
mirror of https://github.com/opencv/opencv_contrib.git synced 2025-10-19 11:21:39 +08:00

Merge pull request #3081 from asmorkalov:as/stereo_bm_uniqueness_ratio

Uniqueness ratio support for cuda::StereoBM

* Naive implementation of uniqueness ratio option for cuda::StereoBM.

* Optimized memory consumption in cuda::stereoBM with uniqueness check.

* Got rid of line_disps array.

* Reduced line_ssds.

* Apply streams for all parts of cuda::stereoBM::compute.

* Added perf test for CUDA stereoBM with uniqueness check.

* Optimized global memory transactions.

* Restored sync data transfers as they use stack variables.

* Do not use constant memory in stereoBM to exclude stream races.

* Code review fixes.
This commit is contained in:
Alexander Smorkalov
2021-12-20 13:51:03 +03:00
committed by GitHub
parent 80ff29a653
commit ab006c9201
4 changed files with 255 additions and 66 deletions

View File

@@ -87,6 +87,45 @@ PERF_TEST_P(ImagePair, StereoBM,
}
}
PERF_TEST_P(ImagePair, StereoBMwithUniqueness,
Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int ndisp = 256;
if (PERF_RUN_CUDA())
{
cv::Ptr<cv::StereoBM> d_bm = cv::cuda::createStereoBM(ndisp);
d_bm->setUniquenessRatio(10);
const cv::cuda::GpuMat d_imgLeft(imgLeft);
const cv::cuda::GpuMat d_imgRight(imgRight);
cv::cuda::GpuMat dst;
TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Ptr<cv::StereoBM> bm = cv::StereoBM::create(ndisp);
bm->setUniquenessRatio(10);
cv::Mat dst;
TEST_CYCLE() bm->compute(imgLeft, imgRight, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation

View File

@@ -43,6 +43,7 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include <limits.h>
namespace cv { namespace cuda { namespace device
{
@@ -60,18 +61,13 @@ namespace cv { namespace cuda { namespace device
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
__constant__ unsigned int* cminSSDImage;
__constant__ size_t cminSSD_step;
__constant__ int cwidth;
__constant__ int cheight;
__device__ __forceinline__ int SQ(int a)
{
return a * a;
}
template<int RADIUS>
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
@@ -99,26 +95,24 @@ namespace cv { namespace cuda { namespace device
}
template<int RADIUS>
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd)
{
unsigned int ssd[N_DISPARITIES];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X);
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X);
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X);
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X);
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X);
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X);
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X);
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X);
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
@@ -235,26 +229,27 @@ namespace cv { namespace cuda { namespace device
}
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp,
int uniquenessRatio, unsigned int* cminSSDImage, size_t cminSSD_step, int cwidth, int cheight)
{
extern __shared__ unsigned int col_ssd_cache[];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check
uint* batch_ssds = line_ssds + 2;
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
//#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
#define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
uint line_ssd_tails[3*ROWSperTHREAD];
uchar uniqueness_approved[ROWSperTHREAD];
uchar local_disparity[ROWSperTHREAD];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;
const int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
const int Y = (blockIdx.y * ROWSperTHREAD + RADIUS);
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp.data + X + Y * disp.step;
//if (X < cwidth)
//{
// unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
// for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
// *ptr = 0xFFFFFFFF;
//}
float thresh_scale;
int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
int y_tex;
int x_tex = X - RADIUS;
@@ -262,6 +257,25 @@ namespace cv { namespace cuda { namespace device
if (x_tex >= cwidth)
return;
for(int i = 0; i < ROWSperTHREAD; i++)
local_disparity[i] = 0;
for(int i = 0; i < 3*ROWSperTHREAD; i++)
{
line_ssd_tails[i] = UINT_MAX;
}
if (uniquenessRatio > 0)
{
batch_ssds[6] = UINT_MAX;
batch_ssds[7] = UINT_MAX;
thresh_scale = (1.0 + uniquenessRatio / 100.0f);
for(int i = 0; i < ROWSperTHREAD; i++)
{
uniqueness_approved[i] = 1;
}
}
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
y_tex = Y - RADIUS;
@@ -276,10 +290,10 @@ namespace cv { namespace cuda { namespace device
if (Y < cheight - RADIUS)
{
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
// For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
// computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
// computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
//
// However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
// must also call "MinSSD" to avoid deadlock. (#13850)
@@ -290,10 +304,50 @@ namespace cv { namespace cuda { namespace device
if (X < cwidth - RADIUS)
{
if (minSSD.x < minSSDImage[0])
unsigned int last_opt = line_ssd_tails[3*0 + 0];
unsigned int opt = ::min(last_opt, batch_opt.x);
if (uniquenessRatio > 0)
{
disparImage[0] = (unsigned char)(d + minSSD.y);
minSSDImage[0] = minSSD.x;
line_ssds[0] = line_ssd_tails[3*0 + 1];
line_ssds[1] = line_ssd_tails[3*0 + 2];
float thresh = thresh_scale * opt;
int dtest = local_disparity[0];
if(batch_opt.x < last_opt)
{
uniqueness_approved[0] = 1;
dtest = d + batch_opt.y;
if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh))
{
uniqueness_approved[0] = 0;
}
}
if(uniqueness_approved[0])
{
// the trial to decompose the code on 2 loops without ld vs dtest makes
// uniqueness check dramatically slow. at least on gf 1080
for (int ld = d-2; ld < d + N_DISPARITIES; ld++)
{
if ((ld < dtest-1 || ld > dtest+1) && (line_ssds[ld-d+2] <= thresh))
{
uniqueness_approved[0] = 0;
break;
}
}
}
line_ssd_tails[3*0 + 1] = batch_ssds[6];
line_ssd_tails[3*0 + 2] = batch_ssds[7];
}
line_ssd_tails[3*0 + 0] = opt;
if (batch_opt.x < last_opt)
{
local_disparity[0] = (unsigned char)(d + batch_opt.y);
}
}
}
@@ -313,14 +367,13 @@ namespace cv { namespace cuda { namespace device
y_tex += 1;
__syncthreads(); //before MinSSD function
__syncthreads();
if (row < cheight - RADIUS - Y)
{
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
// For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
// computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
// computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
//
// However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
// must also call "MinSSD" to avoid deadlock. (#13850)
@@ -331,11 +384,47 @@ namespace cv { namespace cuda { namespace device
if (X < cwidth - RADIUS)
{
int idx = row * cminSSD_step;
if (minSSD.x < minSSDImage[idx])
unsigned int last_opt = line_ssd_tails[3*row + 0];
unsigned int opt = ::min(last_opt, batch_opt.x);
if (uniquenessRatio > 0)
{
disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
line_ssds[0] = line_ssd_tails[3*row + 1];
line_ssds[1] = line_ssd_tails[3*row + 2];
float thresh = thresh_scale * opt;
int dtest = local_disparity[row];
if(batch_opt.x < last_opt)
{
uniqueness_approved[row] = 1;
dtest = d + batch_opt.y;
if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh))
{
uniqueness_approved[row] = 0;
}
}
if(uniqueness_approved[row])
{
for (int ld = 0; ld < N_DISPARITIES + 2; ld++)
{
if (((d+ld-2 < dtest-1) || (d+ld-2 > dtest+1)) && (line_ssds[ld] <= thresh))
{
uniqueness_approved[row] = 0;
break;
}
}
}
line_ssd_tails[3*row + 1] = batch_ssds[6];
line_ssd_tails[3*row + 2] = batch_ssds[7];
}
line_ssd_tails[3*row + 0] = opt;
if (batch_opt.x < last_opt)
{
local_disparity[row] = (unsigned char)(d + batch_opt.y);
}
}
}
@@ -344,10 +433,32 @@ namespace cv { namespace cuda { namespace device
__syncthreads(); // before initializing shared memory at the beginning of next loop
} // for d loop
for (int row = 0; row < end_row; row++)
{
minSSDImage[row * cminSSD_step] = line_ssd_tails[3*row + 0];
}
if (uniquenessRatio > 0)
{
for (int row = 0; row < end_row; row++)
{
// drop disparity for pixel where uniqueness requirement was not satisfied (zero value)
disparImage[disp.step * row] = local_disparity[row] * uniqueness_approved[row];
}
}
else
{
for (int row = 0; row < end_row; row++)
{
disparImage[disp.step * row] = local_disparity[row];
}
}
}
template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream)
{
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
@@ -358,14 +469,17 @@ namespace cv { namespace cuda { namespace device
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio,
missd_buffer, minssd_step, cwidth, cheight);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream);
const static kernel_caller_t callers[] =
{
@@ -380,27 +494,19 @@ namespace cv { namespace cuda { namespace device
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp,
int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
CV_Error(cv::Error::StsBadArg, "Unsupported window size");
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) );
cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp, stream);
callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream);
}
__device__ inline int clamp(int x, int a, int b)

View File

@@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
{
namespace stereobm
{
void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream);
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
@@ -102,8 +102,8 @@ namespace
int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); }
void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); }
int getUniquenessRatio() const { return 0; }
void setUniquenessRatio(int /*uniquenessRatio*/) {}
int getUniquenessRatio() const { return uniquenessRatio_; }
void setUniquenessRatio(int uniquenessRatio) { uniquenessRatio_ = uniquenessRatio; }
int getSmallerBlockSize() const { return 0; }
void setSmallerBlockSize(int /*blockSize*/){}
@@ -121,12 +121,13 @@ namespace
int preFilterCap_;
float avergeTexThreshold_;
int preFilterSize_;
int uniquenessRatio_;
GpuMat minSSD_, leBuf_, riBuf_;
};
StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
: preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9)
: preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9), uniquenessRatio_(0)
{
}
@@ -183,7 +184,7 @@ namespace
ri_for_bm = riBuf_;
}
stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, uniquenessRatio_, minSSD_, stream);
if (avergeTexThreshold_ > 0)
postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);

View File

@@ -122,6 +122,49 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression)
EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}
CUDA_TEST_P(StereoBM, Streams)
{
cv::cuda::Stream stream;
cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
cv::Mat disp_gold = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(left_image.empty());
ASSERT_FALSE(right_image.empty());
ASSERT_FALSE(disp_gold.empty());
cv::Ptr<cv::cuda::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
cv::cuda::GpuMat disp;
bm->compute(loadMat(left_image), loadMat(right_image), disp, stream);
stream.waitForCompletion();
EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}
CUDA_TEST_P(StereoBM, Uniqueness_Regression)
{
cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
cv::Mat disp_gold = readImage("stereobm/aloe-disp-uniqueness15.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(left_image.empty());
ASSERT_FALSE(right_image.empty());
ASSERT_FALSE(disp_gold.empty());
cv::Ptr<cv::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
cv::cuda::GpuMat disp;
bm->setUniquenessRatio(15);
bm->compute(loadMat(left_image), loadMat(right_image), disp);
cv::Mat disp_cpu;
disp.download(disp_cpu);
cv::imwrite("disp_inq15.png", disp_cpu);
EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}
INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////////