mirror of https://github.com/opencv/opencv_contrib.git
Merge pull request #3081 from asmorkalov:as/stereo_bm_uniqueness_ratio
Uniqueness ratio support for cuda::StereoBM

* Naive implementation of the uniqueness ratio option for cuda::StereoBM.
* Optimized memory consumption in cuda::StereoBM with the uniqueness check.
* Got rid of the line_disps array.
* Reduced line_ssds.
* Apply streams for all parts of cuda::StereoBM::compute.
* Added a perf test for CUDA StereoBM with the uniqueness check.
* Optimized global memory transactions.
* Restored synchronous data transfers, as they use stack variables.
* Do not use constant memory in StereoBM, to exclude stream races.
* Code review fixes.
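For orientation, a minimal usage sketch of the new option (not part of the commit; the image paths and the ratio value are illustrative, the calls mirror the perf and regression tests below):

    #include <opencv2/cudastereo.hpp>
    #include <opencv2/imgcodecs.hpp>

    int main()
    {
        // Rectified grayscale stereo pair (paths are placeholders).
        cv::Mat left  = cv::imread("left.png",  cv::IMREAD_GRAYSCALE);
        cv::Mat right = cv::imread("right.png", cv::IMREAD_GRAYSCALE);

        // 128 disparities, 19x19 block, as in the regression test below.
        cv::Ptr<cv::cuda::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
        bm->setUniquenessRatio(15);           // 0 disables the check (previous behaviour)

        cv::cuda::GpuMat d_left(left), d_right(right), d_disp;
        bm->compute(d_left, d_right, d_disp); // pixels failing the check get disparity 0
        return 0;
    }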
Committed by GitHub.
parent 80ff29a653
commit ab006c9201
@@ -87,6 +87,45 @@ PERF_TEST_P(ImagePair, StereoBM,
    }
}

PERF_TEST_P(ImagePair, StereoBMwithUniqueness,
            Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
    declare.time(300.0);

    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(imgLeft.empty());

    const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(imgRight.empty());

    const int ndisp = 256;

    if (PERF_RUN_CUDA())
    {
        cv::Ptr<cv::StereoBM> d_bm = cv::cuda::createStereoBM(ndisp);
        d_bm->setUniquenessRatio(10);

        const cv::cuda::GpuMat d_imgLeft(imgLeft);
        const cv::cuda::GpuMat d_imgRight(imgRight);
        cv::cuda::GpuMat dst;

        TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst);

        CUDA_SANITY_CHECK(dst);
    }
    else
    {
        cv::Ptr<cv::StereoBM> bm = cv::StereoBM::create(ndisp);
        bm->setUniquenessRatio(10);

        cv::Mat dst;

        TEST_CYCLE() bm->compute(imgLeft, imgRight, dst);

        CPU_SANITY_CHECK(dst);
    }
}

//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation

@@ -43,6 +43,7 @@
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include <limits.h>

namespace cv { namespace cuda { namespace device
{
@@ -60,18 +61,13 @@ namespace cv { namespace cuda { namespace device
    #define STEREO_MIND 0 // The minimum d range to check
    #define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing

    __constant__ unsigned int* cminSSDImage;
    __constant__ size_t cminSSD_step;
    __constant__ int cwidth;
    __constant__ int cheight;

    __device__ __forceinline__ int SQ(int a)
    {
        return a * a;
    }

    template<int RADIUS>
    __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
    __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth)
    {
        unsigned int cache = 0;
        unsigned int cache2 = 0;
@@ -99,26 +95,24 @@ namespace cv { namespace cuda { namespace device
    }

    template<int RADIUS>
    __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
    __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd)
    {
        unsigned int ssd[N_DISPARITIES];

        //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
        ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X);
        ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X);
        ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X);
        ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X);
        ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X);
        ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X);
        ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X);
        ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
        __syncthreads();
        ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X);
        ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);

        int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));

@@ -235,26 +229,27 @@ namespace cv { namespace cuda { namespace device
    }

    template<int RADIUS>
    __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
    __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp,
                                 int uniquenessRatio, unsigned int* cminSSDImage, size_t cminSSD_step, int cwidth, int cheight)
    {
        extern __shared__ unsigned int col_ssd_cache[];
        volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
        volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
        uint line_ssds[2 + N_DISPARITIES]; // +2 - tail of previous batch for accurate uniquenessRatio check
        uint* batch_ssds = line_ssds + 2;

        //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
        int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
        //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
        #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
        //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
        uint line_ssd_tails[3*ROWSperTHREAD];
        uchar uniqueness_approved[ROWSperTHREAD];
        uchar local_disparity[ROWSperTHREAD];

        volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
        volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;

        const int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
        const int Y = (blockIdx.y * ROWSperTHREAD + RADIUS);

        unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
        unsigned char* disparImage = disp.data + X + Y * disp.step;
        //if (X < cwidth)
        //{
        // unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
        // for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
        // *ptr = 0xFFFFFFFF;
        //}
        float thresh_scale;

        int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
        int y_tex;
        int x_tex = X - RADIUS;
@@ -262,6 +257,25 @@ namespace cv { namespace cuda { namespace device
        if (x_tex >= cwidth)
            return;

        for(int i = 0; i < ROWSperTHREAD; i++)
            local_disparity[i] = 0;

        for(int i = 0; i < 3*ROWSperTHREAD; i++)
        {
            line_ssd_tails[i] = UINT_MAX;
        }

        if (uniquenessRatio > 0)
        {
            batch_ssds[6] = UINT_MAX;
            batch_ssds[7] = UINT_MAX;
            thresh_scale = (1.0 + uniquenessRatio / 100.0f);
            for(int i = 0; i < ROWSperTHREAD; i++)
            {
                uniqueness_approved[i] = 1;
            }
        }

        for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
        {
            y_tex = Y - RADIUS;
@@ -276,10 +290,10 @@ namespace cv { namespace cuda { namespace device
            if (Y < cheight - RADIUS)
            {
                uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
                uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);

                // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
                // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
                // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
                //
                // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
                // must also call "MinSSD" to avoid deadlock. (#13850)
@@ -290,10 +304,50 @@ namespace cv { namespace cuda { namespace device
                if (X < cwidth - RADIUS)
                {
                    if (minSSD.x < minSSDImage[0])
                    unsigned int last_opt = line_ssd_tails[3*0 + 0];
                    unsigned int opt = ::min(last_opt, batch_opt.x);

                    if (uniquenessRatio > 0)
                    {
                        disparImage[0] = (unsigned char)(d + minSSD.y);
                        minSSDImage[0] = minSSD.x;
                        line_ssds[0] = line_ssd_tails[3*0 + 1];
                        line_ssds[1] = line_ssd_tails[3*0 + 2];

                        float thresh = thresh_scale * opt;
                        int dtest = local_disparity[0];

                        if(batch_opt.x < last_opt)
                        {
                            uniqueness_approved[0] = 1;
                            dtest = d + batch_opt.y;
                            if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh))
                            {
                                uniqueness_approved[0] = 0;
                            }
                        }

                        if(uniqueness_approved[0])
                        {
                            // Splitting this code into two loops (without the ld vs dtest check) makes the
                            // uniqueness check dramatically slower, at least on a GeForce GTX 1080.
                            for (int ld = d-2; ld < d + N_DISPARITIES; ld++)
                            {
                                if ((ld < dtest-1 || ld > dtest+1) && (line_ssds[ld-d+2] <= thresh))
                                {
                                    uniqueness_approved[0] = 0;
                                    break;
                                }
                            }
                        }

                        line_ssd_tails[3*0 + 1] = batch_ssds[6];
                        line_ssd_tails[3*0 + 2] = batch_ssds[7];
                    }

                    line_ssd_tails[3*0 + 0] = opt;
                    if (batch_opt.x < last_opt)
                    {
                        local_disparity[0] = (unsigned char)(d + batch_opt.y);
                    }
                }
            }
@@ -313,14 +367,13 @@ namespace cv { namespace cuda { namespace device
            y_tex += 1;

            __syncthreads(); //before MinSSD function
            __syncthreads();

            if (row < cheight - RADIUS - Y)
            {
                uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);

                uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
                // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
                // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
                // computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
                //
                // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
                // must also call "MinSSD" to avoid deadlock. (#13850)
@@ -331,11 +384,47 @@ namespace cv { namespace cuda { namespace device
                if (X < cwidth - RADIUS)
                {
                    int idx = row * cminSSD_step;
                    if (minSSD.x < minSSDImage[idx])
                    unsigned int last_opt = line_ssd_tails[3*row + 0];
                    unsigned int opt = ::min(last_opt, batch_opt.x);
                    if (uniquenessRatio > 0)
                    {
                        disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
                        minSSDImage[idx] = minSSD.x;
                        line_ssds[0] = line_ssd_tails[3*row + 1];
                        line_ssds[1] = line_ssd_tails[3*row + 2];

                        float thresh = thresh_scale * opt;
                        int dtest = local_disparity[row];

                        if(batch_opt.x < last_opt)
                        {
                            uniqueness_approved[row] = 1;
                            dtest = d + batch_opt.y;
                            if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh))
                            {
                                uniqueness_approved[row] = 0;
                            }
                        }

                        if(uniqueness_approved[row])
                        {
                            for (int ld = 0; ld < N_DISPARITIES + 2; ld++)
                            {
                                if (((d+ld-2 < dtest-1) || (d+ld-2 > dtest+1)) && (line_ssds[ld] <= thresh))
                                {
                                    uniqueness_approved[row] = 0;
                                    break;
                                }
                            }
                        }

                        line_ssd_tails[3*row + 1] = batch_ssds[6];
                        line_ssd_tails[3*row + 2] = batch_ssds[7];
                    }

                    line_ssd_tails[3*row + 0] = opt;

                    if (batch_opt.x < last_opt)
                    {
                        local_disparity[row] = (unsigned char)(d + batch_opt.y);
                    }
                }
            }
@@ -344,10 +433,32 @@ namespace cv { namespace cuda { namespace device
            __syncthreads(); // before initializing shared memory at the beginning of next loop

        } // for d loop

        for (int row = 0; row < end_row; row++)
        {
            minSSDImage[row * cminSSD_step] = line_ssd_tails[3*row + 0];
        }

        if (uniquenessRatio > 0)
        {
            for (int row = 0; row < end_row; row++)
            {
                // drop disparity for pixel where uniqueness requirement was not satisfied (zero value)
                disparImage[disp.step * row] = local_disparity[row] * uniqueness_approved[row];
            }
        }
        else
        {
            for (int row = 0; row < end_row; row++)
            {
                disparImage[disp.step * row] = local_disparity[row];
            }
        }
    }
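For reference, the per-pixel criterion that the kernel accumulates incrementally over batches of N_DISPARITIES candidates can be summarized as follows (a host-side sketch inferred from the code above, not part of the commit; the function and parameter names are hypothetical): the winning disparity is kept only if no other disparity, outside the range best-1..best+1, has an SSD cost within (1 + uniquenessRatio/100) times the best cost.

    #include <vector>

    // Hypothetical illustration of the uniqueness test implemented incrementally in stereoKernel.
    // 'costs' holds the SSD for every tested disparity; 'best' is the index of the minimum cost.
    static bool uniquenessApproved(const std::vector<unsigned int>& costs, int best, int uniquenessRatio)
    {
        const float thresh = (1.0f + uniquenessRatio / 100.0f) * costs[best];
        for (int d = 0; d < static_cast<int>(costs.size()); ++d)
        {
            if ((d < best - 1 || d > best + 1) && costs[d] <= thresh)
                return false;   // a non-adjacent disparity is almost as good: reject this pixel
        }
        return true;            // the pixel keeps its disparity; rejected pixels get disparity 0
    }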

    template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
    template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
                                            int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
                                            size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream)
    {
        dim3 grid(1,1,1);
        dim3 threads(BLOCK_W, 1, 1);
@@ -358,14 +469,17 @@ namespace cv { namespace cuda { namespace device
        //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
        size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);

        stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
        stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp, uniquenessRatio,
                                                                   missd_buffer, minssd_step, cwidth, cheight);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    };

    typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
    typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp,
                                    int maxdisp, int uniquenessRatio, unsigned int* missd_buffer,
                                    size_t minssd_step, int cwidth, int cheight, cudaStream_t & stream);

    const static kernel_caller_t callers[] =
    {
@@ -380,27 +494,19 @@ namespace cv { namespace cuda { namespace device
    };
    const int calles_num = sizeof(callers)/sizeof(callers[0]);

    void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
    void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp,
                       int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
    {
        int winsz2 = winsz >> 1;

        if (winsz2 == 0 || winsz2 >= calles_num)
            CV_Error(cv::Error::StsBadArg, "Unsupported window size");

        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );

        cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
        cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );

        cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
        cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
        cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
        cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) );
        cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) );

        size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
        cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );

        callers[winsz2](left, right, disp, maxdisp, stream);
        callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream);
    }

    __device__ inline int clamp(int x, int a, int b)
@@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
{
    namespace stereobm
    {
        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
        void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, int uniquenessRatio, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
        void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream);
        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
@@ -102,8 +102,8 @@ namespace
    int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); }
    void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); }

    int getUniquenessRatio() const { return 0; }
    void setUniquenessRatio(int /*uniquenessRatio*/) {}
    int getUniquenessRatio() const { return uniquenessRatio_; }
    void setUniquenessRatio(int uniquenessRatio) { uniquenessRatio_ = uniquenessRatio; }

    int getSmallerBlockSize() const { return 0; }
    void setSmallerBlockSize(int /*blockSize*/){}
@@ -121,12 +121,13 @@ namespace
    int preFilterCap_;
    float avergeTexThreshold_;
    int preFilterSize_;
    int uniquenessRatio_;

    GpuMat minSSD_, leBuf_, riBuf_;
};

StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
    : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9)
    : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9), uniquenessRatio_(0)
{
}
@@ -183,7 +184,7 @@ namespace
        ri_for_bm = riBuf_;
    }

    stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
    stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, uniquenessRatio_, minSSD_, stream);

    if (avergeTexThreshold_ > 0)
        postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
@@ -122,6 +122,49 @@ CUDA_TEST_P(StereoBM, PrefilterNormRegression)
    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}

CUDA_TEST_P(StereoBM, Streams)
{
    cv::cuda::Stream stream;
    cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
    cv::Mat disp_gold = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);

    ASSERT_FALSE(left_image.empty());
    ASSERT_FALSE(right_image.empty());
    ASSERT_FALSE(disp_gold.empty());

    cv::Ptr<cv::cuda::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
    cv::cuda::GpuMat disp;

    bm->compute(loadMat(left_image), loadMat(right_image), disp, stream);
    stream.waitForCompletion();

    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}

CUDA_TEST_P(StereoBM, Uniqueness_Regression)
{
    cv::Mat left_image = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
    cv::Mat disp_gold = readImage("stereobm/aloe-disp-uniqueness15.png", cv::IMREAD_GRAYSCALE);

    ASSERT_FALSE(left_image.empty());
    ASSERT_FALSE(right_image.empty());
    ASSERT_FALSE(disp_gold.empty());

    cv::Ptr<cv::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
    cv::cuda::GpuMat disp;

    bm->setUniquenessRatio(15);
    bm->compute(loadMat(left_image), loadMat(right_image), disp);

    cv::Mat disp_cpu;
    disp.download(disp_cpu);
    cv::imwrite("disp_inq15.png", disp_cpu);

    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES);

//////////////////////////////////////////////////////////////////////////