mirror of
https://github.com/opencv/opencv_contrib.git
synced 2025-10-20 12:55:15 +08:00
Merge pull request #3803 from cudawarped:cuda_update_to_npp_stream_ctx
cuda: update NPP calls to use the new NppStreamContext API if available
This commit is contained in:
@@ -79,7 +79,11 @@ namespace
|
||||
{
|
||||
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
|
||||
|
||||
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
|
||||
#if USE_NPP_STREAM_CTX
|
||||
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx);
|
||||
#else
|
||||
typedef NppStatus(*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
|
||||
#endif
|
||||
};
|
||||
|
||||
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
|
||||
@@ -94,9 +98,15 @@ namespace
|
||||
sz.width = src.cols;
|
||||
sz.height = src.rows;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
|
||||
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
|
||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
|
||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) );
|
||||
#else
|
||||
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
|
||||
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
|
||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
@@ -107,7 +117,11 @@ namespace
|
||||
{
|
||||
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
|
||||
|
||||
typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
|
||||
#if USE_NPP_STREAM_CTX
|
||||
typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx);
|
||||
#else
|
||||
typedef NppStatus(*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
|
||||
#endif
|
||||
};
|
||||
|
||||
template <int DEPTH, typename NppMirrorIFunc<DEPTH>::func_t func> struct NppMirrorI
|
||||
@@ -121,10 +135,15 @@ namespace
|
||||
NppiSize sz;
|
||||
sz.width = srcDst.cols;
|
||||
sz.height = srcDst.rows;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(func(srcDst.ptr<npp_t>(), static_cast<int>(srcDst.step),
|
||||
sz,
|
||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) );
|
||||
#else
|
||||
nppSafeCall( func(srcDst.ptr<npp_t>(), static_cast<int>(srcDst.step),
|
||||
sz,
|
||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
@@ -137,23 +156,41 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
|
||||
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
|
||||
static const func_t funcs[6][4] =
|
||||
{
|
||||
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
|
||||
#if USE_NPP_STREAM_CTX
|
||||
{NppMirror<CV_8U, nppiMirror_8u_C1R_Ctx>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R_Ctx>::call, NppMirror<CV_8U, nppiMirror_8u_C4R_Ctx>::call},
|
||||
{0,0,0,0},
|
||||
{NppMirror<CV_16U, nppiMirror_16u_C1R_Ctx>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R_Ctx>::call, NppMirror<CV_16U, nppiMirror_16u_C4R_Ctx>::call},
|
||||
{0,0,0,0},
|
||||
{NppMirror<CV_32S, nppiMirror_32s_C1R_Ctx>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R_Ctx>::call, NppMirror<CV_32S, nppiMirror_32s_C4R_Ctx>::call},
|
||||
{NppMirror<CV_32F, nppiMirror_32f_C1R_Ctx>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R_Ctx>::call, NppMirror<CV_32F, nppiMirror_32f_C4R_Ctx>::call}
|
||||
#else
|
||||
{ NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call },
|
||||
{0,0,0,0},
|
||||
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
|
||||
{0,0,0,0},
|
||||
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
|
||||
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
|
||||
#endif
|
||||
};
|
||||
|
||||
typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream);
|
||||
static const ifunc_t ifuncs[6][4] =
|
||||
{
|
||||
{NppMirrorI<CV_8U, nppiMirror_8u_C1IR>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR>::call},
|
||||
#if USE_NPP_STREAM_CTX
|
||||
{NppMirrorI<CV_8U, nppiMirror_8u_C1IR_Ctx>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR_Ctx>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR_Ctx>::call},
|
||||
{0,0,0,0},
|
||||
{NppMirrorI<CV_16U, nppiMirror_16u_C1IR_Ctx>::call, 0, NppMirrorI<CV_16U, nppiMirror_16u_C3IR_Ctx>::call, NppMirrorI<CV_16U, nppiMirror_16u_C4IR_Ctx>::call},
|
||||
{0,0,0,0},
|
||||
{NppMirrorI<CV_32S, nppiMirror_32s_C1IR_Ctx>::call, 0, NppMirrorI<CV_32S, nppiMirror_32s_C3IR_Ctx>::call, NppMirrorI<CV_32S, nppiMirror_32s_C4IR_Ctx>::call},
|
||||
{NppMirrorI<CV_32F, nppiMirror_32f_C1IR_Ctx>::call, 0, NppMirrorI<CV_32F, nppiMirror_32f_C3IR_Ctx>::call, NppMirrorI<CV_32F, nppiMirror_32f_C4IR_Ctx>::call}
|
||||
#else
|
||||
{ NppMirrorI<CV_8U, nppiMirror_8u_C1IR>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR>::call },
|
||||
{0,0,0,0},
|
||||
{NppMirrorI<CV_16U, nppiMirror_16u_C1IR>::call, 0, NppMirrorI<CV_16U, nppiMirror_16u_C3IR>::call, NppMirrorI<CV_16U, nppiMirror_16u_C4IR>::call},
|
||||
{0,0,0,0},
|
||||
{NppMirrorI<CV_32S, nppiMirror_32s_C1IR>::call, 0, NppMirrorI<CV_32S, nppiMirror_32s_C3IR>::call, NppMirrorI<CV_32S, nppiMirror_32s_C4IR>::call},
|
||||
{NppMirrorI<CV_32F, nppiMirror_32f_C1IR>::call, 0, NppMirrorI<CV_32F, nppiMirror_32f_C3IR>::call, NppMirrorI<CV_32F, nppiMirror_32f_C4IR>::call}
|
||||
#endif
|
||||
};
|
||||
|
||||
GpuMat src = getInputMat(_src, stream);
|
||||
|
@@ -92,7 +92,11 @@ namespace
|
||||
{
|
||||
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
|
||||
#else
|
||||
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
|
||||
#endif
|
||||
};
|
||||
|
||||
template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
|
||||
@@ -116,7 +120,11 @@ namespace
|
||||
cv::saturate_cast<npp_type>(value[3])
|
||||
};
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI, h));
|
||||
#else
|
||||
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
@@ -131,13 +139,39 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu
|
||||
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
|
||||
static const func_t funcs[3][6][4] =
|
||||
{
|
||||
#if USE_NPP_STREAM_CTX
|
||||
{
|
||||
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
|
||||
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
|
||||
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
|
||||
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R_Ctx>::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R_Ctx>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R_Ctx>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R_Ctx>::call}
|
||||
},
|
||||
{
|
||||
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
|
||||
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R_Ctx>::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R_Ctx>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R_Ctx>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R_Ctx>::call}
|
||||
},
|
||||
{
|
||||
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
|
||||
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R_Ctx>::call},
|
||||
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R_Ctx>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R_Ctx>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R_Ctx>::call}
|
||||
}
|
||||
#else
|
||||
{
|
||||
{ BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call, 0, NppBitwiseC<CV_8U, 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call },
|
||||
{ BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call },
|
||||
{ BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call },
|
||||
{ BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call },
|
||||
{ BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call },
|
||||
{ BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call }
|
||||
},
|
||||
{
|
||||
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
|
||||
@@ -155,6 +189,7 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu
|
||||
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
|
||||
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
const int depth = src.depth();
|
||||
|
@@ -116,8 +116,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
|
||||
sz.width = src.cols;
|
||||
sz.height = src.rows;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(nppiThreshold_32f_C1R_Ctx(src.ptr<Npp32f>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER, h));
|
||||
#else
|
||||
nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
|
||||
#endif
|
||||
|
||||
if (!stream)
|
||||
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
@@ -74,8 +74,13 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
|
||||
sz.width = src.cols;
|
||||
sz.height = src.rows;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(nppiTranspose_8u_C1R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, h));
|
||||
#else
|
||||
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
|
||||
#endif
|
||||
|
||||
if (!stream)
|
||||
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
@@ -342,13 +342,21 @@ namespace
|
||||
{
|
||||
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
|
||||
#else
|
||||
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
|
||||
#endif
|
||||
};
|
||||
template <int DEPTH> struct NppShiftFunc<DEPTH, 1>
|
||||
{
|
||||
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
|
||||
#else
|
||||
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
|
||||
#endif
|
||||
};
|
||||
|
||||
template <int DEPTH, int cn, typename NppShiftFunc<DEPTH, cn>::func_t func> struct NppShift
|
||||
@@ -363,7 +371,11 @@ namespace
|
||||
oSizeROI.width = src.cols;
|
||||
oSizeROI.height = src.rows;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(func(src.ptr<npp_type>(), static_cast<int>(src.step), sc.val, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI, h));
|
||||
#else
|
||||
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), sc.val, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
@@ -381,7 +393,11 @@ namespace
|
||||
oSizeROI.width = src.cols;
|
||||
oSizeROI.height = src.rows;
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(func(src.ptr<npp_type>(), static_cast<int>(src.step), sc.val[0], dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI, h));
|
||||
#else
|
||||
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), sc.val[0], dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
@@ -394,11 +410,20 @@ void cv::cuda::rshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Strea
|
||||
typedef void (*func_t)(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream);
|
||||
static const func_t funcs[5][4] =
|
||||
{
|
||||
#if USE_NPP_STREAM_CTX
|
||||
|
||||
{NppShift<CV_8U , 1, nppiRShiftC_8u_C1R_Ctx>::call, 0, NppShift<CV_8U , 3, nppiRShiftC_8u_C3R_Ctx>::call, NppShift<CV_8U , 4, nppiRShiftC_8u_C4R_Ctx>::call },
|
||||
{NppShift<CV_8S , 1, nppiRShiftC_8s_C1R_Ctx>::call, 0, NppShift<CV_8S , 3, nppiRShiftC_8s_C3R_Ctx>::call, NppShift<CV_8S , 4, nppiRShiftC_8s_C4R_Ctx>::call },
|
||||
{NppShift<CV_16U, 1, nppiRShiftC_16u_C1R_Ctx>::call, 0, NppShift<CV_16U, 3, nppiRShiftC_16u_C3R_Ctx>::call, NppShift<CV_16U, 4, nppiRShiftC_16u_C4R_Ctx>::call},
|
||||
{NppShift<CV_16S, 1, nppiRShiftC_16s_C1R_Ctx>::call, 0, NppShift<CV_16S, 3, nppiRShiftC_16s_C3R_Ctx>::call, NppShift<CV_16S, 4, nppiRShiftC_16s_C4R_Ctx>::call},
|
||||
{NppShift<CV_32S, 1, nppiRShiftC_32s_C1R_Ctx>::call, 0, NppShift<CV_32S, 3, nppiRShiftC_32s_C3R_Ctx>::call, NppShift<CV_32S, 4, nppiRShiftC_32s_C4R_Ctx>::call},
|
||||
#else
|
||||
{NppShift<CV_8U , 1, nppiRShiftC_8u_C1R >::call, 0, NppShift<CV_8U , 3, nppiRShiftC_8u_C3R >::call, NppShift<CV_8U , 4, nppiRShiftC_8u_C4R>::call },
|
||||
{NppShift<CV_8S , 1, nppiRShiftC_8s_C1R >::call, 0, NppShift<CV_8S , 3, nppiRShiftC_8s_C3R >::call, NppShift<CV_8S , 4, nppiRShiftC_8s_C4R>::call },
|
||||
{NppShift<CV_16U, 1, nppiRShiftC_16u_C1R>::call, 0, NppShift<CV_16U, 3, nppiRShiftC_16u_C3R>::call, NppShift<CV_16U, 4, nppiRShiftC_16u_C4R>::call},
|
||||
{NppShift<CV_16S, 1, nppiRShiftC_16s_C1R>::call, 0, NppShift<CV_16S, 3, nppiRShiftC_16s_C3R>::call, NppShift<CV_16S, 4, nppiRShiftC_16s_C4R>::call},
|
||||
{NppShift<CV_32S, 1, nppiRShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiRShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiRShiftC_32s_C4R>::call},
|
||||
#endif
|
||||
};
|
||||
|
||||
GpuMat src = getInputMat(_src, stream);
|
||||
@@ -418,11 +443,19 @@ void cv::cuda::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Strea
|
||||
typedef void (*func_t)(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream);
|
||||
static const func_t funcs[5][4] =
|
||||
{
|
||||
#if USE_NPP_STREAM_CTX
|
||||
{NppShift<CV_8U , 1, nppiLShiftC_8u_C1R_Ctx>::call , 0, NppShift<CV_8U , 3, nppiLShiftC_8u_C3R_Ctx>::call , NppShift<CV_8U , 4, nppiLShiftC_8u_C4R_Ctx>::call },
|
||||
{0 , 0, 0 , 0 },
|
||||
{NppShift<CV_16U, 1, nppiLShiftC_16u_C1R_Ctx>::call, 0, NppShift<CV_16U, 3, nppiLShiftC_16u_C3R_Ctx>::call, NppShift<CV_16U, 4, nppiLShiftC_16u_C4R_Ctx>::call},
|
||||
{0 , 0, 0 , 0 },
|
||||
{NppShift<CV_32S, 1, nppiLShiftC_32s_C1R_Ctx>::call, 0, NppShift<CV_32S, 3, nppiLShiftC_32s_C3R_Ctx>::call, NppShift<CV_32S, 4, nppiLShiftC_32s_C4R_Ctx>::call},
|
||||
#else
|
||||
{NppShift<CV_8U , 1, nppiLShiftC_8u_C1R>::call , 0, NppShift<CV_8U , 3, nppiLShiftC_8u_C3R>::call , NppShift<CV_8U , 4, nppiLShiftC_8u_C4R>::call },
|
||||
{0 , 0, 0 , 0 },
|
||||
{NppShift<CV_16U, 1, nppiLShiftC_16u_C1R>::call, 0, NppShift<CV_16U, 3, nppiLShiftC_16u_C3R>::call, NppShift<CV_16U, 4, nppiLShiftC_16u_C4R>::call},
|
||||
{0 , 0, 0 , 0 },
|
||||
{NppShift<CV_32S, 1, nppiLShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiLShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiLShiftC_32s_C4R>::call},
|
||||
#endif
|
||||
};
|
||||
|
||||
GpuMat src = getInputMat(_src, stream);
|
||||
@@ -468,7 +501,11 @@ void cv::cuda::max(InputArray src1, InputArray src2, OutputArray dst, Stream& st
|
||||
|
||||
namespace
|
||||
{
|
||||
#if USE_NPP_STREAM_CTX
|
||||
typedef NppStatus(*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
|
||||
#else
|
||||
typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
|
||||
#endif
|
||||
|
||||
void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
|
||||
{
|
||||
@@ -480,7 +517,11 @@ namespace
|
||||
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, h));
|
||||
#else
|
||||
nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
@@ -493,7 +534,11 @@ void cv::cuda::magnitude(InputArray _src, OutputArray _dst, Stream& stream)
|
||||
|
||||
GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R_Ctx, StreamAccessor::getStream(stream));
|
||||
#else
|
||||
npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
|
||||
#endif
|
||||
|
||||
syncOutput(dst, _dst, stream);
|
||||
}
|
||||
@@ -504,7 +549,11 @@ void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream)
|
||||
|
||||
GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R_Ctx, StreamAccessor::getStream(stream));
|
||||
#else
|
||||
npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
|
||||
#endif
|
||||
|
||||
syncOutput(dst, _dst, stream);
|
||||
}
|
||||
|
@@ -153,32 +153,44 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
|
||||
sz.width = gsrc.cols;
|
||||
sz.height = gsrc.rows;
|
||||
|
||||
#if (CUDA_VERSION >= 12040)
|
||||
#if (NPP_VERSION >= 12205)
|
||||
size_t bufSize;
|
||||
#else
|
||||
int bufSize;
|
||||
#endif
|
||||
|
||||
NppStreamHandler h(StreamAccessor::getStream(stream));
|
||||
|
||||
#if (CUDA_VERSION <= 4020)
|
||||
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
|
||||
#else
|
||||
#if USE_NPP_STREAM_CTX
|
||||
if (gsrc.type() == CV_8UC1)
|
||||
nppSafeCall(nppiMeanStdDevGetBufferHostSize_8u_C1R_Ctx(sz, &bufSize, h));
|
||||
else
|
||||
nppSafeCall(nppiMeanStdDevGetBufferHostSize_32f_C1R_Ctx(sz, &bufSize, h));
|
||||
#else
|
||||
if (gsrc.type() == CV_8UC1)
|
||||
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
|
||||
else
|
||||
nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1R(sz, &bufSize) );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
BufferPool pool(stream);
|
||||
CV_Assert(bufSize <= std::numeric_limits<int>::max());
|
||||
GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), gsrc.type());
|
||||
|
||||
// detail: https://github.com/opencv/opencv/issues/11063
|
||||
//NppStreamHandler h(StreamAccessor::getStream(stream));
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
if (gsrc.type() == CV_8UC1)
|
||||
nppSafeCall(nppiMean_StdDev_8u_C1R_Ctx(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1, h));
|
||||
else
|
||||
nppSafeCall(nppiMean_StdDev_32f_C1R_Ctx(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1, h));
|
||||
#else
|
||||
if(gsrc.type() == CV_8UC1)
|
||||
nppSafeCall( nppiMean_StdDev_8u_C1R(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
|
||||
else
|
||||
nppSafeCall( nppiMean_StdDev_32f_C1R(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
|
||||
#endif
|
||||
|
||||
syncOutput(gdst, dst, stream);
|
||||
}
|
||||
@@ -235,31 +247,49 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre
|
||||
sz.width = gsrc.cols;
|
||||
sz.height = gsrc.rows;
|
||||
|
||||
#if (CUDA_VERSION >= 12040)
|
||||
#if (NPP_VERSION >= 12205)
|
||||
size_t bufSize;
|
||||
#else
|
||||
int bufSize;
|
||||
#endif
|
||||
|
||||
NppStreamHandler h(StreamAccessor::getStream(stream));
|
||||
|
||||
#if (CUDA_VERSION <= 4020)
|
||||
nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) );
|
||||
#else
|
||||
#if USE_NPP_STREAM_CTX
|
||||
if (gsrc.type() == CV_8UC1)
|
||||
nppSafeCall(nppiMeanStdDevGetBufferHostSize_8u_C1MR_Ctx(sz, &bufSize, h));
|
||||
else
|
||||
nppSafeCall(nppiMeanStdDevGetBufferHostSize_32f_C1MR_Ctx(sz, &bufSize, h));
|
||||
#else
|
||||
if (gsrc.type() == CV_8UC1)
|
||||
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1MR(sz, &bufSize) );
|
||||
else
|
||||
nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1MR(sz, &bufSize) );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
BufferPool pool(stream);
|
||||
CV_Assert(bufSize <= std::numeric_limits<int>::max());
|
||||
GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), gsrc.type());
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
if (gsrc.type() == CV_8UC1)
|
||||
nppSafeCall(nppiMean_StdDev_8u_C1MR_Ctx(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
|
||||
sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1, h));
|
||||
else
|
||||
nppSafeCall(nppiMean_StdDev_32f_C1MR_Ctx(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
|
||||
sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1, h));
|
||||
#else
|
||||
if(gsrc.type() == CV_8UC1)
|
||||
nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
|
||||
sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
|
||||
else
|
||||
nppSafeCall( nppiMean_StdDev_32f_C1MR(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
|
||||
sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
|
||||
#endif
|
||||
|
||||
syncOutput(gdst, dst, stream);
|
||||
}
|
||||
@@ -290,8 +320,13 @@ void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Re
|
||||
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
#if USE_NPP_STREAM_CTX
|
||||
nppSafeCall(nppiRectStdDev_32s32f_C1R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
|
||||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect, h));
|
||||
#else
|
||||
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
|
||||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
|
||||
#endif
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
Reference in New Issue
Block a user