From df101fd7e3612f3efd314b659265985e74e8b256 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Sat, 5 Mar 2022 10:11:47 +0300
Subject: [PATCH] Added 32FC1 type support and mask to cuda::meanStdDev implementation.

cuda::meanStdDev accepted CV_8UC1 input only and had no mask support.

* Accept CV_32FC1 sources in addition to CV_8UC1 when CUDA_VERSION > 4020
  by dispatching to the matching NPP *_32f_* entry points.
* Add masked overloads for both flavours (OutputArray + Stream, and the
  blocking Scalar& one). The mask has to be CV_8UC1 and of the same size
  as the source; this is asserted because NPP walks the mask over the
  same ROI as the source.
* Keep the NPP scratch buffer typed CV_8UC1: the size NPP reports was
  already allocated as a plain byte buffer before this change, so typing
  it after the source would over-allocate for CV_32FC1.
* Parameterize the MeanStdDev tests over the matrix depth and add
  MaskedAccuracy / MaskedAsync cases.

---
 .../cudaarithm/include/opencv2/cudaarithm.hpp | 30 ++++--
 modules/cudaarithm/src/reductions.cpp         | 92 ++++++++++++++++---
 modules/cudaarithm/test/test_reductions.cpp   | 68 +++++++++++++-
 3 files changed, 169 insertions(+), 21 deletions(-)

diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index 60ec3ca49..10338051e 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -685,21 +685,39 @@ CV_EXPORTS_W void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp,
 
 /** @brief Computes a mean value and a standard deviation of matrix elements.
 
-@param mtx Source matrix. CV_8UC1 matrices are supported for now.
-@param mean Mean value.
-@param stddev Standard deviation value.
+@param src Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param dst Target GpuMat with size 1x2 and type CV_64FC1. The first value is mean, the second - stddev.
+@param mask Operation mask. It must be a CV_8UC1 matrix of the same size as src.
+@param stream Stream for the asynchronous version.
 
 @sa meanStdDev
  */
-CV_EXPORTS_W void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev);
-/** @overload */
+CV_EXPORTS_W void meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stream& stream = Stream::Null());
+/** @overload
+@param mtx Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param dst Target GpuMat with size 1x2 and type CV_64FC1. The first value is mean, the second - stddev.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS_W void meanStdDev(InputArray mtx, OutputArray dst, Stream& stream = Stream::Null());
+/** @overload
+@param src Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param mean Mean value.
+@param stddev Standard deviation value.
+@param mask Operation mask. It must be a CV_8UC1 matrix of the same size as src.
+ */
+CV_EXPORTS_W void meanStdDev(InputArray src, CV_OUT Scalar& mean, CV_OUT Scalar& stddev, InputArray mask);
+/** @overload
+@param mtx Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param mean Mean value.
+@param stddev Standard deviation value.
+ */
+CV_EXPORTS_W void meanStdDev(InputArray mtx, CV_OUT Scalar& mean, CV_OUT Scalar& stddev);
 
 /** @brief Computes a standard deviation of integral images.
 
 @param src Source image. Only the CV_32SC1 type is supported.
 @param sqr Squared source image. Only the CV_32FC1 type is supported.
-@param dst Destination image with the same type and size as src .
+@param dst Destination image with the same type and size as src.
 @param rect Rectangular window.
 @param stream Stream for the asynchronous version.
  */
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index 4824a5c4d..cfadad648 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -132,45 +132,55 @@ double cv::cuda::norm(InputArray _src, int normType, InputArray _mask)
 ////////////////////////////////////////////////////////////////////////
 // meanStdDev
 
-void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
+void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
 {
     if (!deviceSupports(FEATURE_SET_COMPUTE_13))
         CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
 
-    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat gsrc = getInputMat(src, stream);
 
-    CV_Assert( src.type() == CV_8UC1 );
+#if (CUDA_VERSION <= 4020)
+    CV_Assert( gsrc.type() == CV_8UC1 );
+#else
+    CV_Assert( (gsrc.type() == CV_8UC1) || (gsrc.type() == CV_32FC1) );
+#endif
 
-    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
+    GpuMat gdst = getOutputMat(dst, 1, 2, CV_64FC1, stream);
 
     NppiSize sz;
-    sz.width  = src.cols;
-    sz.height = src.rows;
+    sz.width  = gsrc.cols;
+    sz.height = gsrc.rows;
 
     int bufSize;
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
 #else
-    nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
+    else
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1R(sz, &bufSize) );
 #endif
 
     BufferPool pool(stream);
     GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1);
 
     // detail: https://github.com/opencv/opencv/issues/11063
     //NppStreamHandler h(StreamAccessor::getStream(stream));
 
-    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );
+    if(gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMean_StdDev_8u_C1R(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
+    else
+        nppSafeCall( nppiMean_StdDev_32f_C1R(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
 
-    syncOutput(dst, _dst, stream);
+    syncOutput(gdst, dst, stream);
 }
 
-void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev)
+void cv::cuda::meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
 {
     Stream& stream = Stream::Null();
 
     HostMem dst;
-    meanStdDev(_src, dst, stream);
+    meanStdDev(src, dst, stream);
 
     stream.waitForCompletion();
 
@@ -181,6 +191,66 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev)
     stddev = Scalar(vals[1]);
 }
 
+void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, InputArray _mask)
+{
+    Stream& stream = Stream::Null();
+
+    HostMem dst;
+    meanStdDev(_src, dst, _mask, stream);
+
+    stream.waitForCompletion();
+
+    double vals[2];
+    dst.createMatHeader().copyTo(Mat(1, 2, CV_64FC1, &vals[0]));
+
+    mean = Scalar(vals[0]);
+    stddev = Scalar(vals[1]);
+}
+
+void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stream& stream)
+{
+    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
+        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
+
+    const GpuMat gsrc = getInputMat(src, stream);
+    const GpuMat gmask = getInputMat(mask, stream);
+
+#if (CUDA_VERSION <= 4020)
+    CV_Assert( gsrc.type() == CV_8UC1 );
+#else
+    CV_Assert( (gsrc.type() == CV_8UC1) || (gsrc.type() == CV_32FC1) );
+#endif
+    CV_Assert( (gmask.type() == CV_8UC1) && (gmask.size() == gsrc.size()) ); // NPP walks the mask over the same ROI as src
+
+    GpuMat gdst = getOutputMat(dst, 1, 2, CV_64FC1, stream);
+
+    NppiSize sz;
+    sz.width  = gsrc.cols;
+    sz.height = gsrc.rows;
+
+    int bufSize;
+#if (CUDA_VERSION <= 4020)
+    nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) );
+#else
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1MR(sz, &bufSize) );
+    else
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1MR(sz, &bufSize) );
+#endif
+
+    BufferPool pool(stream);
+    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); // scratch buffer is a plain byte buffer, as in the unmasked overload
+
+    if(gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
+                                             sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
+    else
+        nppSafeCall( nppiMean_StdDev_32f_C1MR(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
+                                              sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
+
+    syncOutput(gdst, dst, stream);
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // rectStdDev
 
diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp
index b868280f9..5ec627452 100644
--- a/modules/cudaarithm/test/test_reductions.cpp
+++ b/modules/cudaarithm/test/test_reductions.cpp
@@ -967,17 +967,19 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Normalize, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // MeanStdDev
 
-PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi, MatDepth)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
     bool useRoi;
+    int MatDepth;
 
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
         size = GET_PARAM(1);
         useRoi = GET_PARAM(2);
+        MatDepth = GET_PARAM(3);
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -985,7 +987,7 @@ PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi)
 
 CUDA_TEST_P(MeanStdDev, Accuracy)
 {
-    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat src = randomMat(size, MatDepth);
 
     if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_13))
     {
@@ -1015,9 +1017,42 @@ CUDA_TEST_P(MeanStdDev, Accuracy)
     }
 }
 
+CUDA_TEST_P(MeanStdDev, MaskedAccuracy)
+{
+    cv::Mat src = randomMat(size, MatDepth);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
+
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_13))
+    {
+        try
+        {
+            cv::Scalar mean;
+            cv::Scalar stddev;
+            cv::cuda::meanStdDev(loadMat(src, useRoi), mean, stddev, loadMat(mask));
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        cv::Scalar mean;
+        cv::Scalar stddev;
+        cv::cuda::meanStdDev(loadMat(src, useRoi), mean, stddev, loadMat(mask));
+
+        cv::Scalar mean_gold;
+        cv::Scalar stddev_gold;
+        cv::meanStdDev(src, mean_gold, stddev_gold, mask);
+
+        EXPECT_SCALAR_NEAR(mean_gold, mean, 1e-5);
+        EXPECT_SCALAR_NEAR(stddev_gold, stddev, 1e-5);
+    }
+}
+
 CUDA_TEST_P(MeanStdDev, Async)
 {
-    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat src = randomMat(size, MatDepth);
 
     cv::cuda::Stream stream;
 
@@ -1037,10 +1072,35 @@ CUDA_TEST_P(MeanStdDev, Async)
     EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5);
 }
 
+CUDA_TEST_P(MeanStdDev, MaskedAsync)
+{
+    cv::Mat src = randomMat(size, MatDepth);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
+
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::meanStdDev(loadMat(src, useRoi), dst, loadMat(mask), stream);
+
+    stream.waitForCompletion();
+
+    double vals[2];
+    dst.createMatHeader().copyTo(cv::Mat(1, 2, CV_64FC1, &vals[0]));
+
+    cv::Scalar mean_gold;
+    cv::Scalar stddev_gold;
+    cv::meanStdDev(src, mean_gold, stddev_gold, mask);
+
+    EXPECT_SCALAR_NEAR(mean_gold, cv::Scalar(vals[0]), 1e-5);
+    EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
+    WHOLE_SUBMAT,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F))
+));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Integral