From df101fd7e3612f3efd314b659265985e74e8b256 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Sat, 5 Mar 2022 10:11:47 +0300
Subject: [PATCH] Added 32FC1 type support and mask to cuda::meanStdDev
 implementation.

---
 .../cudaarithm/include/opencv2/cudaarithm.hpp | 30 ++++--
 modules/cudaarithm/src/reductions.cpp         | 93 ++++++++++++++++---
 modules/cudaarithm/test/test_reductions.cpp   | 68 +++++++++++++-
 3 files changed, 169 insertions(+), 22 deletions(-)

diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index 60ec3ca49..10338051e 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -685,21 +685,39 @@ CV_EXPORTS_W void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp,
 
 /** @brief Computes a mean value and a standard deviation of matrix elements.
 
-@param mtx Source matrix. CV_8UC1 matrices are supported for now.
-@param mean Mean value.
-@param stddev Standard deviation value.
+@param src Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param dst Target GpuMat with size 1x2 and type CV_64FC1. The first value is mean, the second - stddev.
+@param mask Operation mask: a CV_8UC1 matrix of the same size as src.
+@param stream Stream for the asynchronous version.
 
 @sa meanStdDev
  */
-CV_EXPORTS_W void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev);
-/** @overload */
+CV_EXPORTS_W void meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stream& stream = Stream::Null());
+/** @overload
+@param mtx Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param dst Target GpuMat with size 1x2 and type CV_64FC1. The first value is mean, the second - stddev.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS_W void meanStdDev(InputArray mtx, OutputArray dst, Stream& stream = Stream::Null());
+/** @overload
+@param src Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param mean Mean value.
+@param stddev Standard deviation value.
+@param mask Operation mask: a CV_8UC1 matrix of the same size as src.
+ */
+CV_EXPORTS_W void meanStdDev(InputArray src, CV_OUT Scalar& mean, CV_OUT Scalar& stddev, InputArray mask);
+/** @overload
+@param mtx Source matrix. CV_8UC1 and CV_32FC1 matrices are supported for now.
+@param mean Mean value.
+@param stddev Standard deviation value.
+ */
+CV_EXPORTS_W void meanStdDev(InputArray mtx, CV_OUT Scalar& mean, CV_OUT Scalar& stddev);
 
 /** @brief Computes a standard deviation of integral images.
 
 @param src Source image. Only the CV_32SC1 type is supported.
 @param sqr Squared source image. Only the CV_32FC1 type is supported.
-@param dst Destination image with the same type and size as src .
+@param dst Destination image with the same type and size as src.
 @param rect Rectangular window.
 @param stream Stream for the asynchronous version.
  */
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index 4824a5c4d..cfadad648 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -132,45 +132,55 @@ double cv::cuda::norm(InputArray _src, int normType, InputArray _mask)
 ////////////////////////////////////////////////////////////////////////
 // meanStdDev
 
-void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
+void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
 {
     if (!deviceSupports(FEATURE_SET_COMPUTE_13))
         CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
 
-    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat gsrc = getInputMat(src, stream);
 
-    CV_Assert( src.type() == CV_8UC1 );
+#if (CUDA_VERSION <= 4020)
+    CV_Assert( gsrc.type() == CV_8UC1 );
+#else
+    CV_Assert( (gsrc.type() == CV_8UC1) || (gsrc.type() == CV_32FC1) ); // CV_32FC1 is only accepted on this newer-CUDA path
+#endif
 
-    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
+    GpuMat gdst = getOutputMat(dst, 1, 2, CV_64FC1, stream); // 1x2 CV_64FC1: element 0 = mean, element 1 = stddev
 
     NppiSize sz;
-    sz.width  = src.cols;
-    sz.height = src.rows;
+    sz.width  = gsrc.cols;
+    sz.height = gsrc.rows;
 
     int bufSize;
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
 #else
-    nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
+    else
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1R(sz, &bufSize) );
 #endif
 
     BufferPool pool(stream);
-    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1);
+    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); // scratch space is handed to NPP as raw Npp8u bytes, so it must not scale with the source element size
 
     // detail: https://github.com/opencv/opencv/issues/11063
     //NppStreamHandler h(StreamAccessor::getStream(stream));
 
-    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );
+    if(gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMean_StdDev_8u_C1R(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
+    else
+        nppSafeCall( nppiMean_StdDev_32f_C1R(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
 
-    syncOutput(dst, _dst, stream);
+    syncOutput(gdst, dst, stream);
 }
 
-void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev)
+void cv::cuda::meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
 {
     Stream& stream = Stream::Null();
 
     HostMem dst;
-    meanStdDev(_src, dst, stream);
+    meanStdDev(src, dst, stream);
 
     stream.waitForCompletion();
 
@@ -181,6 +191,65 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev)
     stddev = Scalar(vals[1]);
 }
 
+void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, InputArray _mask)
+{
+    Stream& stream = Stream::Null(); // blocking overload: always runs on the Stream::Null() stream
+
+    HostMem dst; // receives the two values produced by the masked stream overload
+    meanStdDev(_src, dst, _mask, stream);
+
+    stream.waitForCompletion(); // the result is only read after the stream has finished
+
+    double vals[2]; // vals[0] = mean, vals[1] = stddev
+    dst.createMatHeader().copyTo(Mat(1, 2, CV_64FC1, &vals[0])); // copy into vals through a 1x2 CV_64FC1 header
+
+    mean = Scalar(vals[0]);
+    stddev = Scalar(vals[1]);
+}
+
+void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stream& stream)
+{
+    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
+        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
+
+    const GpuMat gsrc = getInputMat(src, stream);
+    const GpuMat gmask = getInputMat(mask, stream);
+
+#if (CUDA_VERSION <= 4020)
+    CV_Assert( gsrc.type() == CV_8UC1 );
+#else
+    CV_Assert( (gsrc.type() == CV_8UC1) || (gsrc.type() == CV_32FC1) ); // CV_32FC1 is only accepted on this newer-CUDA path
+#endif
+    // The mask is read below as Npp8u over the same ROI as the source; reject anything else up front.
+    CV_Assert( gmask.type() == CV_8UC1 && gmask.size() == gsrc.size() );
+
+    GpuMat gdst = getOutputMat(dst, 1, 2, CV_64FC1, stream); // 1x2 CV_64FC1: element 0 = mean, element 1 = stddev
+
+    const NppiSize sz = { gsrc.cols, gsrc.rows }; // { width, height }
+
+    int bufSize;
+#if (CUDA_VERSION <= 4020)
+    nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) );
+#else
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1MR(sz, &bufSize) );
+    else
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1MR(sz, &bufSize) );
+#endif
+
+    BufferPool pool(stream);
+    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); // scratch space is handed to NPP as raw Npp8u bytes, so it must not scale with the source element size
+
+    if(gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
+                                             sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
+    else
+        nppSafeCall( nppiMean_StdDev_32f_C1MR(gsrc.ptr<Npp32f>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
+                                              sz, buf.ptr<Npp8u>(), gdst.ptr<Npp64f>(), gdst.ptr<Npp64f>() + 1) );
+
+    syncOutput(gdst, dst, stream);
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // rectStdDev
 
diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp
index b868280f9..5ec627452 100644
--- a/modules/cudaarithm/test/test_reductions.cpp
+++ b/modules/cudaarithm/test/test_reductions.cpp
@@ -967,17 +967,19 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Normalize, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // MeanStdDev
 
-PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi, MatDepth)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
     bool useRoi;
+    int MatDepth;
 
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
         size = GET_PARAM(1);
         useRoi = GET_PARAM(2);
+        MatDepth = GET_PARAM(3);
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -985,7 +987,7 @@ PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi)
 
 CUDA_TEST_P(MeanStdDev, Accuracy)
 {
-    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat src = randomMat(size, MatDepth);
 
     if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_13))
     {
@@ -1015,9 +1017,42 @@ CUDA_TEST_P(MeanStdDev, Accuracy)
     }
 }
 
+CUDA_TEST_P(MeanStdDev, MaskedAccuracy)
+{
+    cv::Mat src = randomMat(size, MatDepth);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); // random CV_8UC1 mask, same size as src
+
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_13))
+    {
+        try
+        {
+            cv::Scalar mean;
+            cv::Scalar stddev;
+            cv::cuda::meanStdDev(loadMat(src, useRoi), mean, stddev, loadMat(mask)); // exercise the masked overload on unsupported devices too
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        cv::Scalar mean;
+        cv::Scalar stddev;
+        cv::cuda::meanStdDev(loadMat(src, useRoi), mean, stddev, loadMat(mask));
+
+        cv::Scalar mean_gold;
+        cv::Scalar stddev_gold;
+        cv::meanStdDev(src, mean_gold, stddev_gold, mask); // CPU reference computed with the same mask
+
+        EXPECT_SCALAR_NEAR(mean_gold, mean, 1e-5);
+        EXPECT_SCALAR_NEAR(stddev_gold, stddev, 1e-5);
+    }
+}
+
 CUDA_TEST_P(MeanStdDev, Async)
 {
-    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat src = randomMat(size, MatDepth);
 
     cv::cuda::Stream stream;
 
@@ -1037,10 +1072,35 @@ CUDA_TEST_P(MeanStdDev, Async)
     EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5);
 }
 
+CUDA_TEST_P(MeanStdDev, MaskedAsync)
+{
+    cv::Mat src = randomMat(size, MatDepth);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); // random CV_8UC1 mask, same size as src
+
+    cv::cuda::Stream stream; // user-created stream, passed explicitly to the masked overload
+
+    cv::cuda::HostMem dst;
+    cv::cuda::meanStdDev(loadMat(src, useRoi), dst, loadMat(mask), stream);
+
+    stream.waitForCompletion(); // dst is only read after the stream has finished
+
+    double vals[2]; // vals[0] = mean, vals[1] = stddev
+    dst.createMatHeader().copyTo(cv::Mat(1, 2, CV_64FC1, &vals[0]));
+
+    cv::Scalar mean_gold;
+    cv::Scalar stddev_gold;
+    cv::meanStdDev(src, mean_gold, stddev_gold, mask); // CPU reference computed with the same mask
+
+    EXPECT_SCALAR_NEAR(mean_gold, cv::Scalar(vals[0]), 1e-5);
+    EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
+    WHOLE_SUBMAT,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)) // every MeanStdDev case runs once per listed source depth
+));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Integral