From c49d0b906420dd8da69ffd0bd11e6db72a9eb549 Mon Sep 17 00:00:00 2001
From: unknown <vchill@DESKTOP-IFCIFN3.nvidia.com>
Date: Thu, 7 May 2020 20:49:26 +0530
Subject: [PATCH] Modified Stream support functionality.

---
 .../include/opencv2/cudaoptflow.hpp           | 14 ++-
 modules/cudaoptflow/samples/optical_flow.cpp  |  4 +-
 modules/cudaoptflow/src/nvidiaOpticalFlow.cpp | 86 ++++++++++++-------
 3 files changed, 69 insertions(+), 35 deletions(-)

diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
index b434e9dfb..9fde39252 100644
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -118,7 +118,9 @@ public:
     @param inputImage Input image.
     @param referenceImage Reference image of the same size and the same type as input image.
     @param flow A buffer consisting of inputImage.Size() / getGridSize() flow vectors in CV_16SC2 format.
-    @param stream Stream for the asynchronous version.
+    @param stream It is highly recommended that CUDA streams for pre and post processing of optical flow vectors should be set once per session in create() function as a part of optical flow session creation.
+                  This parameter is left here for backward compatibility and may be removed in the future.
+                  Default value is NULL stream;
     @param hint Hint buffer if client provides external hints. Must have same size as flow buffer.
                 Caller can provide flow vectors as hints for optical flow calculation.
     @param cost Cost buffer contains numbers indicating the confidence associated with each of the generated flow vectors.
@@ -435,6 +437,12 @@ public:
     @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false.
     @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false.
     @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
+    @param inputStream Optical flow algorithm may optionally involve cuda preprocessing on the input buffers.
+                       The input cuda stream can be used to pipeline and synchronize the cuda preprocessing tasks with OF HW engine.
+                       If input stream is not set, the execute function will use default stream which is NULL stream;
+    @param outputStream Optical flow algorithm may optionally involve cuda post processing on the output flow vectors.
+                        The output cuda stream can be used to pipeline and synchronize the cuda post processing tasks with OF HW engine.
+                        If output stream is not set, the execute function will use default stream which is NULL stream;
     */
     CV_WRAP static Ptr<NvidiaOpticalFlow_1_0> create(
         int width,
@@ -444,7 +452,9 @@ public:
         bool enableTemporalHints = false,
         bool enableExternalHints = false,
         bool enableCostBuffer = false,
-        int gpuId = 0);
+        int gpuId = 0,
+        Stream& inputStream = Stream::Null(),
+        Stream& outputStream = Stream::Null());
 };
 
 //! @}
diff --git a/modules/cudaoptflow/samples/optical_flow.cpp b/modules/cudaoptflow/samples/optical_flow.cpp
index 289883b23..2bbea63ba 100644
--- a/modules/cudaoptflow/samples/optical_flow.cpp
+++ b/modules/cudaoptflow/samples/optical_flow.cpp
@@ -181,8 +181,8 @@ int main(int argc, const char* argv[])
     Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
     Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
     Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
-    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(
-        frame0.size().width, frame0.size().height, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST);
+    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(frame0.size().width, frame0.size().height,
+        NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, false, false, false, 0, Stream(), Stream());
 
     {
         GpuMat d_frame0f;
diff --git a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
index 03f4b86ca..b5c760da2 100644
--- a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
+++ b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
@@ -8,11 +8,11 @@
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int) { throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0>(); }
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&) { throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0>(); }
 
 #elif !defined HAVE_NVIDIA_OPTFLOW
 
-cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int)
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&)
 {
     CV_Error(cv::Error::HeaderIsNull, "OpenCV was build without NVIDIA OpticalFlow support");
 }
@@ -236,6 +236,8 @@ private:
     bool m_enableExternalHints;
     bool m_enableCostBuffer;
     int m_gpuId;
+    Stream m_inputStream;
+    Stream m_outputStream;
 
     CUcontext m_cuContext;
     NV_OF_BUFFER_FORMAT m_format;
@@ -286,13 +288,14 @@ protected:
     std::mutex m_lock;
 
 public:
-    NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset,
-        bool bEnableTemporalHints, bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId);
+    NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
+        bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, Stream inputStream, Stream outputStream);
 
     virtual void calc(InputArray inputImage, InputArray referenceImage,
         InputOutputArray flow, Stream& stream = Stream::Null(),
         InputArray hint = cv::noArray(), OutputArray cost = cv::noArray());
 
+
     virtual void collectGarbage();
 
     virtual void upSampler(InputArray flow, int width, int height,
@@ -303,11 +306,13 @@ public:
 
 NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
     int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
-    bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId) :
+    bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId,
+    Stream inputStream, Stream outputStream) :
     m_width(width), m_height(height), m_preset(perfPreset),
     m_enableTemporalHints((NV_OF_BOOL)bEnableTemporalHints),
     m_enableExternalHints((NV_OF_BOOL)bEnableExternalHints),
     m_enableCostBuffer((NV_OF_BOOL)bEnableCostBuffer), m_gpuId(gpuId),
+    m_inputStream(inputStream), m_outputStream(outputStream),
     m_cuContext(nullptr), m_format(NV_OF_BUFFER_FORMAT_GRAYSCALE8),
     m_gridSize(NV_OF_OUTPUT_VECTOR_GRID_SIZE_4)
 {
@@ -387,6 +392,12 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
 
     NVOF_API_CALL(GetAPI()->nvOFInit(GetHandle(), &m_initParams));
 
+    if (m_inputStream || m_outputStream)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
+            StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream)));
+    }
+
     //Input Buffer 1
     NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
         &m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hInputBuffer));
@@ -432,13 +443,12 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
 void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow,
     Stream& stream, InputArray hint, OutputArray cost)
 {
-    Stream inputStream = {};
-    Stream outputStream = {};
-    if (stream)
-        inputStream = stream;
-
-    NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
-        StreamAccessor::getStream(inputStream), StreamAccessor::getStream(outputStream)));
+    if (stream && !m_inputStream)
+    {
+        m_inputStream = stream;
+        NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
+            StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream)));
+    }
 
     GpuMat frame0GpuMat(_frame0.size(), _frame0.type(), (void*)m_frame0cuDevPtr,
         m_inputBufferStrideInfo.strideInfo[0].strideXInBytes);
@@ -452,12 +462,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
     if (_frame0.isMat())
     {
         //Get Mats from InputArrays
-        frame0GpuMat.upload(_frame0);
+        Mat __frame0 = _frame0.getMat();
+        frame0GpuMat.upload(__frame0, m_inputStream);
     }
     else if (_frame0.isGpuMat())
     {
         //Get GpuMats from InputArrays
-        _frame0.copyTo(frame0GpuMat);
+        GpuMat __frame0 = _frame0.getGpuMat();
+        __frame0.copyTo(frame0GpuMat, m_inputStream);
     }
     else
     {
@@ -469,12 +481,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
     if (_frame1.isMat())
     {
         //Get Mats from InputArrays
-        frame1GpuMat.upload(_frame1);
+        Mat __frame1 = _frame1.getMat();
+        frame1GpuMat.upload(__frame1, m_inputStream);
     }
     else if (_frame1.isGpuMat())
     {
         //Get GpuMats from InputArrays
-        _frame1.copyTo(frame1GpuMat);
+        GpuMat __frame1 = _frame1.getGpuMat();
+        __frame1.copyTo(frame1GpuMat, m_inputStream);
     }
     else
     {
@@ -490,12 +504,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
         if (hint.isMat())
         {
             //Get Mat from InputArray hint
-            hintGpuMat.upload(hint);
+            Mat _hint = hint.getMat();
+            hintGpuMat.upload(_hint, m_inputStream);
         }
         else if(hint.isGpuMat())
         {
             //Get GpuMat from InputArray hint
-            hint.copyTo(hintGpuMat);
+            GpuMat _hint = hint.getGpuMat();
+            _hint.copyTo(hintGpuMat, m_inputStream);
         }
         else
         {
@@ -503,8 +519,6 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
         }
     }
 
-    inputStream.waitForCompletion();
-
     //Execute Call
     NV_OF_EXECUTE_INPUT_PARAMS exeInParams;
     NV_OF_EXECUTE_OUTPUT_PARAMS exeOutParams;
@@ -518,15 +532,13 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
     memset(&exeOutParams, 0, sizeof(exeOutParams));
     exeOutParams.outputBuffer = m_hOutputBuffer;
     exeOutParams.outputCostBuffer = m_initParams.enableOutputCost == NV_OF_TRUE ?
-        m_hCostBuffer : nullptr;;
+        m_hCostBuffer : nullptr;
     NVOF_API_CALL(GetAPI()->nvOFExecute(GetHandle(), &exeInParams, &exeOutParams));
 
-    outputStream.waitForCompletion();
-
     if (_flow.isMat())
-        flowXYGpuMat.download(_flow);
+        flowXYGpuMat.download(_flow, m_outputStream);
     else if(_flow.isGpuMat())
-        flowXYGpuMat.copyTo(_flow);
+        flowXYGpuMat.copyTo(_flow, m_outputStream);
     else
         CV_Error(Error::StsBadArg, "Incorrect flow buffer passed. Pass Mat or GpuMat");
 
@@ -537,13 +549,13 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
             m_costBufferStrideInfo.strideInfo[0].strideXInBytes);
 
         if (cost.isMat())
-            costGpuMat.download(cost);
+            costGpuMat.download(cost, m_outputStream);
         else if(cost.isGpuMat())
-            costGpuMat.copyTo(cost);
+            costGpuMat.copyTo(cost, m_outputStream);
         else
             CV_Error(Error::StsBadArg, "Incorrect cost buffer passed. Pass Mat or GpuMat");
     }
-    cuSafeCall(cudaDeviceSynchronize());
+    m_outputStream.waitForCompletion();
 }
 
 void NvidiaOpticalFlowImpl::collectGarbage()
@@ -574,6 +586,14 @@ void NvidiaOpticalFlowImpl::collectGarbage()
             NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hCostBuffer));
         }
     }
+    if (m_inputStream)
+    {
+        m_inputStream.waitForCompletion();
+    }
+    if (m_outputStream)
+    {
+        m_outputStream.waitForCompletion();
+    }
     if (m_hOF)
     {
         NVOF_API_CALL(GetAPI()->nvOFDestroy(m_hOF));
@@ -586,7 +606,8 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
     Mat flow;
     if (_flow.isMat())
     {
-        _flow.copyTo(flow);
+        Mat __flow = _flow.getMat();
+        __flow.copyTo(flow);
     }
     else if (_flow.isGpuMat())
     {
@@ -640,7 +661,8 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
 Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
     int width, int height, NVIDIA_OF_PERF_LEVEL perfPreset,
     bool bEnableTemporalHints, bool bEnableExternalHints,
-    bool bEnableCostBuffer, int gpuId)
+    bool bEnableCostBuffer, int gpuId,
+    Stream& inputStream, Stream& outputStream)
 {
     return makePtr<NvidiaOpticalFlowImpl>(
         width,
@@ -649,6 +671,8 @@ Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
         bEnableTemporalHints,
         bEnableExternalHints,
         bEnableCostBuffer,
-        gpuId);
+        gpuId,
+        inputStream,
+        outputStream);
 }
 #endif