Modified Stream support functionality.

2025-10-16 13:57:05 +08:00 · 2020-05-07 20:49:26 +05:30
parent dfb6d9bf71
commit c49d0b9064
3 changed files with 69 additions and 35 deletions
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -118,7 +118,9 @@ public:
    @param inputImage Input image.
    @param referenceImage Reference image of the same size and the same type as input image.
    @param flow A buffer consisting of inputImage.Size() / getGridSize() flow vectors in CV_16SC2 format.
-    @param stream Stream for the asynchronous version.
+    @param stream It is highly recommended that CUDA streams for pre and post processing of optical flow vectors should be set once per session in create() function as a part of optical flow session creation.
+                  This parameter is left here for backward compatibility and may be removed in the future.
+                  Default value is NULL stream;
    @param hint Hint buffer if client provides external hints. Must have same size as flow buffer.
                Caller can provide flow vectors as hints for optical flow calculation.
    @param cost Cost buffer contains numbers indicating the confidence associated with each of the generated flow vectors.
@@ -435,6 +437,12 @@ public:
    @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false.
    @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false.
    @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
+    @param inputStream Optical flow algorithm may optionally involve cuda preprocessing on the input buffers.
+                       The input cuda stream can be used to pipeline and synchronize the cuda preprocessing tasks with OF HW engine.
+                       If input stream is not set, the execute function will use default stream which is NULL stream;
+    @param outputStream Optical flow algorithm may optionally involve cuda post processing on the output flow vectors.
+                        The output cuda stream can be used to pipeline and synchronize the cuda post processing tasks with OF HW engine.
+                        If output stream is not set, the execute function will use default stream which is NULL stream;
    */
    CV_WRAP static Ptr<NvidiaOpticalFlow_1_0> create(
        int width,
@@ -444,7 +452,9 @@ public:
        bool enableTemporalHints = false,
        bool enableExternalHints = false,
        bool enableCostBuffer = false,
-        int gpuId = 0);
+        int gpuId = 0,
+        Stream& inputStream = Stream::Null(),
+        Stream& outputStream = Stream::Null());
 };

 //! @}
--- a/modules/cudaoptflow/samples/optical_flow.cpp
+++ b/modules/cudaoptflow/samples/optical_flow.cpp
@@ -181,8 +181,8 @@ int main(int argc, const char* argv[])
    Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
    Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
    Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
-    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(
-        frame0.size().width, frame0.size().height, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST);
+    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(frame0.size().width, frame0.size().height,
+        NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, false, false, false, 0, Stream(), Stream());

    {
        GpuMat d_frame0f;
--- a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
+++ b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
@@ -8,11 +8,11 @@

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int) { throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0>(); }
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&) { throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0>(); }

 #elif !defined HAVE_NVIDIA_OPTFLOW

-cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int)
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&)
 {
    CV_Error(cv::Error::HeaderIsNull, "OpenCV was build without NVIDIA OpticalFlow support");
 }
@@ -236,6 +236,8 @@ private:
    bool m_enableExternalHints;
    bool m_enableCostBuffer;
    int m_gpuId;
+    Stream m_inputStream;
+    Stream m_outputStream;

    CUcontext m_cuContext;
    NV_OF_BUFFER_FORMAT m_format;
@@ -286,13 +288,14 @@ protected:
    std::mutex m_lock;

 public:
-    NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset,
-        bool bEnableTemporalHints, bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId);
+    NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
+        bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, Stream inputStream, Stream outputStream);

    virtual void calc(InputArray inputImage, InputArray referenceImage,
        InputOutputArray flow, Stream& stream = Stream::Null(),
        InputArray hint = cv::noArray(), OutputArray cost = cv::noArray());

+
    virtual void collectGarbage();

    virtual void upSampler(InputArray flow, int width, int height,
@@ -303,11 +306,13 @@ public:

 NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
    int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
-    bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId) :
+    bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId,
+    Stream inputStream, Stream outputStream) :
    m_width(width), m_height(height), m_preset(perfPreset),
    m_enableTemporalHints((NV_OF_BOOL)bEnableTemporalHints),
    m_enableExternalHints((NV_OF_BOOL)bEnableExternalHints),
    m_enableCostBuffer((NV_OF_BOOL)bEnableCostBuffer), m_gpuId(gpuId),
+    m_inputStream(inputStream), m_outputStream(outputStream),
    m_cuContext(nullptr), m_format(NV_OF_BUFFER_FORMAT_GRAYSCALE8),
    m_gridSize(NV_OF_OUTPUT_VECTOR_GRID_SIZE_4)
 {
@@ -387,6 +392,12 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(

    NVOF_API_CALL(GetAPI()->nvOFInit(GetHandle(), &m_initParams));

+    if (m_inputStream || m_outputStream)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
+            StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream)));
+    }
+
    //Input Buffer 1
    NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
        &m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hInputBuffer));
@@ -432,13 +443,12 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
 void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow,
    Stream& stream, InputArray hint, OutputArray cost)
 {
-    Stream inputStream = {};
-    Stream outputStream = {};
-    if (stream)
-        inputStream = stream;
-
-    NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
-        StreamAccessor::getStream(inputStream), StreamAccessor::getStream(outputStream)));
+    if (stream && !m_inputStream)
+    {
+        m_inputStream = stream;
+        NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
+            StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream)));
+    }

    GpuMat frame0GpuMat(_frame0.size(), _frame0.type(), (void*)m_frame0cuDevPtr,
        m_inputBufferStrideInfo.strideInfo[0].strideXInBytes);
@@ -452,12 +462,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
    if (_frame0.isMat())
    {
        //Get Mats from InputArrays
-        frame0GpuMat.upload(_frame0);
+        Mat __frame0 = _frame0.getMat();
+        frame0GpuMat.upload(__frame0, m_inputStream);
    }
    else if (_frame0.isGpuMat())
    {
        //Get GpuMats from InputArrays
-        _frame0.copyTo(frame0GpuMat);
+        GpuMat __frame0 = _frame0.getGpuMat();
+        __frame0.copyTo(frame0GpuMat, m_inputStream);
    }
    else
    {
@@ -469,12 +481,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
    if (_frame1.isMat())
    {
        //Get Mats from InputArrays
-        frame1GpuMat.upload(_frame1);
+        Mat __frame1 = _frame1.getMat();
+        frame1GpuMat.upload(__frame1, m_inputStream);
    }
    else if (_frame1.isGpuMat())
    {
        //Get GpuMats from InputArrays
-        _frame1.copyTo(frame1GpuMat);
+        GpuMat __frame1 = _frame1.getGpuMat();
+        __frame1.copyTo(frame1GpuMat, m_inputStream);
    }
    else
    {
@@ -490,12 +504,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
        if (hint.isMat())
        {
            //Get Mat from InputArray hint
-            hintGpuMat.upload(hint);
+            Mat _hint = hint.getMat();
+            hintGpuMat.upload(_hint, m_inputStream);
        }
        else if(hint.isGpuMat())
        {
            //Get GpuMat from InputArray hint
-            hint.copyTo(hintGpuMat);
+            GpuMat _hint = hint.getGpuMat();
+            _hint.copyTo(hintGpuMat, m_inputStream);
        }
        else
        {
@@ -503,8 +519,6 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
        }
    }

-    inputStream.waitForCompletion();
-
    //Execute Call
    NV_OF_EXECUTE_INPUT_PARAMS exeInParams;
    NV_OF_EXECUTE_OUTPUT_PARAMS exeOutParams;
@@ -518,15 +532,13 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
    memset(&exeOutParams, 0, sizeof(exeOutParams));
    exeOutParams.outputBuffer = m_hOutputBuffer;
    exeOutParams.outputCostBuffer = m_initParams.enableOutputCost == NV_OF_TRUE ?
-        m_hCostBuffer : nullptr;;
+        m_hCostBuffer : nullptr;
    NVOF_API_CALL(GetAPI()->nvOFExecute(GetHandle(), &exeInParams, &exeOutParams));

-    outputStream.waitForCompletion();
-
    if (_flow.isMat())
-        flowXYGpuMat.download(_flow);
+        flowXYGpuMat.download(_flow, m_outputStream);
    else if(_flow.isGpuMat())
-        flowXYGpuMat.copyTo(_flow);
+        flowXYGpuMat.copyTo(_flow, m_outputStream);
    else
        CV_Error(Error::StsBadArg, "Incorrect flow buffer passed. Pass Mat or GpuMat");

@@ -537,13 +549,13 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu
            m_costBufferStrideInfo.strideInfo[0].strideXInBytes);

        if (cost.isMat())
-            costGpuMat.download(cost);
+            costGpuMat.download(cost, m_outputStream);
        else if(cost.isGpuMat())
-            costGpuMat.copyTo(cost);
+            costGpuMat.copyTo(cost, m_outputStream);
        else
            CV_Error(Error::StsBadArg, "Incorrect cost buffer passed. Pass Mat or GpuMat");
    }
-    cuSafeCall(cudaDeviceSynchronize());
+    m_outputStream.waitForCompletion();
 }

 void NvidiaOpticalFlowImpl::collectGarbage()
@@ -574,6 +586,14 @@ void NvidiaOpticalFlowImpl::collectGarbage()
            NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hCostBuffer));
        }
    }
+    if (m_inputStream)
+    {
+        m_inputStream.waitForCompletion();
+    }
+    if (m_outputStream)
+    {
+        m_outputStream.waitForCompletion();
+    }
    if (m_hOF)
    {
        NVOF_API_CALL(GetAPI()->nvOFDestroy(m_hOF));
@@ -586,7 +606,8 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
    Mat flow;
    if (_flow.isMat())
    {
-        _flow.copyTo(flow);
+        Mat __flow = _flow.getMat();
+        __flow.copyTo(flow);
    }
    else if (_flow.isGpuMat())
    {
@@ -640,7 +661,8 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
 Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
    int width, int height, NVIDIA_OF_PERF_LEVEL perfPreset,
    bool bEnableTemporalHints, bool bEnableExternalHints,
-    bool bEnableCostBuffer, int gpuId)
+    bool bEnableCostBuffer, int gpuId,
+    Stream& inputStream, Stream& outputStream)
 {
    return makePtr<NvidiaOpticalFlowImpl>(
        width,
@@ -649,6 +671,8 @@ Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
        bEnableTemporalHints,
        bEnableExternalHints,
        bEnableCostBuffer,
-        gpuId);
+        gpuId,
+        inputStream,
+        outputStream);
 }
 #endif