From c49d0b906420dd8da69ffd0bd11e6db72a9eb549 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 7 May 2020 20:49:26 +0530 Subject: [PATCH] Modified Stream support functionality. --- .../include/opencv2/cudaoptflow.hpp | 14 ++- modules/cudaoptflow/samples/optical_flow.cpp | 4 +- modules/cudaoptflow/src/nvidiaOpticalFlow.cpp | 86 ++++++++++++------- 3 files changed, 69 insertions(+), 35 deletions(-) diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp index b434e9dfb..9fde39252 100644 --- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp +++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp @@ -118,7 +118,9 @@ public: @param inputImage Input image. @param referenceImage Reference image of the same size and the same type as input image. @param flow A buffer consisting of inputImage.Size() / getGridSize() flow vectors in CV_16SC2 format. - @param stream Stream for the asynchronous version. + @param stream It is highly recommended that CUDA streams for pre and post processing of optical flow vectors should be set once per session in create() function as a part of optical flow session creation. + This parameter is left here for backward compatibility and may be removed in the future. + Default value is NULL stream; @param hint Hint buffer if client provides external hints. Must have same size as flow buffer. Caller can provide flow vectors as hints for optical flow calculation. @param cost Cost buffer contains numbers indicating the confidence associated with each of the generated flow vectors. @@ -435,6 +437,12 @@ public: @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false. @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false. @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0. + @param inputStream Optical flow algorithm may optionally involve cuda preprocessing on the input buffers. + The input cuda stream can be used to pipeline and synchronize the cuda preprocessing tasks with OF HW engine. + If input stream is not set, the execute function will use default stream which is NULL stream; + @param outputStream Optical flow algorithm may optionally involve cuda post processing on the output flow vectors. + The output cuda stream can be used to pipeline and synchronize the cuda post processing tasks with OF HW engine. + If output stream is not set, the execute function will use default stream which is NULL stream; */ CV_WRAP static Ptr create( int width, @@ -444,7 +452,9 @@ public: bool enableTemporalHints = false, bool enableExternalHints = false, bool enableCostBuffer = false, - int gpuId = 0); + int gpuId = 0, + Stream& inputStream = Stream::Null(), + Stream& outputStream = Stream::Null()); }; //! @} diff --git a/modules/cudaoptflow/samples/optical_flow.cpp b/modules/cudaoptflow/samples/optical_flow.cpp index 289883b23..2bbea63ba 100644 --- a/modules/cudaoptflow/samples/optical_flow.cpp +++ b/modules/cudaoptflow/samples/optical_flow.cpp @@ -181,8 +181,8 @@ int main(int argc, const char* argv[]) Ptr lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7)); Ptr farn = cuda::FarnebackOpticalFlow::create(); Ptr tvl1 = cuda::OpticalFlowDual_TVL1::create(); - Ptr nvof = cuda::NvidiaOpticalFlow_1_0::create( - frame0.size().width, frame0.size().height, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST); + Ptr nvof = cuda::NvidiaOpticalFlow_1_0::create(frame0.size().width, frame0.size().height, + NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, false, false, false, 0, Stream(), Stream()); { GpuMat d_frame0f; diff --git a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp index 03f4b86ca..b5c760da2 100644 --- a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp +++ b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp @@ -8,11 +8,11 @@ #if !defined HAVE_CUDA || defined(CUDA_DISABLER) -cv::Ptr cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int) { throw_no_cuda(); return cv::Ptr(); } +cv::Ptr cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&) { throw_no_cuda(); return cv::Ptr(); } #elif !defined HAVE_NVIDIA_OPTFLOW -cv::Ptr cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int) +cv::Ptr cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&) { CV_Error(cv::Error::HeaderIsNull, "OpenCV was build without NVIDIA OpticalFlow support"); } @@ -236,6 +236,8 @@ private: bool m_enableExternalHints; bool m_enableCostBuffer; int m_gpuId; + Stream m_inputStream; + Stream m_outputStream; CUcontext m_cuContext; NV_OF_BUFFER_FORMAT m_format; @@ -286,13 +288,14 @@ protected: std::mutex m_lock; public: - NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset, - bool bEnableTemporalHints, bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId); + NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints, + bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, Stream inputStream, Stream outputStream); virtual void calc(InputArray inputImage, InputArray referenceImage, InputOutputArray flow, Stream& stream = Stream::Null(), InputArray hint = cv::noArray(), OutputArray cost = cv::noArray()); + virtual void collectGarbage(); virtual void upSampler(InputArray flow, int width, int height, @@ -303,11 +306,13 @@ public: NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl( int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints, - bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId) : + bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, + Stream inputStream, Stream outputStream) : m_width(width), m_height(height), m_preset(perfPreset), m_enableTemporalHints((NV_OF_BOOL)bEnableTemporalHints), m_enableExternalHints((NV_OF_BOOL)bEnableExternalHints), m_enableCostBuffer((NV_OF_BOOL)bEnableCostBuffer), m_gpuId(gpuId), + m_inputStream(inputStream), m_outputStream(outputStream), m_cuContext(nullptr), m_format(NV_OF_BUFFER_FORMAT_GRAYSCALE8), m_gridSize(NV_OF_OUTPUT_VECTOR_GRID_SIZE_4) { @@ -387,6 +392,12 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl( NVOF_API_CALL(GetAPI()->nvOFInit(GetHandle(), &m_initParams)); + if (m_inputStream || m_outputStream) + { + NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(), + StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream))); + } + //Input Buffer 1 NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(), &m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hInputBuffer)); @@ -432,13 +443,12 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl( void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream, InputArray hint, OutputArray cost) { - Stream inputStream = {}; - Stream outputStream = {}; - if (stream) - inputStream = stream; - - NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(), - StreamAccessor::getStream(inputStream), StreamAccessor::getStream(outputStream))); + if (stream && !m_inputStream) + { + m_inputStream = stream; + NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(), + StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream))); + } GpuMat frame0GpuMat(_frame0.size(), _frame0.type(), (void*)m_frame0cuDevPtr, m_inputBufferStrideInfo.strideInfo[0].strideXInBytes); @@ -452,12 +462,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu if (_frame0.isMat()) { //Get Mats from InputArrays - frame0GpuMat.upload(_frame0); + Mat __frame0 = _frame0.getMat(); + frame0GpuMat.upload(__frame0, m_inputStream); } else if (_frame0.isGpuMat()) { //Get GpuMats from InputArrays - _frame0.copyTo(frame0GpuMat); + GpuMat __frame0 = _frame0.getGpuMat(); + __frame0.copyTo(frame0GpuMat, m_inputStream); } else { @@ -469,12 +481,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu if (_frame1.isMat()) { //Get Mats from InputArrays - frame1GpuMat.upload(_frame1); + Mat __frame1 = _frame1.getMat(); + frame1GpuMat.upload(__frame1, m_inputStream); } else if (_frame1.isGpuMat()) { //Get GpuMats from InputArrays - _frame1.copyTo(frame1GpuMat); + GpuMat __frame1 = _frame1.getGpuMat(); + __frame1.copyTo(frame1GpuMat, m_inputStream); } else { @@ -490,12 +504,14 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu if (hint.isMat()) { //Get Mat from InputArray hint - hintGpuMat.upload(hint); + Mat _hint = hint.getMat(); + hintGpuMat.upload(_hint, m_inputStream); } else if(hint.isGpuMat()) { //Get GpuMat from InputArray hint - hint.copyTo(hintGpuMat); + GpuMat _hint = hint.getGpuMat(); + _hint.copyTo(hintGpuMat, m_inputStream); } else { @@ -503,8 +519,6 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu } } - inputStream.waitForCompletion(); - //Execute Call NV_OF_EXECUTE_INPUT_PARAMS exeInParams; NV_OF_EXECUTE_OUTPUT_PARAMS exeOutParams; @@ -518,15 +532,13 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu memset(&exeOutParams, 0, sizeof(exeOutParams)); exeOutParams.outputBuffer = m_hOutputBuffer; exeOutParams.outputCostBuffer = m_initParams.enableOutputCost == NV_OF_TRUE ? - m_hCostBuffer : nullptr;; + m_hCostBuffer : nullptr; NVOF_API_CALL(GetAPI()->nvOFExecute(GetHandle(), &exeInParams, &exeOutParams)); - outputStream.waitForCompletion(); - if (_flow.isMat()) - flowXYGpuMat.download(_flow); + flowXYGpuMat.download(_flow, m_outputStream); else if(_flow.isGpuMat()) - flowXYGpuMat.copyTo(_flow); + flowXYGpuMat.copyTo(_flow, m_outputStream); else CV_Error(Error::StsBadArg, "Incorrect flow buffer passed. Pass Mat or GpuMat"); @@ -537,13 +549,13 @@ void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOu m_costBufferStrideInfo.strideInfo[0].strideXInBytes); if (cost.isMat()) - costGpuMat.download(cost); + costGpuMat.download(cost, m_outputStream); else if(cost.isGpuMat()) - costGpuMat.copyTo(cost); + costGpuMat.copyTo(cost, m_outputStream); else CV_Error(Error::StsBadArg, "Incorrect cost buffer passed. Pass Mat or GpuMat"); } - cuSafeCall(cudaDeviceSynchronize()); + m_outputStream.waitForCompletion(); } void NvidiaOpticalFlowImpl::collectGarbage() @@ -574,6 +586,14 @@ void NvidiaOpticalFlowImpl::collectGarbage() NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hCostBuffer)); } } + if (m_inputStream) + { + m_inputStream.waitForCompletion(); + } + if (m_outputStream) + { + m_outputStream.waitForCompletion(); + } if (m_hOF) { NVOF_API_CALL(GetAPI()->nvOFDestroy(m_hOF)); @@ -586,7 +606,8 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height, Mat flow; if (_flow.isMat()) { - _flow.copyTo(flow); + Mat __flow = _flow.getMat(); + __flow.copyTo(flow); } else if (_flow.isGpuMat()) { @@ -640,7 +661,8 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height, Ptr cv::cuda::NvidiaOpticalFlow_1_0::create( int width, int height, NVIDIA_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints, bool bEnableExternalHints, - bool bEnableCostBuffer, int gpuId) + bool bEnableCostBuffer, int gpuId, + Stream& inputStream, Stream& outputStream) { return makePtr( width, @@ -649,6 +671,8 @@ Ptr cv::cuda::NvidiaOpticalFlow_1_0::create( bEnableTemporalHints, bEnableExternalHints, bEnableCostBuffer, - gpuId); + gpuId, + inputStream, + outputStream); } #endif