Merge pull request #3355 from cudawarped:cudacodec_add_resize_crop

Add scaling and cropping options to `cudacodec::VideoReader`
2025-10-17 07:04:18 +08:00 · 2022-09-22 12:06:56 +03:00
parent de84cc02a8 58e7e307e3
commit 74fce7f71f
8 changed files with 107 additions and 12 deletions
--- a/modules/cudacodec/CMakeLists.txt
+++ b/modules/cudacodec/CMakeLists.txt
@@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Video Encoding/Decoding")

 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wshadow)

-ocv_add_module(cudacodec opencv_core opencv_videoio OPTIONAL opencv_cudev WRAP python)
+ocv_add_module(cudacodec opencv_core opencv_videoio opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudev WRAP python)

 ocv_module_include_directories()
 ocv_glob_module_sources()
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -309,6 +309,9 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo
    CV_PROP_RW double fps;
    CV_PROP_RW int ulNumDecodeSurfaces;//!< Maximum number of internal decode surfaces.
    CV_PROP_RW DeinterlaceMode deinterlaceMode;
+    CV_PROP_RW cv::Size targetSz;//!< Post-processed size of the output frame.
+    CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source.
+    CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame.
 };

 /** @brief cv::cudacodec::VideoReader generic properties identifier.
@@ -516,6 +519,10 @@ surfaces it requires for correct functionality and optimal video memory usage bu
 overall application. The optimal number of decode surfaces (in terms of performance and memory utilization) should be decided by experimentation for each application,
 but it cannot go below the number determined by NVDEC.
@param rawMode Allow the raw encoded data which has been read up until the last call to grab() to be retrieved by calling retrieve(rawData,RAW_DATA_IDX).
+@param targetSz Post-processed size of the output frame (width/height are rounded down to multiples of 2), defaults to the size of the encoded video source.
+@param srcRoi Region of interest decoded from the video source (x/width are rounded down to multiples of 4 and y/height to multiples of 2), defaults to the full frame.
+@param targetRoi Region of interest within the output frame to copy and resize the decoded frame to (x/width are rounded down to multiples of 4 and
+y/height to multiples of 2), defaults to the full frame.
 */
 struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams {
    CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0) {};
@@ -523,6 +530,9 @@ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams {
    CV_PROP_RW bool allowFrameDrop;
    CV_PROP_RW int minNumDecodeSurfaces;
    CV_PROP_RW bool rawMode;
+    CV_PROP_RW cv::Size targetSz;
+    CV_PROP_RW cv::Rect srcRoi;
+    CV_PROP_RW cv::Rect targetRoi;
 };

 /** @brief Creates video reader.
--- a/modules/cudacodec/src/video_decoder.cpp
+++ b/modules/cudacodec/src/video_decoder.cpp
@@ -148,6 +148,14 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
    createInfo_.ulTargetHeight      = videoFormat.height;
    createInfo_.ulMaxWidth          = videoFormat.ulMaxWidth;
    createInfo_.ulMaxHeight         = videoFormat.ulMaxHeight;
+    createInfo_.display_area.left   = videoFormat.displayArea.x;
+    createInfo_.display_area.right  = videoFormat.displayArea.x + videoFormat.displayArea.width;
+    createInfo_.display_area.top    = videoFormat.displayArea.y;
+    createInfo_.display_area.bottom = videoFormat.displayArea.y + videoFormat.displayArea.height;
+    createInfo_.target_rect.left    = videoFormat.targetRoi.x;
+    createInfo_.target_rect.right   = videoFormat.targetRoi.x + videoFormat.targetRoi.width;
+    createInfo_.target_rect.top     = videoFormat.targetRoi.y;
+    createInfo_.target_rect.bottom  = videoFormat.targetRoi.y + videoFormat.targetRoi.height;
    createInfo_.ulNumOutputSurfaces = 2;
    createInfo_.ulCreationFlags     = videoCreateFlags;
    createInfo_.vidLock = lock_;
--- a/modules/cudacodec/src/video_decoder.hpp
+++ b/modules/cudacodec/src/video_decoder.hpp
@@ -49,10 +49,17 @@ namespace cv { namespace cudacodec { namespace detail {
 class VideoDecoder
 {
 public:
-    VideoDecoder(const Codec& codec, const int minNumDecodeSurfaces, CUcontext ctx, CUvideoctxlock lock) : ctx_(ctx), lock_(lock), decoder_(0)
+    VideoDecoder(const Codec& codec, const int minNumDecodeSurfaces, cv::Size targetSz, cv::Rect srcRoi, cv::Rect targetRoi, CUcontext ctx, CUvideoctxlock lock) :
+        ctx_(ctx), lock_(lock), decoder_(0)
    {
        videoFormat_.codec = codec;
        videoFormat_.ulNumDecodeSurfaces = minNumDecodeSurfaces;
+        // nvcuvid enforces this alignment (likely due to chroma subsampling): sizes and y/height trimmed to multiples of 2, x/width to 4
+        videoFormat_.targetSz = cv::Size(targetSz.width - targetSz.width % 2, targetSz.height - targetSz.height % 2);
+        videoFormat_.srcRoi = cv::Rect(srcRoi.x - srcRoi.x % 4, srcRoi.y - srcRoi.y % 2,
+            srcRoi.width - srcRoi.width % 4, srcRoi.height - srcRoi.height % 2);
+        videoFormat_.targetRoi = cv::Rect(targetRoi.x - targetRoi.x % 4, targetRoi.y - targetRoi.y % 2,
+            targetRoi.width - targetRoi.width % 4, targetRoi.height - targetRoi.height % 2);
    }

    ~VideoDecoder()
@@ -66,6 +73,9 @@ public:
    // Get the code-type currently used.
    cudaVideoCodec codec() const { return static_cast<cudaVideoCodec>(videoFormat_.codec); }
    int nDecodeSurfaces() const { return videoFormat_.ulNumDecodeSurfaces; }
+    cv::Size getTargetSz() const { return videoFormat_.targetSz; } // output frame size held in the current video format
+    cv::Rect getSrcRoi() const { return videoFormat_.srcRoi; } // source region of interest held in the current video format
+    cv::Rect getTargetRoi() const { return videoFormat_.targetRoi; } // output region of interest held in the current video format

    unsigned long frameWidth() const { return videoFormat_.ulWidth; }
    unsigned long frameHeight() const { return videoFormat_.ulHeight; }
@@ -89,7 +99,7 @@ public:

        cuSafeCall( cuvidMapVideoFrame(decoder_, picIdx, &ptr, &pitch, &videoProcParams) );

-        return cuda::GpuMat(frameHeight() * 3 / 2, frameWidth(), CV_8UC1, (void*) ptr, pitch);
+        return cuda::GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch);
    }

    void unmapFrame(cuda::GpuMat& frame)
--- a/modules/cudacodec/src/video_parser.cpp
+++ b/modules/cudacodec/src/video_parser.cpp
@@ -120,10 +120,19 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa
        newFormat.nBitDepthMinus8 = format->bit_depth_luma_minus8;
        newFormat.ulWidth = format->coded_width;
        newFormat.ulHeight = format->coded_height;
-        newFormat.width = format->coded_width;
-        newFormat.height = format->coded_height;
-        newFormat.displayArea = Rect(Point(format->display_area.left, format->display_area.top), Point(format->display_area.right, format->display_area.bottom));
        newFormat.fps = format->frame_rate.numerator / static_cast<float>(format->frame_rate.denominator);
+        newFormat.targetSz = thiz->videoDecoder_->getTargetSz();
+        newFormat.width = newFormat.targetSz.width ? newFormat.targetSz.width : format->coded_width;
+        newFormat.height = newFormat.targetSz.height ? newFormat.targetSz.height : format->coded_height;
+        newFormat.srcRoi = thiz->videoDecoder_->getSrcRoi();
+        if (newFormat.srcRoi.empty()) {
+            format->display_area.right = format->coded_width;
+            format->display_area.bottom = format->coded_height;
+            newFormat.displayArea = Rect(Point(format->display_area.left, format->display_area.top), Point(format->display_area.right, format->display_area.bottom));
+        }
+        else
+            newFormat.displayArea = newFormat.srcRoi;
+        newFormat.targetRoi = thiz->videoDecoder_->getTargetRoi();
        newFormat.ulNumDecodeSurfaces = min(!thiz->allowFrameDrop_ ? max(thiz->videoDecoder_->nDecodeSurfaces(), static_cast<int>(format->min_num_decode_surfaces)) :
            format->min_num_decode_surfaces * 2, 32);
        if (format->progressive_sequence)
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -86,7 +86,8 @@ namespace
    class VideoReaderImpl : public VideoReader
    {
    public:
-        explicit VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop = false , const bool udpSource = false);
+        explicit VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop = false , const bool udpSource = false,
+            const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect());
        ~VideoReaderImpl();

        bool nextFrame(GpuMat& frame, Stream& stream) CV_OVERRIDE;
@@ -131,7 +132,8 @@ namespace
        return videoSource_->format();
    }

-    VideoReaderImpl::VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource) :
+    VideoReaderImpl::VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource,
+        const Size targetSz, const Rect srcRoi, const Rect targetRoi) :
        videoSource_(source),
        lock_(0)
    {
@@ -143,7 +145,7 @@ namespace
        cuSafeCall( cuCtxGetCurrent(&ctx) );
        cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
        frameQueue_.reset(new FrameQueue());
-        videoDecoder_.reset(new VideoDecoder(videoSource_->format().codec, minNumDecodeSurfaces, ctx, lock_));
+        videoDecoder_.reset(new VideoDecoder(videoSource_->format().codec, minNumDecodeSurfaces, targetSz, srcRoi, targetRoi, ctx, lock_));
        videoParser_.reset(new VideoParser(videoDecoder_, frameQueue_, allowFrameDrop, udpSource));
        videoSource_->setVideoParser(videoParser_);
        videoSource_->start();
@@ -357,13 +359,15 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const String& filename, const
        videoSource.reset(new CuvidVideoSource(filename));
    }

-    return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource);
+    return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz,
+        params.srcRoi, params.targetRoi);
 }

 Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>& source, const VideoReaderInitParams params)
 {
    Ptr<VideoSource> videoSource(new RawVideoSourceWrapper(source, params.rawMode));
-    return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces);
+    return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz,
+        params.srcRoi, params.targetRoi); // forwards the same options as the filename overload
 }

 #endif // HAVE_NVCUVID
--- a/modules/cudacodec/test/test_precomp.hpp
+++ b/modules/cudacodec/test/test_precomp.hpp
@@ -47,6 +47,8 @@
 #include "opencv2/ts/cuda_test.hpp"

 #include "opencv2/cudacodec.hpp"
+#include "opencv2/cudawarping.hpp"
+#include "opencv2/cudaarithm.hpp"

 #include "cvconfig.h"

--- a/modules/cudacodec/test/test_video.cpp
+++ b/modules/cudacodec/test/test_video.cpp
@@ -54,6 +54,10 @@ PARAM_TEST_CASE(CheckExtraData, cv::cuda::DeviceInfo, check_extra_data_params_t)
 {
 };

+PARAM_TEST_CASE(Scaling, cv::cuda::DeviceInfo, std::string, Size2f, Rect2f, Rect2f) // device, video file, then fractional targetSz, srcRoi, targetRoi
+{
+};
+
 PARAM_TEST_CASE(Video, cv::cuda::DeviceInfo, std::string)
 {
 };
@@ -177,6 +181,47 @@ CUDA_TEST_P(CheckKeyFrame, Reader)
    }
 }

+CUDA_TEST_P(Scaling, Reader)
+{
+    cv::cuda::setDevice(GET_PARAM(0).deviceID());
+    std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + GET_PARAM(1);
+    const Size2f targetSzIn = GET_PARAM(2);
+    const Rect2f srcRoiIn = GET_PARAM(3);
+    const Rect2f targetRoiIn = GET_PARAM(4);
+
+    GpuMat frameOr;
+    {
+        cv::Ptr<cv::cudacodec::VideoReader> readerGs = cv::cudacodec::createVideoReader(inputFile);
+        readerGs->set(cudacodec::ColorFormat::GRAY);
+        ASSERT_TRUE(readerGs->nextFrame(frameOr));
+    }
+
+    cudacodec::VideoReaderInitParams params;
+    params.targetSz = Size(static_cast<int>(frameOr.cols * targetSzIn.width), static_cast<int>(frameOr.rows * targetSzIn.height));
+    params.srcRoi = Rect(static_cast<int>(frameOr.cols * srcRoiIn.x), static_cast<int>(frameOr.rows * srcRoiIn.y),
+        static_cast<int>(frameOr.cols * srcRoiIn.width), static_cast<int>(frameOr.rows * srcRoiIn.height));
+    params.targetRoi = Rect(static_cast<int>(params.targetSz.width * targetRoiIn.x), static_cast<int>(params.targetSz.height * targetRoiIn.y),
+        static_cast<int>(params.targetSz.width * targetRoiIn.width), static_cast<int>(params.targetSz.height * targetRoiIn.height));
+    cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
+    reader->set(cudacodec::ColorFormat::GRAY);
+    GpuMat frame, frameGs;
+    ASSERT_TRUE(reader->nextFrame(frame));
+    const cudacodec::FormatInfo format = reader->format();
+    // expected geometry once the reader has aligned it: sizes and ROI y/height to multiples of 2, ROI x/width to multiples of 4
+    const Size targetSzOut(params.targetSz.width / 2 * 2, params.targetSz.height / 2 * 2);
+    const Rect srcRoiOut(params.srcRoi.x / 4 * 4, params.srcRoi.y / 2 * 2, params.srcRoi.width / 4 * 4, params.srcRoi.height / 2 * 2);
+    const Rect targetRoiOut(params.targetRoi.x / 4 * 4, params.targetRoi.y / 2 * 2, params.targetRoi.width / 4 * 4, params.targetRoi.height / 2 * 2);
+    ASSERT_TRUE(format.valid);
+    ASSERT_EQ(targetSzOut, format.targetSz);
+    ASSERT_EQ(srcRoiOut, format.srcRoi);
+    ASSERT_EQ(targetRoiOut, format.targetRoi);
+    ASSERT_EQ(targetSzOut, frame.size());
+    cv::cuda::resize(frameOr(srcRoiOut), frameGs, targetRoiOut.size(), 0, 0, INTER_AREA);
+    // assert on mean absolute error due to different resize algorithms
+    const double mae = cv::cuda::norm(frameGs, frame(targetRoiOut), NORM_L1)/frameGs.size().area();
+    ASSERT_LT(mae, 2.35);
+}
+
 CUDA_TEST_P(Video, Reader)
 {
    cv::cuda::setDevice(GET_PARAM(0).deviceID());
@@ -431,7 +476,14 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckSet, testing::Combine(
    ALL_DEVICES,
    testing::Values("highgui/video/big_buck_bunny.mp4")));

-#define VIDEO_SRC_R  "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
+#define VIDEO_SRC_SCALING "highgui/video/big_buck_bunny.mp4"
+#define TARGET_SZ Size2f(1,1), Size2f(0.8,0.9), Size2f(2.3,1.8) // output size as a fraction of the source frame size
+#define SRC_ROI Rect2f(0,0,1,1), Rect2f(0.25,0.25,0.5,0.5) // source ROI as a fraction of the source frame size
+#define TARGET_ROI Rect2f(0,0,1,1), Rect2f(0.2,0.3,0.6,0.7) // output ROI as a fraction of the requested output size
+INSTANTIATE_TEST_CASE_P(CUDA_Codec, Scaling, testing::Combine(
+    ALL_DEVICES, testing::Values(VIDEO_SRC_SCALING), testing::Values(TARGET_SZ), testing::Values(SRC_ROI), testing::Values(TARGET_ROI)));
+
+#define VIDEO_SRC_R "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
    "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
    "highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
    "cv/tracking/faceocc2/data/faceocc2.webm"