diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
index af8f169c1..bb664386e 100644
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -295,7 +295,7 @@ enum ChromaFormat
 
 /** @brief Deinterlacing mode used by decoder.
 * @param Weave Weave both fields (no deinterlacing). For progressive content and for content that doesn't need deinterlacing.
-* Bob Drop one field.
+* @param Bob Drop one field.
 * @param Adaptive Adaptive deinterlacing needs more video memory than other deinterlacing modes.
 * */
 enum DeinterlaceMode
@@ -305,12 +305,22 @@ enum DeinterlaceMode
     Adaptive = 2
 };
 
+/** @brief Utility function demonstrating how to map the luma histogram when FormatInfo::videoFullRangeFlag == false
+    @param hist Luma histogram \a hist returned from VideoReader::nextFrame(GpuMat& frame, GpuMat& hist, Stream& stream).
+    @param histFull Host histogram equivalent to downloading \a hist after calling cuda::calcHist(InputArray frame, OutputArray hist, Stream& stream).
+
+    @note
+    -   This function demonstrates how to map the luma histogram back so that it is equivalent to the result obtained from cuda::calcHist()
+    if the returned frame was ColorFormat::GRAY.
+ */
+CV_EXPORTS_W void MapHist(const GpuMat& hist, CV_OUT Mat& histFull);
+
 /** @brief Struct providing information about video file format. :
  */
 struct CV_EXPORTS_W_SIMPLE FormatInfo
 {
-    CV_WRAP FormatInfo() : nBitDepthMinus8(-1), nBitDepthChromaMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false),
-        fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false) {};
+    CV_WRAP FormatInfo() : nBitDepthMinus8(-1), nBitDepthChromaMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false),
+        fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false), enableHistogram(false), nCounterBitDepth(0), nMaxHistogramBins(0){};
 
     CV_PROP_RW Codec codec;
     CV_PROP_RW ChromaFormat chromaFormat;
@@ -331,6 +341,9 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo
     CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source.
     CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame.
     CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification.  Internally the conversion from NV12 to BGR obeys ITU 709.
+    CV_PROP_RW bool enableHistogram;//!< Flag requesting histogram output if supported. Exception will be thrown when requested but not supported.
+    CV_PROP_RW int nCounterBitDepth;//!< Bit depth of histogram bins if histogram output is requested and supported.
+    CV_PROP_RW int nMaxHistogramBins;//!< Max number of histogram bins if histogram output is requested and supported.
 };
 
 /** @brief cv::cudacodec::VideoReader generic properties identifier.
@@ -376,6 +389,20 @@ public:
      */
     CV_WRAP virtual bool nextFrame(CV_OUT GpuMat& frame, Stream &stream = Stream::Null()) = 0;
 
+    /** @brief Grabs, decodes and returns the next video frame and frame luma histogram.
+
+    @param [out] frame The video frame.
+    @param [out] histogram Histogram of the luma component of the encoded frame, see note.
+    @param stream Stream for the asynchronous version.
+    @return `false` if no frames have been grabbed.
+
+    If no frames have been grabbed (there are no more frames in video file), the method returns false.
+    The method throws an Exception if an error occurs.
+
+    @note Histogram data is collected by NVDEC during the decoding process resulting in zero performance penalty. NVDEC computes the histogram data for only the luma component of the decoded output, not for the post-processed frame (i.e. when scaling, cropping, etc. are applied).  If the source is encoded using a limited range of luma values (FormatInfo::videoFullRangeFlag == false) then the histogram bin values will correspond to this limited range of values and will need to be mapped to contain the same output as cuda::calcHist().  The MapHist() utility function can be used to perform this mapping on the host if required.
+     */
+    CV_WRAP_AS(nextFrameWithHist) virtual bool nextFrame(CV_OUT GpuMat& frame, CV_OUT GpuMat& histogram, Stream& stream = Stream::Null()) = 0;
+
     /** @brief Returns information about video file format.
     */
     CV_WRAP virtual FormatInfo format() const = 0;
@@ -535,9 +562,10 @@ but it cannot go below the number determined by NVDEC.
 @param srcRoi Region of interest (x/width should be multiples of 4 and y/height multiples of 2) decoded from video source, defaults to the full frame.
 @param targetRoi Region of interest (x/width should be multiples of 4 and y/height multiples of 2) within the output frame to copy and resize the decoded frame to,
 defaults to the full frame.
+@param enableHistogram Request output of decoded luma histogram \a hist from VideoReader::nextFrame(GpuMat& frame, GpuMat& hist, Stream& stream), if hardware supported.
 */
 struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams {
-    CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0) {};
+    CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0), enableHistogram(false){};
     CV_PROP_RW bool udpSource;
     CV_PROP_RW bool allowFrameDrop;
     CV_PROP_RW int minNumDecodeSurfaces;
@@ -545,6 +573,7 @@ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams {
     CV_PROP_RW cv::Size targetSz;
     CV_PROP_RW cv::Rect srcRoi;
     CV_PROP_RW cv::Rect targetRoi;
+    CV_PROP_RW bool enableHistogram;
 };
 
 /** @brief Creates video reader.
diff --git a/modules/cudacodec/misc/python/test/test_cudacodec.py b/modules/cudacodec/misc/python/test/test_cudacodec.py
index 3f41c3bbe..1e5d3755c 100644
--- a/modules/cudacodec/misc/python/test/test_cudacodec.py
+++ b/modules/cudacodec/misc/python/test/test_cudacodec.py
@@ -14,36 +14,53 @@ class cudacodec_test(NewOpenCVTests):
     @unittest.skipIf('OPENCV_TEST_DATA_PATH' not in os.environ,
                      "OPENCV_TEST_DATA_PATH is not defined")
     def test_reader(self):
-        #Test the functionality but not the results of the video reader
+        # Test the functionality but not the results of the VideoReader
 
-        vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/cv/video/1920x1080.avi'
+        vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/highgui/video/big_buck_bunny.h264'
         try:
             reader = cv.cudacodec.createVideoReader(vid_path)
             format_info = reader.format()
             ret, gpu_mat = reader.nextFrame()
             self.assertTrue(ret)
-            self.assertTrue('GpuMat' in str(type(gpu_mat)), msg=type(gpu_mat))
+            self.assertTrue(isinstance(gpu_mat, cv.cuda.GpuMat), msg=type(gpu_mat))
             #TODO: print(cv.utils.dumpInputArray(gpu_mat)) # - no support for GpuMat
 
+            # Retrieve format info
             if(not format_info.valid):
               format_info = reader.format()
             sz = gpu_mat.size()
             self.assertTrue(sz[0] == format_info.width and sz[1] == format_info.height)
 
             # not checking output, therefore sepearate tests for different signatures is unecessary
-            ret, _gpu_mat2 = reader.nextFrame(gpu_mat)
-            #TODO: self.assertTrue(gpu_mat == gpu_mat2)
-            self.assertTrue(ret)
+            ret, gpu_mat_ = reader.nextFrame(gpu_mat)
+            self.assertTrue(ret and gpu_mat_.cudaPtr() == gpu_mat.cudaPtr())
 
+            # Pass VideoReaderInitParams to the decoder and initialization params to the source (cv::VideoCapture)
             params = cv.cudacodec.VideoReaderInitParams()
             params.rawMode = True
+            params.enableHistogram = True
             ms_gs = 1234
+            post_processed_sz = (gpu_mat.size()[0]*2, gpu_mat.size()[1]*2)
+            params.targetSz = post_processed_sz
             reader = cv.cudacodec.createVideoReader(vid_path,[cv.CAP_PROP_OPEN_TIMEOUT_MSEC, ms_gs], params)
             ret, ms = reader.get(cv.CAP_PROP_OPEN_TIMEOUT_MSEC)
             self.assertTrue(ret and ms == ms_gs)
             ret, raw_mode = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_RAW_MODE)
             self.assertTrue(ret and raw_mode)
 
+            # Retrieve image histogram
+            ret, gpu_mat, hist = reader.nextFrameWithHist()
+            self.assertTrue(ret and not gpu_mat.empty() and hist.size() == (256,1))
+            ret, gpu_mat_, hist_ = reader.nextFrameWithHist(gpu_mat, hist)
+            self.assertTrue(ret and not gpu_mat_.empty() and hist_.size() == (256,1))
+            self.assertTrue(gpu_mat_.cudaPtr() == gpu_mat.cudaPtr() and hist_.cudaPtr() == hist.cudaPtr())
+            hist_host = cv.cudacodec.MapHist(hist)
+            self.assertTrue(hist_host.shape == (1,256) and isinstance(hist_host, np.ndarray))
+
+            # Check post processing applied
+            self.assertTrue(gpu_mat.size() == post_processed_sz)
+
+            # Change color format
             ret, colour_code = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_COLOR_FORMAT)
             self.assertTrue(ret and colour_code == cv.cudacodec.ColorFormat_BGRA)
             colour_code_gs = cv.cudacodec.ColorFormat_GRAY
@@ -51,6 +68,7 @@ class cudacodec_test(NewOpenCVTests):
             ret, colour_code = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_COLOR_FORMAT)
             self.assertTrue(ret and colour_code == colour_code_gs)
 
+            # Read raw encoded bitstream
             ret, i_base = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_RAW_PACKAGES_BASE_INDEX)
             self.assertTrue(ret and i_base == 2.0)
             self.assertTrue(reader.grab())
@@ -75,8 +93,8 @@ class cudacodec_test(NewOpenCVTests):
             else:
                 self.skipTest(e.err)
 
-    def test_writer_existence(self):
-        #Test at least the existence of wrapped functions for now
+    def test_writer(self):
+        # Test the functionality but not the results of the VideoWriter
 
         try:
             fd, fname = tempfile.mkstemp(suffix=".h264")
@@ -91,11 +109,12 @@ class cudacodec_test(NewOpenCVTests):
             writer.write(blankFrameIn)
             writer.release()
             encoder_params_out = writer.getEncoderParams()
-            self.assert_true(encoder_params_in.gopLength == encoder_params_out.gopLength)
+            self.assertTrue(encoder_params_in.gopLength == encoder_params_out.gopLength)
             cap = cv.VideoCapture(fname,cv.CAP_FFMPEG)
-            self.assert_true(cap.isOpened())
+            self.assertTrue(cap.isOpened())
             ret, blankFrameOut = cap.read()
-            self.assert_true(ret and blankFrameOut.shape == blankFrameIn.download().shape)
+            self.assertTrue(ret and blankFrameOut.shape == blankFrameIn.download().shape)
+            cap.release()
         except cv.error as e:
             self.assertEqual(e.code, cv.Error.StsNotImplemented)
             self.skipTest("Either NVCUVENC or a GPU hardware encoder is missing or the encoding profile is not supported.")
diff --git a/modules/cudacodec/src/video_decoder.cpp b/modules/cudacodec/src/video_decoder.cpp
index 23a349c3b..10008d9b0 100644
--- a/modules/cudacodec/src/video_decoder.cpp
+++ b/modules/cudacodec/src/video_decoder.cpp
@@ -96,18 +96,18 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
                             cudaVideoCodec_YUYV     == _codec ||
                             cudaVideoCodec_UYVY     == _codec;
 
-#if defined (HAVE_CUDA)
 #if (CUDART_VERSION >= 6050)
-    codecSupported |=       cudaVideoCodec_HEVC     == _codec;
+    codecSupported |= cudaVideoCodec_HEVC == _codec;
+#endif
+#if (CUDART_VERSION >= 7050)
+    codecSupported |= cudaVideoCodec_YUV420 == _codec;
 #endif
 #if  ((CUDART_VERSION == 7050) || (CUDART_VERSION >= 9000))
-    codecSupported |=       cudaVideoCodec_VP8      == _codec ||
-                            cudaVideoCodec_VP9      == _codec ||
-                            cudaVideoCodec_AV1      == _codec ||
-                            cudaVideoCodec_YUV420   == _codec;
+    codecSupported |= cudaVideoCodec_VP8 == _codec || cudaVideoCodec_VP9 == _codec;
 #endif
+#if (CUDART_VERSION >= 9000)
+    codecSupported |= cudaVideoCodec_AV1 == _codec; // AV1 is accepted only when it is the requested codec
 #endif
-
     CV_Assert(codecSupported);
     CV_Assert(  cudaVideoChromaFormat_Monochrome == _chromaFormat ||
                 cudaVideoChromaFormat_420        == _chromaFormat ||
@@ -123,31 +123,55 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
     cuSafeCall(cuCtxPushCurrent(ctx_));
     cuSafeCall(cuvidGetDecoderCaps(&decodeCaps));
     cuSafeCall(cuCtxPopCurrent(NULL));
-    if (!(decodeCaps.bIsSupported && (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)))){
-        CV_LOG_ERROR(NULL, "Video source is not supported by hardware video decoder.");
-        CV_Error(Error::StsUnsupportedFormat, "Video source is not supported by hardware video decoder");
+    if (!(decodeCaps.bIsSupported && (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)))) {
+        CV_Error(Error::StsUnsupportedFormat, "Video source is not supported by hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec.");
     }
+
+    if (videoFormat.enableHistogram) {
+        if (!decodeCaps.bIsHistogramSupported) {
+            CV_Error(Error::StsBadArg, "Luma histogram output is not supported for current codec and/or on current device.");
+        }
+
+        if (decodeCaps.nCounterBitDepth != 32) {
+            std::ostringstream error;
+            error << "Luma histogram output disabled due to current device using " << decodeCaps.nCounterBitDepth << " bit bins. Histogram output only supports 32 bit bins.";
+            CV_Error(Error::StsBadArg, error.str());
+        }
+        else {
+            videoFormat_.nCounterBitDepth = decodeCaps.nCounterBitDepth;
+            videoFormat_.nMaxHistogramBins = decodeCaps.nMaxHistogramBins;
+        }
+    }
+
     CV_Assert(videoFormat.ulWidth >= decodeCaps.nMinWidth &&
         videoFormat.ulHeight >= decodeCaps.nMinHeight &&
         videoFormat.ulWidth <= decodeCaps.nMaxWidth &&
         videoFormat.ulHeight <= decodeCaps.nMaxHeight);
 
-    CV_Assert((videoFormat.width >> 4)* (videoFormat.height >> 4) <= decodeCaps.nMaxMBCount);
+    CV_Assert((videoFormat.width >> 4) * (videoFormat.height >> 4) <= decodeCaps.nMaxMBCount);
+#else
+    if (videoFormat.enableHistogram) {
+        CV_Error(Error::StsBadArg, "Luma histogram output is not supported when CUDA Toolkit version <= 9.0.");
+    }
 #endif
+
     // Create video decoder
     CUVIDDECODECREATEINFO createInfo_ = {};
+#if (CUDART_VERSION >= 9000)
+    createInfo_.enableHistogram = videoFormat.enableHistogram;
+    createInfo_.bitDepthMinus8 = videoFormat.nBitDepthMinus8;
+    createInfo_.ulMaxWidth = videoFormat.ulMaxWidth;
+    createInfo_.ulMaxHeight = videoFormat.ulMaxHeight;
+#endif
     createInfo_.CodecType           = _codec;
     createInfo_.ulWidth             = videoFormat.ulWidth;
     createInfo_.ulHeight            = videoFormat.ulHeight;
     createInfo_.ulNumDecodeSurfaces = videoFormat.ulNumDecodeSurfaces;
     createInfo_.ChromaFormat    = _chromaFormat;
-    createInfo_.bitDepthMinus8 = videoFormat.nBitDepthMinus8;
     createInfo_.OutputFormat    = cudaVideoSurfaceFormat_NV12;
     createInfo_.DeinterlaceMode = static_cast<cudaVideoDeinterlaceMode>(videoFormat.deinterlaceMode);
     createInfo_.ulTargetWidth       = videoFormat.width;
     createInfo_.ulTargetHeight      = videoFormat.height;
-    createInfo_.ulMaxWidth          = videoFormat.ulMaxWidth;
-    createInfo_.ulMaxHeight         = videoFormat.ulMaxHeight;
     createInfo_.display_area.left   = videoFormat.displayArea.x;
     createInfo_.display_area.right  = videoFormat.displayArea.x + videoFormat.displayArea.width;
     createInfo_.display_area.top    = videoFormat.displayArea.y;
@@ -169,12 +193,10 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
 
 int cv::cudacodec::detail::VideoDecoder::reconfigure(const FormatInfo& videoFormat) {
     if (videoFormat.nBitDepthMinus8 != videoFormat_.nBitDepthMinus8 || videoFormat.nBitDepthChromaMinus8 != videoFormat_.nBitDepthChromaMinus8) {
-        CV_LOG_ERROR(NULL, "Reconfigure Not supported for bit depth change");
         CV_Error(Error::StsUnsupportedFormat, "Reconfigure Not supported for bit depth change");
     }
 
     if (videoFormat.chromaFormat != videoFormat_.chromaFormat) {
-        CV_LOG_ERROR(NULL, "Reconfigure Not supported for chroma format change");
         CV_Error(Error::StsUnsupportedFormat, "Reconfigure Not supported for chroma format change");
     }
 
@@ -183,7 +205,6 @@ int cv::cudacodec::detail::VideoDecoder::reconfigure(const FormatInfo& videoForm
     if ((videoFormat.ulWidth > videoFormat_.ulMaxWidth) || (videoFormat.ulHeight > videoFormat_.ulMaxHeight)) {
         // For VP9, let driver  handle the change if new width/height > maxwidth/maxheight
         if (videoFormat.codec != Codec::VP9) {
-            CV_LOG_ERROR(NULL, "Reconfigure Not supported when width/height > maxwidth/maxheight");
             CV_Error(Error::StsUnsupportedFormat, "Reconfigure Not supported when width/height > maxwidth/maxheight");
         }
     }
diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp
index 96338d7e4..a32bf1bb0 100644
--- a/modules/cudacodec/src/video_decoder.hpp
+++ b/modules/cudacodec/src/video_decoder.hpp
@@ -49,11 +49,12 @@ namespace cv { namespace cudacodec { namespace detail {
 class VideoDecoder
 {
 public:
-    VideoDecoder(const Codec& codec, const int minNumDecodeSurfaces, cv::Size targetSz, cv::Rect srcRoi, cv::Rect targetRoi, CUcontext ctx, CUvideoctxlock lock) :
+    VideoDecoder(const Codec& codec, const int minNumDecodeSurfaces, cv::Size targetSz, cv::Rect srcRoi, cv::Rect targetRoi, const bool enableHistogram, CUcontext ctx, CUvideoctxlock lock) :
         ctx_(ctx), lock_(lock), decoder_(0)
     {
         videoFormat_.codec = codec;
         videoFormat_.ulNumDecodeSurfaces = minNumDecodeSurfaces;
+        videoFormat_.enableHistogram = enableHistogram;
         // alignment enforced by nvcuvid, likely due to chroma subsampling
         videoFormat_.targetSz.width = targetSz.width - targetSz.width % 2; videoFormat_.targetSz.height = targetSz.height - targetSz.height % 2;
         videoFormat_.srcRoi.x = srcRoi.x - srcRoi.x % 4; videoFormat_.srcRoi.width = srcRoi.width - srcRoi.width % 4;
@@ -88,13 +89,14 @@ public:
 
     cudaVideoChromaFormat chromaFormat() const { return static_cast<cudaVideoChromaFormat>(videoFormat_.chromaFormat); }
     int nBitDepthMinus8() const { return videoFormat_.nBitDepthMinus8; }
+    bool enableHistogram() const { return videoFormat_.enableHistogram; }
 
     bool decodePicture(CUVIDPICPARAMS* picParams)
     {
         return cuvidDecodePicture(decoder_, picParams) == CUDA_SUCCESS;
     }
 
-    cuda::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
+    GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
     {
         CUdeviceptr ptr;
         unsigned int pitch;
diff --git a/modules/cudacodec/src/video_parser.cpp b/modules/cudacodec/src/video_parser.cpp
index 459db17da..1aba16d58 100644
--- a/modules/cudacodec/src/video_parser.cpp
+++ b/modules/cudacodec/src/video_parser.cpp
@@ -78,9 +78,18 @@ bool cv::cudacodec::detail::VideoParser::parseVideoData(const unsigned char* dat
     if (rawMode)
         currentFramePackets.push_back(RawPacket(data, size, containsKeyFrame));
 
-    if (cuvidParseVideoData(parser_, &packet) != CUDA_SUCCESS)
-    {
-        CV_LOG_ERROR(NULL, "Call to cuvidParseVideoData failed!");
+    CUresult retVal = CUDA_SUCCESS;
+    try {
+        retVal = cuvidParseVideoData(parser_, &packet);
+    }
+    catch(const cv::Exception& e) {
+        CV_LOG_ERROR(NULL, e.msg);
+        hasError_ = true;
+        frameQueue_->endDecode();
+        return false;
+    }
+
+    if (retVal != CUDA_SUCCESS) {
         hasError_ = true;
         frameQueue_->endDecode();
         return false;
@@ -149,26 +158,18 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa
         maxH = format->coded_height;
     newFormat.ulMaxWidth = maxW;
     newFormat.ulMaxHeight = maxH;
+    newFormat.enableHistogram = thiz->videoDecoder_->enableHistogram();
 
     thiz->frameQueue_->waitUntilEmpty();
     int retVal = newFormat.ulNumDecodeSurfaces;
-    try
-    {
-        if (thiz->videoDecoder_->inited()) {
-            retVal = thiz->videoDecoder_->reconfigure(newFormat);
-            if (retVal > 1 && newFormat.ulNumDecodeSurfaces != thiz->frameQueue_->getMaxSz())
-                thiz->frameQueue_->resize(newFormat.ulNumDecodeSurfaces);
-        }
-        else {
-            thiz->frameQueue_->init(newFormat.ulNumDecodeSurfaces);
-            thiz->videoDecoder_->create(newFormat);
-        }
+    if (thiz->videoDecoder_->inited()) {
+        retVal = thiz->videoDecoder_->reconfigure(newFormat);
+        if (retVal > 1 && newFormat.ulNumDecodeSurfaces != thiz->frameQueue_->getMaxSz())
+            thiz->frameQueue_->resize(newFormat.ulNumDecodeSurfaces);
     }
-    catch (const cv::Exception&)
-    {
-        CV_LOG_ERROR(NULL, "Attempt to configure Nvidia decoder failed!");
-        thiz->hasError_ = true;
-        retVal = 0;
+    else {
+        thiz->frameQueue_->init(newFormat.ulNumDecodeSurfaces);
+        thiz->videoDecoder_->create(newFormat);
     }
     return retVal;
 }
diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
index cf47d8399..b6ef2ca53 100644
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -50,6 +50,7 @@ using namespace cv::cudacodec;
 
 Ptr<VideoReader> cv::cudacodec::createVideoReader(const String&, const std::vector<int>&, const VideoReaderInitParams) { throw_no_cuda(); return Ptr<VideoReader>(); }
 Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&, const VideoReaderInitParams) { throw_no_cuda(); return Ptr<VideoReader>(); }
+void cv::cudacodec::MapHist(const GpuMat&, Mat&) { throw_no_cuda(); }
 
 #else // HAVE_NVCUVID
 
@@ -111,11 +112,13 @@ namespace
     {
     public:
         explicit VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop = false , const bool udpSource = false,
-            const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect());
+            const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect(), const bool enableHistogram = false);
         ~VideoReaderImpl();
 
         bool nextFrame(GpuMat& frame, Stream& stream) CV_OVERRIDE;
 
+        bool nextFrame(GpuMat& frame, GpuMat& histogram, Stream& stream) CV_OVERRIDE;
+
         FormatInfo format() const CV_OVERRIDE;
 
         bool grab(Stream& stream) CV_OVERRIDE;
@@ -132,7 +135,7 @@ namespace
         bool get(const int propertyId, double& propertyVal) const CV_OVERRIDE;
 
     private:
-        bool internalGrab(GpuMat& frame, Stream& stream);
+        bool internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream);
         void waitForDecoderInit();
 
         Ptr<VideoSource> videoSource_;
@@ -145,13 +148,16 @@ namespace
 
         std::deque< std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> > frames_;
         std::vector<RawPacket> rawPackets;
-        GpuMat lastFrame;
+        GpuMat lastFrame, lastHistogram;
         static const int decodedFrameIdx = 0;
         static const int extraDataIdx = 1;
         static const int rawPacketsBaseIdx = 2;
         ColorFormat colorFormat = ColorFormat::BGRA;
+        static const String errorMsg;
     };
 
+    const String VideoReaderImpl::errorMsg = "Parsing/Decoding video source failed, check GPU memory is available and GPU supports requested functionality.";
+
     FormatInfo VideoReaderImpl::format() const
     {
         return videoSource_->format();
@@ -161,13 +167,13 @@ namespace
         for (;;) {
             if (videoDecoder_->inited()) break;
             if (videoParser_->hasError() || frameQueue_->isEndOfDecode())
-                CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
+                CV_Error(Error::StsError, errorMsg);
             Thread::sleep(1);
         }
     }
 
     VideoReaderImpl::VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource,
-        const Size targetSz, const Rect srcRoi, const Rect targetRoi) :
+        const Size targetSz, const Rect srcRoi, const Rect targetRoi, const bool enableHistogram) :
         videoSource_(source),
         lock_(0)
     {
@@ -179,7 +185,7 @@ namespace
         cuSafeCall( cuCtxGetCurrent(&ctx) );
         cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
         frameQueue_.reset(new FrameQueue());
-        videoDecoder_.reset(new VideoDecoder(videoSource_->format().codec, minNumDecodeSurfaces, targetSz, srcRoi, targetRoi, ctx, lock_));
+        videoDecoder_.reset(new VideoDecoder(videoSource_->format().codec, minNumDecodeSurfaces, targetSz, srcRoi, targetRoi, enableHistogram, ctx, lock_));
         videoParser_.reset(new VideoParser(videoDecoder_, frameQueue_, allowFrameDrop, udpSource));
         videoSource_->setVideoParser(videoParser_);
         videoSource_->start();
@@ -203,10 +209,10 @@ namespace
         CUvideoctxlock m_lock;
     };
 
-    bool VideoReaderImpl::internalGrab(GpuMat& frame, Stream& stream) {
+    bool VideoReaderImpl::internalGrab(GpuMat& frame, GpuMat& histogram, Stream& stream) {
         if (videoParser_->hasError())
-            CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
-
+            CV_Error(Error::StsError, errorMsg);
+        cudacodec::FormatInfo fmt;
         if (frames_.empty())
         {
             CUVIDPARSERDISPINFO displayInfo;
@@ -217,7 +223,7 @@ namespace
                     break;
 
                 if (videoParser_->hasError())
-                    CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
+                    CV_Error(Error::StsError, errorMsg);
 
                 if (frameQueue_->isEndOfDecode())
                     return false;
@@ -228,7 +234,8 @@ namespace
 
             bool isProgressive = displayInfo.progressive_frame != 0;
             const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field;
-            videoSource_->updateFormat(videoDecoder_->format());
+            fmt = videoDecoder_->format();
+            videoSource_->updateFormat(fmt);
 
             for (int active_field = 0; active_field < num_fields; ++active_field)
             {
@@ -254,11 +261,21 @@ namespace
         {
             VideoCtxAutoLock autoLock(lock_);
 
+            unsigned long long cuHistogramPtr = 0;
+            if (fmt.enableHistogram)
+                frameInfo.second.histogram_dptr = &cuHistogramPtr;
+
             // map decoded video frame to CUDA surface
             GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);
 
             cvtFromNv12(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, videoDecoder_->format().videoFullRangeFlag, stream);
 
+            if (fmt.enableHistogram) {
+                const size_t histogramSz = 4 * fmt.nMaxHistogramBins;
+                histogram.create(1, fmt.nMaxHistogramBins, CV_32S);
+                cuSafeCall(cuMemcpyDtoDAsync((CUdeviceptr)(histogram.data), cuHistogramPtr, histogramSz, StreamAccessor::getStream(stream)));
+            }
+
             // unmap video frame
             // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
             videoDecoder_->unmapFrame(decodedFrame);
@@ -272,7 +289,7 @@ namespace
     }
 
     bool VideoReaderImpl::grab(Stream& stream) {
-        return internalGrab(lastFrame, stream);
+        return internalGrab(lastFrame, lastHistogram, stream);
     }
 
     bool VideoReaderImpl::retrieve(OutputArray frame, const size_t idx) const {
@@ -387,7 +404,13 @@ namespace
 
     bool VideoReaderImpl::nextFrame(GpuMat& frame, Stream& stream)
     {
-        if (!internalGrab(frame, stream))
+        GpuMat tmp;
+        return nextFrame(frame, tmp, stream);
+    }
+
+    bool VideoReaderImpl::nextFrame(GpuMat& frame, GpuMat& histogram, Stream& stream)
+    {
+        if (!internalGrab(frame, histogram, stream))
             return false;
         return true;
     }
@@ -412,14 +435,26 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const String& filename, const
     }
 
     return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz,
-        params.srcRoi, params.targetRoi);
+        params.srcRoi, params.targetRoi, params.enableHistogram);
 }
 
 Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>& source, const VideoReaderInitParams params)
 {
     Ptr<VideoSource> videoSource(new RawVideoSourceWrapper(source, params.rawMode));
     return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz,
-        params.srcRoi, params.targetRoi);
+        params.srcRoi, params.targetRoi, params.enableHistogram);
+}
+
+void cv::cudacodec::MapHist(const GpuMat& hist, Mat& histFull) {
+    Mat histHost; hist.download(histHost); // bring the device histogram to the host
+    histFull.create(histHost.size(), histHost.type());
+    histFull = 0; // start from all-zero bins, counts are accumulated below
+    const float scale = 255.0f / 219.0f; // stretch factor applied to the shifted bin index
+    const int offset = 16; // shift subtracted from the input bin index before scaling
+    for (int iScaled = 0; iScaled < histHost.cols; iScaled++) {
+        const int iHistFull = std::min(std::max(0, static_cast<int>(std::round((iScaled - offset) * scale))), static_cast<int>(histFull.total()) - 1); // clamp to a valid output bin
+        histFull.at<int>(iHistFull) += histHost.at<int>(iScaled); // several input bins may land in the same output bin; at<int> assumes 32-bit integer bins
+    }
+}
 
 #endif // HAVE_NVCUVID
diff --git a/modules/cudacodec/src/video_source.cpp b/modules/cudacodec/src/video_source.cpp
index b58d753f7..a81b75e36 100644
--- a/modules/cudacodec/src/video_source.cpp
+++ b/modules/cudacodec/src/video_source.cpp
@@ -137,7 +137,8 @@ void cv::cudacodec::detail::RawVideoSourceWrapper::readLoop(void* userData)
             break;
     }
 
-    thiz->parseVideoData(0, 0, false, false, true);
+    if(!thiz->hasError_)
+        thiz->parseVideoData(0, 0, false, false, true);
 }
 
 #endif // HAVE_NVCUVID
diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp
index 7ecc2924b..ead5fa944 100644
--- a/modules/cudacodec/test/test_video.cpp
+++ b/modules/cudacodec/test/test_video.cpp
@@ -92,6 +92,11 @@ PARAM_TEST_CASE(VideoReadRaw, cv::cuda::DeviceInfo, std::string)
 {
 };
 
+typedef tuple<std::string, bool> histogram_params_t;
+PARAM_TEST_CASE(Histogram, cv::cuda::DeviceInfo, histogram_params_t)
+{
+};
+
 PARAM_TEST_CASE(CheckKeyFrame, cv::cuda::DeviceInfo, std::string)
 {
 };
@@ -480,6 +485,46 @@ CUDA_TEST_P(VideoReadRaw, Reader)
     ASSERT_EQ(0, remove(fileNameOut.c_str()));
 }
 
+CUDA_TEST_P(Histogram, Reader)
+{
+    cuda::setDevice(GET_PARAM(0).deviceID());
+    const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(1));
+    const bool histAvailable = get<1>(GET_PARAM(1));
+    cudacodec::VideoReaderInitParams params;
+    params.enableHistogram = histAvailable;
+    Ptr<cudacodec::VideoReader> reader;
+    try {
+        reader = cudacodec::createVideoReader(inputFile, {}, params);
+    }
+    catch (const cv::Exception& e) {
+        throw SkipTestException(e.msg);
+    }
+    const cudacodec::FormatInfo fmt = reader->format();
+    ASSERT_EQ(histAvailable, fmt.enableHistogram);
+    reader->set(cudacodec::ColorFormat::GRAY);
+    GpuMat frame, hist;
+    ASSERT_TRUE(reader->nextFrame(frame, hist)); // fail here rather than compare histograms of a frame that was never decoded
+    if (histAvailable) {
+        ASSERT_TRUE(!hist.empty());
+        Mat frameHost, histGsHostFloat, histGs, histHost;
+        frame.download(frameHost);
+        const int histSize = 256;
+        const float range[] = { 0, 256 };
+        const float* histRange[] = { range };
+        cv::calcHist(&frameHost, 1, 0, Mat(), histGsHostFloat, 1, &histSize, histRange);
+        histGsHostFloat.convertTo(histGs, CV_32S);
+        if (fmt.videoFullRangeFlag)
+            hist.download(histHost);
+        else
+            cudacodec::MapHist(hist, histHost);
+        const double err = cv::norm(histGs.t(), histHost, NORM_INF);
+        ASSERT_EQ(err, 0);
+    }
+    else {
+        ASSERT_TRUE(hist.empty());
+    }
+}
+
 CUDA_TEST_P(CheckParams, Reader)
 {
     std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4";
@@ -844,6 +889,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, VideoReadRaw, testing::Combine(
     ALL_DEVICES,
     testing::Values(VIDEO_SRC_RW)));
 
+const histogram_params_t histogram_params[] =
+{
+    histogram_params_t("highgui/video/big_buck_bunny.mp4", false),
+    histogram_params_t("highgui/video/big_buck_bunny.h264", true),
+    histogram_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true),
+};
+
+INSTANTIATE_TEST_CASE_P(CUDA_Codec, Histogram, testing::Combine(ALL_DEVICES,testing::ValuesIn(histogram_params)));
+
 const check_extra_data_params_t check_extra_data_params[] =
 {
     check_extra_data_params_t("highgui/video/big_buck_bunny.mp4", 45),