From 9e26b24d04bbab7943f9846d993d90bb49b6217e Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky
Date: Tue, 25 Apr 2017 18:44:14 +0300
Subject: [PATCH] improved speed of ENet processing.

---
 modules/dnn/samples/torch_enet.cpp            |  15 +-
 modules/dnn/src/layers/batch_norm_layer.cpp   |  22 +-
 modules/dnn/src/layers/convolution_layer.cpp  | 533 ++++++++----------
 modules/dnn/src/layers/elementwise_layers.cpp |  62 +-
 modules/dnn/src/layers/eltwise_layer.cpp      |  15 +-
 modules/dnn/src/layers/op_im2col.cpp          | 323 +++++++++++
 modules/dnn/src/layers/op_im2col.hpp          | 265 +--------
 7 files changed, 639 insertions(+), 596 deletions(-)

diff --git a/modules/dnn/samples/torch_enet.cpp b/modules/dnn/samples/torch_enet.cpp
index feb276cb4..1dbf6c548 100644
--- a/modules/dnn/samples/torch_enet.cpp
+++ b/modules/dnn/samples/torch_enet.cpp
@@ -98,14 +98,19 @@ int main(int argc, char **argv)
     net.setBlob("", inputBlob);        //set the network input
     //! [Set input blob]
 
+    const int N = 3;
     TickMeter tm;
-    tm.start();
 
     //! [Make forward pass]
-    net.forward();                          //compute output
-    //! [Make forward pass]
-
-    tm.stop();
+    for( int i = 0; i < N; i++ )
+    {
+        TickMeter tm_;
+        tm_.start();
+        net.forward();                          //compute output
+        tm_.stop();
+        if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
+            tm = tm_;
+    }
 
     //! [Gather output]
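Note on the timing change above: a single timed forward pass mostly measures one-time allocation and memory-layout work, so the sample now reports the best of N = 3 runs instead. A minimal standalone sketch of the same best-of-N pattern, using std::chrono rather than the sample's TickMeter (the helper name is illustrative, not part of the sample):

    #include <chrono>

    // Best-of-N timing: the first run typically pays one-time setup costs,
    // so the minimum over a few runs better reflects steady-state speed.
    template<typename F>
    double bestOfNMillis(F f, int N = 3)
    {
        double best = -1;
        for (int i = 0; i < N; i++)
        {
            auto t0 = std::chrono::steady_clock::now();
            f();
            auto t1 = std::chrono::steady_clock::now();
            double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
            if (best < 0 || ms < best)
                best = ms;   // keep the minimum, like the sample's tm
        }
        return best;
    }

Usage would be, e.g., double ms = bestOfNMillis([&]{ net.forward(); });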
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 04ef3c234..866864a5d 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -41,6 +41,15 @@ public:
             Mat* inp = inputs[i];
             outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
         }
+
+        varMeanScale = 1.f;
+        if (!hasWeights && !hasBias) {
+            varMeanScale = *blobs[2].ptr<float>();
+            if (varMeanScale != 0)
+                varMeanScale = 1/varMeanScale;
+        }
+
+        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
     }
 
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
@@ -52,16 +61,6 @@ public:
         int weightsBlobIndex = 2;
         int biasBlobIndex = weightsBlobIndex + hasWeights;
 
-        float varMeanScale = 1;
-        if (!hasWeights && !hasBias) {
-            varMeanScale = *blobs[2].ptr<float>();
-            if (varMeanScale != 0)
-                varMeanScale = 1/varMeanScale;
-        }
-
-        Mat invStdMat;
-        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-
         int rows = inpBlob.size[2];
         int cols = inpBlob.size[3];
 
@@ -92,7 +91,8 @@ public:
     }
 
     bool hasWeights, hasBias;
-    float epsilon;
+    float epsilon, varMeanScale;
+    Mat invStdMat;
 };
 
 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
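The batch-norm change hoists the inverse-standard-deviation computation out of forward() into allocate(), so it runs once per shape change instead of once per inference. A minimal sketch of the per-plane transform this enables once invStd is precomputed (standalone, hypothetical helper; the real layer also indexes optional weight/bias blobs per channel):

    #include <opencv2/core.hpp>

    // y = (x - mean) * invStd * weight + bias, with invStd precomputed once as
    // 1/sqrt(var*varMeanScale + epsilon). Folding the constants leaves a single
    // multiply-add over the whole channel plane.
    void batchNormPlane(const cv::Mat& src, cv::Mat& dst,
                        float mean, float invStd, float weight, float bias)
    {
        float w = weight * invStd;          // fold invStd into the scale
        float b = bias - mean * w;          // fold mean into the shift
        src.convertTo(dst, CV_32F, w, b);   // dst = src*w + b
    }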
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 5ce08646a..a9bf0893e 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -54,12 +54,74 @@ namespace dnn
 class BaseConvolutionLayerImpl : public ConvolutionLayer
 {
 public:
-    BaseConvolutionLayerImpl();
-    virtual void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
+    BaseConvolutionLayerImpl()
+    {
+        numOutput = -1;
+        group = -1;
+        inpH = inpW = inpCn = 0;
+        outH = outW = outCn = 0;
+        inpGroupCn = outGroupCn = 0;
+        ksize = 0;
+        bias = false;
+#ifdef HAVE_LAPACK
+        int nthreads = cv::getThreadNum();
+        if (getBlasThreads() != nthreads)
+        {
+            setBlasThreads(nthreads);
+        }
+#endif
+    }
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() > 0);
 
-    void init();
+        init();
+
+        const Mat &input = *inputs[0];
+        CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            CV_Assert(inputs[i]->type() == input.type());
+            CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
+            CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
+        }
+
+        computeInpOutShape(input);
+
+        if (bias)
+        {
+            biasOnesBlob.create(1, outH * outW, input.type());
+            biasOnesBlob.setTo(1);
+        }
+
+        outputs.resize(inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            int sz[] = { inputs[i]->size[0], outCn, outH, outW };
+            outputs[i].create(4, sz, input.type());
+        }
+
+        if (!is1x1())
+        {
+            colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
+            colRowBlob.setTo(0);
+        }
+    }
+
+    void init()
+    {
+        CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
+        CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
+
+        bias = (blobs.size() >= 2);
+    }
     virtual void computeInpOutShape(const Mat &inpBlob) = 0;
-    bool is1x1() const;
+    bool is1x1() const
+    {
+        return (kernel.height == 1 && kernel.width == 1) &&
+               (stride.height == 1 && stride.width == 1) &&
+               (dilation.height == 1 && dilation.width == 1);
+    }
 
     int numOutput, group;
     int inpH, inpW, inpCn;
@@ -76,307 +138,186 @@ public:
 class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
-    virtual void computeInpOutShape(const Mat &inpBlob);
+    void computeInpOutShape(const Mat &input)
+    {
+        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
 
-    void im2col(const Mat &srcImg, Mat &dstCol);
-    void im2row(const Mat &srcImg, Mat &dstRow);
+        numOutput = blobs[0].size[0];
+
+        inpH = input.size[2];
+        inpW = input.size[3];
+        inpCn = input.size[1];
+        outCn = numOutput;
+
+        if (padMode.empty())
+        {
+            outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
+            outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
+        }
+        else
+        {
+            getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
+        }
+
+        group = inpCn / blobs[0].size[1];
+
+        CV_Assert(inpCn % group == 0 && outCn % group == 0);
+        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
+
+        outGroupCn = outCn / group;
+        inpGroupCn = inpCn / group;
+        ksize = inpGroupCn * kernel.height * kernel.width;
+
+        colRowBlobShape.clear();
+        colRowBlobShape.push_back(outH*outW);
+        colRowBlobShape.push_back(ksize);
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() > 0);
+
+        Mat weightsMat = blobs[0].reshape(1, outCn);
+        Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            int numImg = inputs[ii]->size[0];
+            Mat inpMat = *inputs[ii];
+            Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
+
+            for (int n = 0; n < numImg; n++)
+            {
+                for (int g = 0; g < group; g++)
+                {
+                    Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
+
+                    im2row(curInp, colRowBlob);
+
+                    _Range kerRange(g * outGroupCn, outGroupCn);
+                    Mat kerMat = weightsMat.rowRange(kerRange);
+
+                    _Range outRange((g + n * group) * outGroupCn, outGroupCn);
+                    Mat dstMat = outMat.rowRange(outRange);
+
+                    dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T);
+
+                    if (bias)
+                    {
+                        dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
+                    }
+                }
+            }
+        }
+    }
+
+    void im2row(const Mat &srcImg, Mat &dstRow)
+    {
+        if (is1x1())
+        {
+            transpose(srcImg.reshape(1, ksize), dstRow);
+        }
+        else
+        {
+            cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
+                            kernel.width, pad.height, pad.width, stride.height, stride.width,
+                            dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
+        }
+    }
 };
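With the new row-major layout, each output pixel's receptive field becomes one row of the preallocated colRowBlob, so the convolution reduces to a single output = weights x rows^T product per group. A toy illustration of the same reduction using the public cv::gemm rather than the module's internal dnn::gemm (names here are illustrative, not the layer's API):

    #include <opencv2/core.hpp>

    // Convolution as matrix product: rows is (outH*outW) x ksize, one unrolled
    // patch per row; weights is outCn x ksize. Then out = weights * rows^T is
    // outCn x (outH*outW), i.e. one output plane per filter.
    cv::Mat convAsGemm(const cv::Mat& weights, const cv::Mat& rows)
    {
        CV_Assert(weights.cols == rows.cols);   // both have ksize columns
        cv::Mat out;
        cv::gemm(weights, rows, 1.0, cv::noArray(), 0.0, out, cv::GEMM_2_T);
        return out;
    }

Transposing the small weight matrix implicitly (GEMM_2_T on the large rows matrix) keeps both operands in cache-friendly order, which is the point of im2row over im2col here.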
 
 class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
+    void computeInpOutShape(const Mat &inpBlob)
+    {
+        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
 
-    virtual void computeInpOutShape(const Mat &inpBlob);
-    void col2im(const Mat &colMat, Mat &dstImg);
+        numOutput = blobs[0].size[0];
+
+        inpH = inpBlob.size[2];
+        inpW = inpBlob.size[3];
+        inpCn = inpBlob.size[1];
+
+        outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
+        outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
+        outCn = numOutput;
+
+        group = inpCn / blobs[0].size[1];
+        outGroupCn = outCn / group;
+        inpGroupCn = inpCn / group;
+        ksize = outGroupCn * kernel.height * kernel.width;
+
+        CV_Assert(inpCn % group == 0 && outCn % group == 0);
+        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
+
+        colRowBlobShape.clear();
+        colRowBlobShape.push_back(ksize);
+        colRowBlobShape.push_back(inpH * inpW);
+
+        ofsbuf.resize(ksize*3);
+        for( int k = 0; k < ksize; k++ )
+        {
+            int w_offset = k % kernel.width;
+            int h_offset = (k / kernel.width) % kernel.height;
+            int c_im = k / kernel.height / kernel.width;
+            ofsbuf[k*3] = w_offset;
+            ofsbuf[k*3+1] = h_offset;
+            ofsbuf[k*3+2] = c_im;
+        }
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        Mat weightsMat = blobs[0].reshape(1, inpCn);
+        Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            int numImg = inputs[ii]->size[0];
+            Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
+            Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
+
+            for (int n = 0; n < numImg; n++)
+            {
+                for (int g = 0; g < group; g++)
+                {
+                    Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
+                    Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
+
+                    Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
+                    Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
+
+                    dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
+
+                    if (!is1x1())
+                        col2im(colMat, dstMat);
+
+                    if (bias)
+                    {
+                        Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
+                        dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
+                    }
+                }
+            }
+        }
+    }
+
+    void col2im(const Mat &colMat, Mat &dstImg)
+    {
+        if (is1x1())
+        {
+            dstImg = colMat;
+            return;
+        }
+        cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
+                        pad.height, pad.width, stride.height, stride.width,
+                        dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
+    }
+
+    std::vector<int> ofsbuf;
 };
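The new ofsbuf member caches, for every flat column index, the kernel offsets and source channel that col2im previously re-derived with divisions and modulos on every call. A minimal standalone sketch of that decomposition (hypothetical helper name):

    #include <vector>

    // Precompute, for each flat column index k in [0, ksize), the kernel x/y
    // offsets and source channel it decodes to: k = (c_im*kh + h_off)*kw + w_off.
    std::vector<int> makeColOffsets(int ksize, int kw, int kh)
    {
        std::vector<int> ofsbuf(ksize * 3);
        for (int k = 0; k < ksize; k++)
        {
            ofsbuf[k*3 + 0] = k % kw;           // w_offset
            ofsbuf[k*3 + 1] = (k / kw) % kh;    // h_offset
            ofsbuf[k*3 + 2] = k / (kw * kh);    // c_im
        }
        return ofsbuf;
    }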
-
-BaseConvolutionLayerImpl::BaseConvolutionLayerImpl():
-    numOutput(-1), group(-1),
-    inpH(0), inpW(0), inpCn(0),
-    outH(0), outW(0), outCn(0),
-    inpGroupCn(0), outGroupCn(0),
-    ksize(0), bias(false)
-{
-#ifdef HAVE_LAPACK
-    if (getBlasThreads() != cv::getThreadNum())
-    {
-        setBlasThreads(cv::getThreadNum());
-    }
-#endif
-}
-
-void BaseConvolutionLayerImpl::init()
-{
-    CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
-    CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
-
-    bias = (blobs.size() >= 2);
-}
-
-void BaseConvolutionLayerImpl::allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    CV_Assert(inputs.size() > 0);
-
-    init();
-
-    const Mat &input = *inputs[0];
-    CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
-    for (size_t i = 0; i < inputs.size(); i++)
-    {
-        CV_Assert(inputs[i]->type() == input.type());
-        CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
-        CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
-    }
-
-    computeInpOutShape(input);
-
-    if (bias)
-    {
-        biasOnesBlob.create(1, outH * outW, input.type());
-        biasOnesBlob.setTo(1);
-    }
-
-    outputs.resize(inputs.size());
-    for (size_t i = 0; i < inputs.size(); i++)
-    {
-        int sz[] = { inputs[i]->size[0], outCn, outH, outW };
-        outputs[i].create(4, sz, input.type());
-    }
-
-    if (!is1x1())
-    {
-        colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
-        colRowBlob.setTo(0);
-    }
-}
-
-bool BaseConvolutionLayerImpl::is1x1() const
-{
-    return (kernel.height == 1 && kernel.width == 1) &&
-           (stride.height == 1 && stride.width == 1) &&
-           (dilation.height == 1 && dilation.width == 1);
-}
-
-void ConvolutionLayerImpl::computeInpOutShape(const Mat &input)
-{
-    CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
-
-    numOutput = blobs[0].size[0];
-
-    inpH = input.size[2];
-    inpW = input.size[3];
-    inpCn = input.size[1];
-    outCn = numOutput;
-
-    if (padMode.empty())
-    {
-        outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
-        outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
-    }
-    else
-    {
-        getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
-    }
-
-    group = inpCn / blobs[0].size[1];
-
-    CV_Assert(inpCn % group == 0 && outCn % group == 0);
-    CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
-
-    outGroupCn = outCn / group;
-    inpGroupCn = inpCn / group;
-    ksize = inpGroupCn * kernel.height * kernel.width;
-
-    colRowBlobShape.clear();
-    colRowBlobShape.push_back(outH*outW);
-    colRowBlobShape.push_back(ksize);
-}
-
-void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    CV_Assert(inputs.size() > 0);
-
-    Mat weightsMat = blobs[0].reshape(1, outCn);
-    Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
-
-    for (size_t ii = 0; ii < outputs.size(); ii++)
-    {
-        int numImg = inputs[ii]->size[0];
-        Mat inpMat = *inputs[ii];
-        Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
-
-        for (int n = 0; n < numImg; n++)
-        {
-            for (int g = 0; g < group; g++)
-            {
-                Mat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
-
-                im2row(curInp, colMat);
-
-                _Range kerRange(g * outGroupCn, outGroupCn);
-                Mat kerMat = weightsMat.rowRange(kerRange);
-
-                _Range outRange((g + n * group) * outGroupCn, outGroupCn);
-                Mat dstMat = outMat.rowRange(outRange);
-
-                dnn::gemm(kerMat, colMat, 1, dstMat, 0, GEMM_2_T);
-
-                if (bias)
-                {
-                    dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
-                }
-            }
-        }
-    }
-}
-
-void ConvolutionLayerImpl::im2col(const Mat &srcImg, Mat &dstCol)
-{
-    if (is1x1())
-    {
-        dstCol = srcImg.reshape(1, ksize);
-        return;
-    }
-
-    Mat &colMat = colRowBlob;
-    if (srcImg.type() == CV_32F)
-        im2col_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
-                                    kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                    dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
-    if (srcImg.type() == CV_64F)
-        im2col_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
-                                     kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                     dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
-
-    dstCol = colMat;
-}
-
-void ConvolutionLayerImpl::im2row(const Mat &srcImg, Mat &dstRow)
-{
-    if (is1x1())
-    {
-        dstRow = srcImg.reshape(1, ksize).t();
-        return;
-    }
-
-    Mat &colMat = colRowBlob;
-    if (srcImg.type() == CV_32F)
-        im2row_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
-                                    kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                    dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
-    if (srcImg.type() == CV_64F)
-        im2row_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
-                                     kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                     dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
-
-    dstRow = colMat;
-}
-
-//Deconvolution
-
-void DeConvolutionLayerImpl::computeInpOutShape(const Mat &inpBlob)
-{
-    CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
-
-    numOutput = blobs[0].size[0];
-
-    inpH = inpBlob.size[2];
-    inpW = inpBlob.size[3];
-    inpCn = inpBlob.size[1];
-
-    outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
-    outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
-    outCn = numOutput;
-
-    group = inpCn / blobs[0].size[1];
-    outGroupCn = outCn / group;
-    inpGroupCn = inpCn / group;
-    ksize = outGroupCn * kernel.height * kernel.width;
-
-    CV_Assert(inpCn % group == 0 && outCn % group == 0);
-    CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
-
-    colRowBlobShape.clear();
-    colRowBlobShape.push_back(ksize);
-    colRowBlobShape.push_back(inpH * inpW);
-}
-
-void DeConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    Mat weightsMat = blobs[0].reshape(1, inpCn);
-    Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
-
-    for (size_t ii = 0; ii < outputs.size(); ii++)
-    {
-        int numImg = inputs[ii]->size[0];
-        Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
-        Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
-
-        for (int n = 0; n < numImg; n++)
-        {
-            for (int g = 0; g < group; g++)
-            {
-                Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
-                Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
-
-                Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
-                Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
-
-                dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
-
-                if (!is1x1())
-                    col2im(colMat, dstMat);
-
-                if (bias)
-                {
-                    Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
-                    dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
-                }
-            }
-        }
-    }
-}
-
-void DeConvolutionLayerImpl::col2im(const Mat &colMat, Mat &dstImg)
-{
-    if (is1x1())
-    {
-        dstImg = colMat;
-        return;
-    }
-    if (dstImg.type() == CV_32F)
-        col2im_CpuPBody<float>::run(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<float>());
-    if (dstImg.type() == CV_64F)
-        col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<double>());
-}
-
-//Initializers
-
-/*Ptr<ConvolutionLayer> ConvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation)
-{
-    ConvolutionLayerImpl *l = new ConvolutionLayerImpl();
-    l->kernel = kernel;
-    l->pad = pad;
-    l->stride = stride;
-    l->dilation = dilation;
-    return Ptr<ConvolutionLayer>(l);
-}
-
-Ptr<DeconvolutionLayer> DeconvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation, Size adjustPad)
-{
-    DeConvolutionLayerImpl *l = new DeConvolutionLayerImpl();
-    l->kernel = kernel;
-    l->pad = pad;
-    l->stride = stride;
-    l->dilation = dilation;
-    l->adjustPad = adjustPad;
-
-    return Ptr<DeconvolutionLayer>(l);
-}*/
-
 //Convolution and Deconvolution
 static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const LayerParams &params)
 {
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 74e5ab4ac..e0f3936b7 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -15,8 +15,7 @@ using std::pow;
 template<typename Func>
 class ElementWiseLayer : public Func::Layer
 {
-    Func func;
-
+public:
     template<typename Dtype>
     class PBody : public cv::ParallelLoopBody
     {
@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
         }
     };
 
-public:
-
-    ElementWiseLayer(const Func &f=Func()) : func(f) {}
+    ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
 
     void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
@@ -58,9 +55,16 @@ public:
             Range sizeRange = Range(0, dst.total());
             CV_Assert(src.type() == CV_32F);
-            cv::parallel_for_(sizeRange, PBody<float>(dst, func));
+            PBody<float> body(dst, func);
+            if( run_parallel )
+                cv::parallel_for_(sizeRange, body);
+            else
+                body(sizeRange);
         }
     }
+
+    Func func;
+    bool run_parallel;
 };
 
 struct ReLUFunctor
@@ -135,8 +139,24 @@ struct PowerFunctor
     template<typename TFloat>
     inline TFloat operator()(TFloat x) const
     {
-        return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
-            pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+    }
+};
+
+struct PowerFunctor1
+{
+    typedef PowerLayer Layer;
+
+    const float scale;
+    const float shift;
+
+    PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
+        : scale(scale_), shift(shift_) {}
+
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return (TFloat)shift + (TFloat)scale * x;
     }
 };
 
@@ -165,12 +185,12 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
         CV_Assert(inputs.size() == 1);
-
         Mat &inpBlob = *inputs[0];
 
         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Mat &outBlob = outputs[ii];
+            CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
 
             CV_Assert(blobs[0].total() == inpBlob.size[1]);
 
@@ -181,8 +201,16 @@ public:
                 Mat inpBlobPlane = getPlane(inpBlob, 0, n);
                 Mat outBlobPlane = getPlane(outBlob, 0, n);
 
-                threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
-                scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
+                size_t i, planeTotal = inpBlobPlane.total();
+                const float* inptr = inpBlobPlane.ptr<float>();
+                float* outptr = outBlobPlane.ptr<float>();
+                for( i = 0; i < planeTotal; i++ )
+                {
+                    float val = inptr[i];
+                    outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
+                }
+                //threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
+                //scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
             }
         }
     }
@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
 Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 {
     float negativeSlope = params.get<float>("negative_slope", 0.f);
-    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
+    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
     l->setParamsFrom(params);
 
     return l;
@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 
 Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 {
-    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
+    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 
 Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
 {
-    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
+    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
 
 Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
 {
-    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
+    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
     float power = params.get<float>("power", 1.0f);
     float scale = params.get<float>("scale", 1.0f);
     float shift = params.get<float>("shift", 0.0f);
-    Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
+    Ptr<PowerLayer> l(power == 1.f ?
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
     l->setParamsFrom(params);
 
     return l;
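The run_parallel flag exists because for cheap functors (ReLU, or the power==1 linear case split out into PowerFunctor1) the fork/join overhead of cv::parallel_for_ can exceed the per-element work; only the transcendental functors keep the parallel path. A minimal standalone mirror of that dispatch, written with a ParallelLoopBody subclass in the same style as the layer (names are illustrative):

    #include <opencv2/core.hpp>

    // One body, two execution modes: through the thread pool, or inline on
    // the full range when the per-element cost is tiny.
    struct ScaleBody : cv::ParallelLoopBody
    {
        float* buf; float a;
        ScaleBody(float* buf_, float a_) : buf(buf_), a(a_) {}
        virtual void operator()(const cv::Range& r) const
        {
            for (int i = r.start; i < r.end; i++)
                buf[i] *= a;
        }
    };

    void scaleAll(float* buf, int n, float a, bool runParallel)
    {
        ScaleBody body(buf, a);
        if (runParallel)
            cv::parallel_for_(cv::Range(0, n), body);
        else
            body(cv::Range(0, n));   // same code path, no thread-pool dispatch
    }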
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 2e88bbbe5..945d2587f 100755
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -98,15 +98,14 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
+        Mat& output = outputs[0];
         switch (op)
         {
             case SUM:
-            {
                 CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
-                Mat& output = outputs[0];
-                output.setTo(0.);
                 if (0 < coeffs.size())
                 {
+                    output.setTo(0.);
                     for (size_t i = 0; i < inputs.size(); i++)
                     {
                         output += *inputs[i] * coeffs[i];
@@ -114,32 +113,26 @@ public:
                 }
                 else
                 {
-                    for (size_t i = 0; i < inputs.size(); i++)
+                    add(*inputs[0], *inputs[1], output);
+                    for (size_t i = 2; i < inputs.size(); i++)
                     {
                         output += *inputs[i];
                     }
                 }
-            }
             break;
             case PROD:
-            {
-                Mat& output = outputs[0];
                 output.setTo(1.);
                 for (size_t i = 0; i < inputs.size(); i++)
                 {
                     output = output.mul(*inputs[i]);
                 }
-            }
             break;
             case MAX:
-            {
-                Mat& output = outputs[0];
                 cv::max(*inputs[0], *inputs[1], output);
                 for (size_t i = 2; i < inputs.size(); i++)
                 {
                     cv::max(output, *inputs[i], output);
                 }
-            }
             break;
             default:
                 CV_Assert(0);
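For the uncoefficiented SUM the old code zeroed the output and then accumulated every input, touching the output buffer inputs.size()+1 times; seeding it with the sum of the first two inputs saves one full pass, mirroring what the MAX branch already did. A minimal sketch of the same idea, assuming equal-sized float Mats (hypothetical helper):

    #include <opencv2/core.hpp>
    #include <vector>

    // Elementwise sum without the extra zero-fill pass: the first write is
    // inputs[0]+inputs[1], then the remaining inputs fold in place.
    cv::Mat sumAll(const std::vector<cv::Mat>& inputs)
    {
        CV_Assert(inputs.size() >= 2);
        cv::Mat output;
        cv::add(inputs[0], inputs[1], output);  // writes, does not accumulate
        for (size_t i = 2; i < inputs.size(); i++)
            output += inputs[i];
        return output;
    }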
diff --git a/modules/dnn/src/layers/op_im2col.cpp b/modules/dnn/src/layers/op_im2col.cpp
index bae2011d0..690cac28d 100644
--- a/modules/dnn/src/layers/op_im2col.cpp
+++ b/modules/dnn/src/layers/op_im2col.cpp
@@ -44,3 +44,326 @@
 #include "opencl_kernels_dnn.hpp"
 #include "op_im2col.hpp"
 #include "opencl_kernels_dnn.hpp"
+
+namespace cv {
+namespace dnn {
+
+#if 0
+template<typename Dtype>
+class im2col_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    int dilation_h, dilation_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+    im2col_CpuPBody() {}
+public:
+
+    static void run(const Dtype* data_im,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    int dilation_h, int dilation_w,
+                    int height_col, int width_col,
+                    Dtype* data_col)
+    {
+        im2col_CpuPBody<Dtype> t;
+
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
+
+        t.height_col = height_col;
+        t.width_col = width_col;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        cv::parallel_for_(Range(0, t.channels_col), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int c = r.start; c < r.end; ++c)
+        {
+            int w_offset = c % kernel_w;
+            int h_offset = (c / kernel_w) % kernel_h;
+            int c_im = c / kernel_h / kernel_w;
+            for (int h = 0; h < height_col; ++h)
+            {
+                for (int w = 0; w < width_col; ++w)
+                {
+                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                        data_col[(c * height_col + h) * width_col + w] =
+                            data_im[(c_im * height + h_pad) * width + w_pad];
+                    else
+                        data_col[(c * height_col + h) * width_col + w] = 0;
+                }
+            }
+        }
+    }
+};
+#endif
+
+template<typename Dtype>
+class im2row_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    int dilation_h, dilation_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+    im2row_CpuPBody() {}
+public:
+
+    static void run(const Dtype* data_im,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    int dilation_h, int dilation_w,
+                    int height_col, int width_col,
+                    Dtype* data_col)
+    {
+        im2row_CpuPBody<Dtype> t;
+
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
+
+        t.height_col = height_col;
+        t.width_col = width_col;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        int total = t.height_col*t.width_col;
+#if 1
+        t(Range(0, total));
+#else
+        cv::parallel_for_(Range(0, total), t, 16);
+#endif
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        int dh = dilation_h, dw = dilation_w;
+        int kh = kernel_h, kw = kernel_w;
+        Dtype* data_col_ = data_col;
+        const Dtype* data_im_ = data_im;
+        int kelems = kh*kw;
+        AutoBuffer<int> ofs_(kelems);
+        int* ofs = ofs_;
+        int k = 0;
+        for( int k_r = 0; k_r < kernel_h; k_r++ )
+            for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
+                ofs[k] = k_r*dh*width + k_c*dw;
+
+        for (int row = r.start; row < r.end; ++row)
+        {
+            int out_c = row % width_col;
+            int out_r = row / width_col;
+            int out_row_offset = row*kh*kw*channels;
+
+            int start_in_r = out_r * stride_h - pad_h;
+            int start_in_c = out_c * stride_w - pad_w;
+            int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
+            int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
+            int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
+            int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
+
+            if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
+            {
+                // fast path: the receptive field lies fully inside the image
+                for( int i_c = 0; i_c < channels; i_c++ )
+                {
+                    Dtype* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
+                    const Dtype* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
+
+                    for( k = 0; k < kelems; k++ )
+                    {
+                        data_col_c[k] = data_im_c[ofs[k]];
+                    }
+                }
+            }
+            else
+            {
+                // slow path: zero this row's chunk, then copy the valid sub-window
+                memset(data_col_ + out_row_offset, 0, kw*kh*channels*sizeof(data_col_[0]));
+                for(int i_c = 0; i_c < channels; i_c++)
+                {
+                    int channels_offset = i_c * width * height;
+                    int out_ch_offset = i_c*kh*kw;
+                    int in_r = start_in_r + start_k_r*dh;
+
+                    for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
+                    {
+                        int row_offset = in_r*width;
+                        int out_col_offset = k_r*kw;
+                        int in_c = start_in_c + start_k_c*dw;
+
+                        for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
+                        {
+                            int in_index = channels_offset + row_offset + in_c;
+                            int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
+
+                            data_col_[out_index] = data_im_[in_index];
+                        }
+                    }
+                }
+            }
+        }
+    }
+};
+
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col)
+{
+    im2row_CpuPBody<float>::run(data_im, channels, height, width,
+                                kernel_h, kernel_w, pad_h, pad_w,
+                                stride_h, stride_w, dilation_h, dilation_w,
+                                height_col, width_col, data_col);
+}
+
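im2row splits each output row into a fast path (receptive field fully interior, gathered through offsets precomputed once per call) and a slow path (field overlaps the border, zero-fill then copy only the valid sub-window). A reduced standalone sketch of the offset-gather idea, simplified to one channel (hypothetical helpers):

    #include <vector>

    // ofs[k] is the distance from a patch's top-left input pixel to the k-th
    // kernel tap, so gathering an interior patch is a pure indexed copy.
    std::vector<int> makeTapOffsets(int kh, int kw, int dilation, int imgWidth)
    {
        std::vector<int> ofs(kh * kw);
        for (int r = 0, k = 0; r < kh; r++)
            for (int c = 0; c < kw; c++, k++)
                ofs[k] = r * dilation * imgWidth + c * dilation;
        return ofs;
    }

    void gatherPatch(const float* imgTopLeft, const std::vector<int>& ofs, float* dst)
    {
        for (size_t k = 0; k < ofs.size(); k++)
            dst[k] = imgTopLeft[ofs[k]];
    }

Two edits were made to the slow path relative to the raw patch text: the buffer pointers inside the template use Dtype rather than float, and the memset is offset by out_row_offset so each border row zeroes its own chunk rather than the start of the buffer.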
+
+#if 0
+template<typename Dtype>
+class col2im_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_col;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_im;
+    int height_col, width_col;
+
+    col2im_CpuPBody() {}
+
+public:
+    static void run(const Dtype* data_col,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    Dtype* data_im)
+    {
+        //TODO: single-threaded version switch
+
+        col2im_CpuPBody<Dtype> t;
+        t.data_col = data_col;
+        t.data_im = data_im;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        int img_total = channels * height * width;
+
+        cv::parallel_for_(Range(0, img_total), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        const Dtype* data_col_ = data_col;
+        Dtype* data_im_ = data_im;
+        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
+        int coeff_w_col = (1 - stride_w * height_col * width_col);
+        for (int index = r.start; index < r.end; index++)
+        {
+            Dtype val = 0;
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = index / (width * height);
+
+            // compute the start and end of the output
+            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            int w_col_end = std::min(w / stride_w + 1, width_col);
+            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            int h_col_end = std::min(h / stride_h + 1, height_col);
+
+            // equivalent implementation
+            int offset =
+                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
+
+            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+                }
+            }
+            data_im_[index] = val;
+        }
+    }
+};
+#endif
+
+//single-threaded version
+template<typename Dtype>
+void col2im_cpu(const Dtype* data_col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                int dilation_h, int dilation_w,
+                Dtype* data_im,
+                const int* ofsbuf)
+{
+    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+
+    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
+
+    for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
+    {
+        //int w_offset = c % kernel_w;
+        //int h_offset = (c / kernel_w) % kernel_h;
+        //int c_im = c / kernel_h / kernel_w;
+        int w_offset = ofsbuf[0];
+        int h_offset = ofsbuf[1];
+        int c_im = ofsbuf[2];
+
+        for (int h = 0; h < height_col; ++h)
+        {
+            for (int w = 0; w < width_col; ++w)
+            {
+                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+
+                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                    data_im[(c_im * height + h_pad) * width + w_pad] +=
+                        data_col[(c * height_col + h) * width_col + w];
+            }
+        }
+    }
+}
+
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf)
+{
+    //col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
+    col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
+               stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
+}
+
+}
+}
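col2im is the adjoint scatter of im2col: every column entry accumulates back into its source pixel, which is why the image is zero-initialized and written with +=. A minimal sketch of the accumulation for one channel, stride 1, no padding or dilation (hypothetical helper, not the module's API):

    #include <cstring>

    // Every kernel tap (c/kw, c%kw) of every output position (h,w) adds back
    // into input pixel (h + c/kw, w + c%kw). Overlaps accumulate, hence +=.
    void col2imSimple(const float* col, int H, int W, int kh, int kw, float* img)
    {
        int Hc = H - kh + 1, Wc = W - kw + 1;   // output (column) grid size
        std::memset(img, 0, H * W * sizeof(float));
        for (int c = 0; c < kh * kw; c++)
            for (int h = 0; h < Hc; h++)
                for (int w = 0; w < Wc; w++)
                    img[(h + c / kw) * W + (w + c % kw)] += col[(c * Hc + h) * Wc + w];
    }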
diff --git a/modules/dnn/src/layers/op_im2col.hpp b/modules/dnn/src/layers/op_im2col.hpp
index 488fab30f..e3685fa11 100644
--- a/modules/dnn/src/layers/op_im2col.hpp
+++ b/modules/dnn/src/layers/op_im2col.hpp
@@ -49,264 +49,15 @@ namespace cv
 namespace dnn
 {
 
-template<typename Dtype>
-class im2col_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col);
 
-    im2col_CpuPBody() {}
-public:
-
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2col_CpuPBody<Dtype> t;
-
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.channels_col), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        for (int c = r.start; c < r.end; ++c)
-        {
-            int w_offset = c % kernel_w;
-            int h_offset = (c / kernel_w) % kernel_h;
-            int c_im = c / kernel_h / kernel_w;
-            for (int h = 0; h < height_col; ++h)
-            {
-                for (int w = 0; w < width_col; ++w)
-                {
-                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                        data_col[(c * height_col + h) * width_col + w] =
-                            data_im[(c_im * height + h_pad) * width + w_pad];
-                    else
-                        data_col[(c * height_col + h) * width_col + w] = 0;
-                }
-            }
-        }
-    }
-};
-
-template<typename Dtype>
-class im2row_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
-
-    im2row_CpuPBody() {}
-public:
-
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2row_CpuPBody<Dtype> t;
-
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        int dh = dilation_h, dw = dilation_w;
-        Dtype* data_col_ = data_col;
-        const Dtype* data_im_ = data_im;
-
-        for (int row = r.start; row < r.end; ++row)
-        {
-            int out_c = row % width_col;
-            int out_r = row / width_col;
-            int out_row_offset = row*kernel_h*kernel_w*channels;
-
-            int start_in_r = out_r * stride_h - pad_h;
-            int start_in_c = out_c * stride_w - pad_w;
-            int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
-            int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
-            int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
-            int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
-
-            for(int i_c = 0; i_c < channels; i_c++)
-            {
-                int channels_offset = i_c * width * height;
-                int out_ch_offset = i_c*kernel_h*kernel_w;
-                int in_r = start_in_r + start_k_r*dilation_h;
-
-                for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
-                {
-                    int row_offset = in_r*width;
-                    int out_col_offset = k_r*kernel_w;
-                    int in_c = start_in_c + start_k_c*dilation_w;
-
-                    for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
-                    {
-                        int in_index = channels_offset + row_offset + in_c;
-
-                        int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
-
-                        data_col_[out_index] = data_im_[in_index];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template<typename Dtype>
-class col2im_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_col;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    Dtype* data_im;
-    int height_col, width_col;
-
-    col2im_CpuPBody() {}
-
-public:
-
-    static void run(const Dtype* data_col,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    Dtype* data_im)
-    {
-        //TODO: single-threaded version switch
-
-        col2im_CpuPBody<Dtype> t;
-        t.data_col = data_col;
-        t.data_im = data_im;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-        int img_total = channels * height * width;
-
-        cv::parallel_for_(Range(0, img_total), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        const Dtype* data_col_ = data_col;
-        Dtype* data_im_ = data_im;
-        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
-        int coeff_w_col = (1 - stride_w * height_col * width_col);
-        for (int index = r.start; index < r.end; index++)
-        {
-            Dtype val = 0;
-            int w = index % width + pad_w;
-            int h = (index / width) % height + pad_h;
-            int c = index / (width * height);
-
-            // compute the start and end of the output
-            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            int w_col_end = std::min(w / stride_w + 1, width_col);
-            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            int h_col_end = std::min(h / stride_h + 1, height_col);
-
-            // equivalent implementation
-            int offset =
-                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
-
-            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-                }
-            }
-            data_im_[index] = val;
-        }
-    }
-};
-
-//single-threaded version
-template<typename Dtype>
-void col2im_cpu(const Dtype* data_col,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int dilation_h, int dilation_w,
-                Dtype* data_im)
-{
-    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-
-    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
-
-    for (int c = 0; c < channels_col; ++c)
-    {
-        int w_offset = c % kernel_w;
-        int h_offset = (c / kernel_w) % kernel_h;
-        int c_im = c / kernel_h / kernel_w;
-
-        for (int h = 0; h < height_col; ++h)
-        {
-            for (int w = 0; w < width_col; ++w)
-            {
-                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                    data_im[(c_im * height + h_pad) * width + w_pad] +=
-                        data_col[(c * height_col + h) * width_col + w];
-            }
-        }
-    }
-}
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf);
 
 }
 }