From 9e26b24d04bbab7943f9846d993d90bb49b6217e Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky
Date: Tue, 25 Apr 2017 18:44:14 +0300
Subject: [PATCH] improved speed of ENet processing.

---
 modules/dnn/samples/torch_enet.cpp            |  15 +-
 modules/dnn/src/layers/batch_norm_layer.cpp   |  22 +-
 modules/dnn/src/layers/convolution_layer.cpp  | 533 ++++++++----------
 modules/dnn/src/layers/elementwise_layers.cpp |  62 +-
 modules/dnn/src/layers/eltwise_layer.cpp      |  15 +-
 modules/dnn/src/layers/op_im2col.cpp          | 323 +++++++++++
 modules/dnn/src/layers/op_im2col.hpp          | 265 +--------
 7 files changed, 639 insertions(+), 596 deletions(-)

diff --git a/modules/dnn/samples/torch_enet.cpp b/modules/dnn/samples/torch_enet.cpp
index feb276cb4..1dbf6c548 100644
--- a/modules/dnn/samples/torch_enet.cpp
+++ b/modules/dnn/samples/torch_enet.cpp
@@ -98,14 +98,19 @@ int main(int argc, char **argv)
     net.setBlob("", inputBlob);        //set the network input
     //! [Set input blob]
 
+    const int N = 3;
     TickMeter tm;
-    tm.start();
 
     //! [Make forward pass]
-    net.forward();                          //compute output
-    //! [Make forward pass]
-
-    tm.stop();
+    for( int i = 0; i < N; i++ )
+    {
+        TickMeter tm_;
+        tm_.start();
+        net.forward();                          //compute output
+        tm_.stop();
+        if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
+            tm = tm_;
+    }
 
     //! [Gather output]
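Note on the timing change above: a single timed forward pass mostly measures one-time allocation and memory-layout work, so the sample now reports the best of N = 3 runs instead. A minimal standalone sketch of the same best-of-N pattern, using std::chrono rather than the sample's TickMeter (the helper name is illustrative, not part of the sample):

    #include <chrono>

    // Best-of-N timing: the first run typically pays one-time setup costs,
    // so the minimum over a few runs better reflects steady-state speed.
    template<typename F>
    double bestOfNMillis(F f, int N = 3)
    {
        double best = -1;
        for (int i = 0; i < N; i++)
        {
            auto t0 = std::chrono::steady_clock::now();
            f();
            auto t1 = std::chrono::steady_clock::now();
            double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
            if (best < 0 || ms < best)
                best = ms;   // keep the minimum, like the sample's tm
        }
        return best;
    }

Usage would be, e.g., double ms = bestOfNMillis([&]{ net.forward(); });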
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 04ef3c234..866864a5d 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -41,6 +41,15 @@ public:
             Mat* inp = inputs[i];
             outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
         }
+
+        varMeanScale = 1.f;
+        if (!hasWeights && !hasBias) {
+            varMeanScale = *blobs[2].ptr<float>();
+            if (varMeanScale != 0)
+                varMeanScale = 1/varMeanScale;
+        }
+
+        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
     }
 
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
@@ -52,16 +61,6 @@ public:
         int weightsBlobIndex = 2;
         int biasBlobIndex = weightsBlobIndex + hasWeights;
 
-        float varMeanScale = 1;
-        if (!hasWeights && !hasBias) {
-            varMeanScale = *blobs[2].ptr<float>();
-            if (varMeanScale != 0)
-                varMeanScale = 1/varMeanScale;
-        }
-
-        Mat invStdMat;
-        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-
         int rows = inpBlob.size[2];
         int cols = inpBlob.size[3];
 
@@ -92,7 +91,8 @@ public:
     }
 
     bool hasWeights, hasBias;
-    float epsilon;
+    float epsilon, varMeanScale;
+    Mat invStdMat;
 };
 
 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
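The batch-norm change hoists the inverse-standard-deviation computation out of forward() into allocate(), so it runs once per shape change instead of once per inference. A minimal sketch of the per-plane transform this enables once invStd is precomputed (standalone, hypothetical helper; the real layer also indexes optional weight/bias blobs per channel):

    #include <opencv2/core.hpp>

    // y = (x - mean) * invStd * weight + bias, with invStd precomputed once as
    // 1/sqrt(var*varMeanScale + epsilon). Folding the constants leaves a single
    // multiply-add over the whole channel plane.
    void batchNormPlane(const cv::Mat& src, cv::Mat& dst,
                        float mean, float invStd, float weight, float bias)
    {
        float w = weight * invStd;          // fold invStd into the scale
        float b = bias - mean * w;          // fold mean into the shift
        src.convertTo(dst, CV_32F, w, b);   // dst = src*w + b
    }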
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 5ce08646a..a9bf0893e 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -54,12 +54,74 @@ namespace dnn
 class BaseConvolutionLayerImpl : public ConvolutionLayer
 {
 public:
-    BaseConvolutionLayerImpl();
-    virtual void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
+    BaseConvolutionLayerImpl()
+    {
+        numOutput = -1;
+        group = -1;
+        inpH = inpW = inpCn = 0;
+        outH = outW = outCn = 0;
+        inpGroupCn = outGroupCn = 0;
+        ksize = 0;
+        bias = false;
+#ifdef HAVE_LAPACK
+        int nthreads = cv::getThreadNum();
+        if (getBlasThreads() != nthreads)
+        {
+            setBlasThreads(nthreads);
+        }
+#endif
+    }
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() > 0);
 
-    void init();
+        init();
+
+        const Mat &input = *inputs[0];
+        CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            CV_Assert(inputs[i]->type() == input.type());
+            CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
+            CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
+        }
+
+        computeInpOutShape(input);
+
+        if (bias)
+        {
+            biasOnesBlob.create(1, outH * outW, input.type());
+            biasOnesBlob.setTo(1);
+        }
+
+        outputs.resize(inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            int sz[] = { inputs[i]->size[0], outCn, outH, outW };
+            outputs[i].create(4, sz, input.type());
+        }
+
+        if (!is1x1())
+        {
+            colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
+            colRowBlob.setTo(0);
+        }
+    }
+
+    void init()
+    {
+        CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
+        CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
+
+        bias = (blobs.size() >= 2);
+    }
     virtual void computeInpOutShape(const Mat &inpBlob) = 0;
-    bool is1x1() const;
+    bool is1x1() const
+    {
+        return (kernel.height == 1 && kernel.width == 1) &&
+               (stride.height == 1 && stride.width == 1) &&
+               (dilation.height == 1 && dilation.width == 1);
+    }
 
     int numOutput, group;
     int inpH, inpW, inpCn;
@@ -76,307 +138,186 @@ public:
 class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
-    virtual void computeInpOutShape(const Mat &inpBlob);
+    void computeInpOutShape(const Mat &input)
+    {
+        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
 
-    void im2col(const Mat &srcImg, Mat &dstCol);
-    void im2row(const Mat &srcImg, Mat &dstRow);
+        numOutput = blobs[0].size[0];
+
+        inpH = input.size[2];
+        inpW = input.size[3];
+        inpCn = input.size[1];
+        outCn = numOutput;
+
+        if (padMode.empty())
+        {
+            outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
+            outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
+        }
+        else
+        {
+            getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
+        }
+
+        group = inpCn / blobs[0].size[1];
+
+        CV_Assert(inpCn % group == 0 && outCn % group == 0);
+        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
+
+        outGroupCn = outCn / group;
+        inpGroupCn = inpCn / group;
+        ksize = inpGroupCn * kernel.height * kernel.width;
+
+        colRowBlobShape.clear();
+        colRowBlobShape.push_back(outH*outW);
+        colRowBlobShape.push_back(ksize);
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() > 0);
+
+        Mat weightsMat = blobs[0].reshape(1, outCn);
+        Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            int numImg = inputs[ii]->size[0];
+            Mat inpMat = *inputs[ii];
+            Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
+
+            for (int n = 0; n < numImg; n++)
+            {
+                for (int g = 0; g < group; g++)
+                {
+                    Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
+
+                    im2row(curInp, colRowBlob);
+
+                    _Range kerRange(g * outGroupCn, outGroupCn);
+                    Mat kerMat = weightsMat.rowRange(kerRange);
+
+                    _Range outRange((g + n * group) * outGroupCn, outGroupCn);
+                    Mat dstMat = outMat.rowRange(outRange);
+
+                    dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T);
+
+                    if (bias)
+                    {
+                        dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
+                    }
+                }
+            }
+        }
+    }
+
+    void im2row(const Mat &srcImg, Mat &dstRow)
+    {
+        if (is1x1())
+        {
+            transpose(srcImg.reshape(1, ksize), dstRow);
+        }
+        else
+        {
+            cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
+                            kernel.width, pad.height, pad.width, stride.height, stride.width,
+                            dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
+        }
+    }
 };
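With the new row-major layout, each output pixel's receptive field becomes one row of the preallocated colRowBlob, so the convolution reduces to a single output = weights x rows^T product per group. A toy illustration of the same reduction using the public cv::gemm rather than the module's internal dnn::gemm (names here are illustrative, not the layer's API):

    #include <opencv2/core.hpp>

    // Convolution as matrix product: rows is (outH*outW) x ksize, one unrolled
    // patch per row; weights is outCn x ksize. Then out = weights * rows^T is
    // outCn x (outH*outW), i.e. one output plane per filter.
    cv::Mat convAsGemm(const cv::Mat& weights, const cv::Mat& rows)
    {
        CV_Assert(weights.cols == rows.cols);   // both have ksize columns
        cv::Mat out;
        cv::gemm(weights, rows, 1.0, cv::noArray(), 0.0, out, cv::GEMM_2_T);
        return out;
    }

Transposing the small weight matrix implicitly (GEMM_2_T on the large rows matrix) keeps both operands in cache-friendly order, which is the point of im2row over im2col here.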
 
 class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
+    void computeInpOutShape(const Mat &inpBlob)
+    {
+        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
 
-    virtual void computeInpOutShape(const Mat &inpBlob);
-    void col2im(const Mat &colMat, Mat &dstImg);
+        numOutput = blobs[0].size[0];
+
+        inpH = inpBlob.size[2];
+        inpW = inpBlob.size[3];
+        inpCn = inpBlob.size[1];
+
+        outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
+        outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
+        outCn = numOutput;
+
+        group = inpCn / blobs[0].size[1];
+        outGroupCn = outCn / group;
+        inpGroupCn = inpCn / group;
+        ksize = outGroupCn * kernel.height * kernel.width;
+
+        CV_Assert(inpCn % group == 0 && outCn % group == 0);
+        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
+
+        colRowBlobShape.clear();
+        colRowBlobShape.push_back(ksize);
+        colRowBlobShape.push_back(inpH * inpW);
+
+        ofsbuf.resize(ksize*3);
+        for( int k = 0; k < ksize; k++ )
+        {
+            int w_offset = k % kernel.width;
+            int h_offset = (k / kernel.width) % kernel.height;
+            int c_im = k / kernel.height / kernel.width;
+            ofsbuf[k*3] = w_offset;
+            ofsbuf[k*3+1] = h_offset;
+            ofsbuf[k*3+2] = c_im;
+        }
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        Mat weightsMat = blobs[0].reshape(1, inpCn);
+        Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            int numImg = inputs[ii]->size[0];
+            Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
+            Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
+
+            for (int n = 0; n < numImg; n++)
+            {
+                for (int g = 0; g < group; g++)
+                {
+                    Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
+                    Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
+
+                    Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
+                    Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
+
+                    dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
+
+                    if (!is1x1())
+                        col2im(colMat, dstMat);
+
+                    if (bias)
+                    {
+                        Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
+                        dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
+                    }
+                }
+            }
+        }
+    }
+
+    void col2im(const Mat &colMat, Mat &dstImg)
+    {
+        if (is1x1())
+        {
+            dstImg = colMat;
+            return;
+        }
+        cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
+                        pad.height, pad.width, stride.height, stride.width,
+                        dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
+    }
+
+    std::vector<int> ofsbuf;
 };
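The new ofsbuf member caches, for every flat column index, the kernel offsets and source channel that col2im previously re-derived with divisions and modulos on every call. A minimal standalone sketch of that decomposition (hypothetical helper name):

    #include <vector>

    // Precompute, for each flat column index k in [0, ksize), the kernel x/y
    // offsets and source channel it decodes to: k = (c_im*kh + h_off)*kw + w_off.
    std::vector<int> makeColOffsets(int ksize, int kw, int kh)
    {
        std::vector<int> ofsbuf(ksize * 3);
        for (int k = 0; k < ksize; k++)
        {
            ofsbuf[k*3 + 0] = k % kw;           // w_offset
            ofsbuf[k*3 + 1] = (k / kw) % kh;    // h_offset
            ofsbuf[k*3 + 2] = k / (kw * kh);    // c_im
        }
        return ofsbuf;
    }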
-
-BaseConvolutionLayerImpl::BaseConvolutionLayerImpl():
-    numOutput(-1), group(-1),
-    inpH(0), inpW(0), inpCn(0),
-    outH(0), outW(0), outCn(0),
-    inpGroupCn(0), outGroupCn(0),
-    ksize(0), bias(false)
-{
-#ifdef HAVE_LAPACK
-    if (getBlasThreads() != cv::getThreadNum())
-    {
-        setBlasThreads(cv::getThreadNum());
-    }
-#endif
-}
-
-void BaseConvolutionLayerImpl::init()
-{
-    CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
-    CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
-
-    bias = (blobs.size() >= 2);
-}
-
-void BaseConvolutionLayerImpl::allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    CV_Assert(inputs.size() > 0);
-
-    init();
-
-    const Mat &input = *inputs[0];
-    CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
-    for (size_t i = 0; i < inputs.size(); i++)
-    {
-        CV_Assert(inputs[i]->type() == input.type());
-        CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
-        CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
-    }
-
-    computeInpOutShape(input);
-
-    if (bias)
-    {
-        biasOnesBlob.create(1, outH * outW, input.type());
-        biasOnesBlob.setTo(1);
-    }
-
-    outputs.resize(inputs.size());
-    for (size_t i = 0; i < inputs.size(); i++)
-    {
-        int sz[] = { inputs[i]->size[0], outCn, outH, outW };
-        outputs[i].create(4, sz, input.type());
-    }
-
-    if (!is1x1())
-    {
-        colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
-        colRowBlob.setTo(0);
-    }
-}
-
-bool BaseConvolutionLayerImpl::is1x1() const
-{
-    return (kernel.height == 1 && kernel.width == 1) &&
-           (stride.height == 1 && stride.width == 1) &&
-           (dilation.height == 1 && dilation.width == 1);
-}
-
-void ConvolutionLayerImpl::computeInpOutShape(const Mat &input)
-{
-    CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
-
-    numOutput = blobs[0].size[0];
-
-    inpH = input.size[2];
-    inpW = input.size[3];
-    inpCn = input.size[1];
-    outCn = numOutput;
-
-    if (padMode.empty())
-    {
-        outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
-        outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
-    }
-    else
-    {
-        getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
-    }
-
-    group = inpCn / blobs[0].size[1];
-
-    CV_Assert(inpCn % group == 0 && outCn % group == 0);
-    CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
-
-    outGroupCn = outCn / group;
-    inpGroupCn = inpCn / group;
-    ksize = inpGroupCn * kernel.height * kernel.width;
-
-    colRowBlobShape.clear();
-    colRowBlobShape.push_back(outH*outW);
-    colRowBlobShape.push_back(ksize);
-}
-
-void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    CV_Assert(inputs.size() > 0);
-
-    Mat weightsMat = blobs[0].reshape(1, outCn);
-    Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
-
-    for (size_t ii = 0; ii < outputs.size(); ii++)
-    {
-        int numImg = inputs[ii]->size[0];
-        Mat inpMat = *inputs[ii];
-        Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
-
-        for (int n = 0; n < numImg; n++)
-        {
-            for (int g = 0; g < group; g++)
-            {
-                Mat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
-
-                im2row(curInp, colMat);
-
-                _Range kerRange(g * outGroupCn, outGroupCn);
-                Mat kerMat = weightsMat.rowRange(kerRange);
-
-                _Range outRange((g + n * group) * outGroupCn, outGroupCn);
-                Mat dstMat = outMat.rowRange(outRange);
-
-                dnn::gemm(kerMat, colMat, 1, dstMat, 0, GEMM_2_T);
-
-                if (bias)
-                {
-                    dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
-                }
-            }
-        }
-    }
-}
-
-void ConvolutionLayerImpl::im2col(const Mat &srcImg, Mat &dstCol)
-{
-    if (is1x1())
-    {
-        dstCol = srcImg.reshape(1, ksize);
-        return;
-    }
-
-    Mat &colMat = colRowBlob;
-    if (srcImg.type() == CV_32F)
-        im2col_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
-                                    kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                    dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
-    if (srcImg.type() == CV_64F)
-        im2col_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
-                                     kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                     dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
-
-    dstCol = colMat;
-}
-
-void ConvolutionLayerImpl::im2row(const Mat &srcImg, Mat &dstRow)
-{
-    if (is1x1())
-    {
-        dstRow = srcImg.reshape(1, ksize).t();
-        return;
-    }
-
-    Mat &colMat = colRowBlob;
-    if (srcImg.type() == CV_32F)
-        im2row_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
-                                    kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                    dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
-    if (srcImg.type() == CV_64F)
-        im2row_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
-                                     kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                     dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
-
-    dstRow = colMat;
-}
-
-//Deconvolution
-
-void DeConvolutionLayerImpl::computeInpOutShape(const Mat &inpBlob)
-{
-    CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
-
-    numOutput = blobs[0].size[0];
-
-    inpH = inpBlob.size[2];
-    inpW = inpBlob.size[3];
-    inpCn = inpBlob.size[1];
-
-    outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
-    outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
-    outCn = numOutput;
-
-    group = inpCn / blobs[0].size[1];
-    outGroupCn = outCn / group;
-    inpGroupCn = inpCn / group;
-    ksize = outGroupCn * kernel.height * kernel.width;
-
-    CV_Assert(inpCn % group == 0 && outCn % group == 0);
-    CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
-
-    colRowBlobShape.clear();
-    colRowBlobShape.push_back(ksize);
-    colRowBlobShape.push_back(inpH * inpW);
-}
-
-void DeConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    Mat weightsMat = blobs[0].reshape(1, inpCn);
-    Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
-
-    for (size_t ii = 0; ii < outputs.size(); ii++)
-    {
-        int numImg = inputs[ii]->size[0];
-        Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
-        Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
-
-        for (int n = 0; n < numImg; n++)
-        {
-            for (int g = 0; g < group; g++)
-            {
-                Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
-                Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
-
-                Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
-                Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
-
-                dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
-
-                if (!is1x1())
-                    col2im(colMat, dstMat);
-
-                if (bias)
-                {
-                    Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
-                    dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
-                }
-            }
-        }
-    }
-}
-
-void DeConvolutionLayerImpl::col2im(const Mat &colMat, Mat &dstImg)
-{
-    if (is1x1())
-    {
-        dstImg = colMat;
-        return;
-    }
-    if (dstImg.type() == CV_32F)
-        col2im_CpuPBody<float>::run(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<float>());
-    if (dstImg.type() == CV_64F)
-        col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<double>());
-}
-
-//Initializers
-
-/*Ptr<ConvolutionLayer> ConvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation)
-{
-    ConvolutionLayerImpl *l = new ConvolutionLayerImpl();
-    l->kernel = kernel;
-    l->pad = pad;
-    l->stride = stride;
-    l->dilation = dilation;
-    return Ptr<ConvolutionLayer>(l);
-}
-
-Ptr<DeconvolutionLayer> DeconvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation, Size adjustPad)
-{
-    DeConvolutionLayerImpl *l = new DeConvolutionLayerImpl();
-    l->kernel = kernel;
-    l->pad = pad;
-    l->stride = stride;
-    l->dilation = dilation;
-    l->adjustPad = adjustPad;
-
-    return Ptr<DeconvolutionLayer>(l);
-}*/
-
 //Convolution and Deconvolution
 static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const LayerParams &params)
 {
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 74e5ab4ac..e0f3936b7 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -15,8 +15,7 @@ using std::pow;
 template<typename Func>
 class ElementWiseLayer : public Func::Layer
 {
-    Func func;
-
+public:
     template<typename Dtype>
     class PBody : public cv::ParallelLoopBody
     {
@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
         }
     };
 
-public:
-
-    ElementWiseLayer(const Func &f=Func()) : func(f) {}
+    ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
 
     void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
@@ -58,9 +55,16 @@ public:
             Range sizeRange = Range(0, dst.total());
             CV_Assert(src.type() == CV_32F);
-            cv::parallel_for_(sizeRange, PBody<float>(dst, func));
+            PBody<float> body(dst, func);
+            if( run_parallel )
+                cv::parallel_for_(sizeRange, body);
+            else
+                body(sizeRange);
         }
     }
+
+    Func func;
+    bool run_parallel;
 };
 
 struct ReLUFunctor
@@ -135,8 +139,24 @@ struct PowerFunctor
     template<typename TFloat>
     inline TFloat operator()(TFloat x) const
     {
-        return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
-            pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+    }
+};
+
+struct PowerFunctor1
+{
+    typedef PowerLayer Layer;
+
+    const float scale;
+    const float shift;
+
+    PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
+        : scale(scale_), shift(shift_) {}
+
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return (TFloat)shift + (TFloat)scale * x;
     }
 };
 
@@ -165,12 +185,12 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
         CV_Assert(inputs.size() == 1);
-
         Mat &inpBlob = *inputs[0];
 
         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Mat &outBlob = outputs[ii];
+            CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
 
             CV_Assert(blobs[0].total() == inpBlob.size[1]);
 
@@ -181,8 +201,16 @@ public:
                 Mat inpBlobPlane = getPlane(inpBlob, 0, n);
                 Mat outBlobPlane = getPlane(outBlob, 0, n);
 
-                threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
-                scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
+                size_t i, planeTotal = inpBlobPlane.total();
+                const float* inptr = inpBlobPlane.ptr<float>();
+                float* outptr = outBlobPlane.ptr<float>();
+                for( i = 0; i < planeTotal; i++ )
+                {
+                    float val = inptr[i];
+                    outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
+                }
+                //threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
+                //scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
             }
         }
     }
@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
 Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 {
     float negativeSlope = params.get<float>("negative_slope", 0.f);
-    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
+    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
     l->setParamsFrom(params);
 
     return l;
@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 
 Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 {
-    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
+    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 
 Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
 {
-    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
+    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
 
 Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
 {
-    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
+    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
     float power = params.get<float>("power", 1.0f);
     float scale = params.get<float>("scale", 1.0f);
     float shift = params.get<float>("shift", 0.0f);
-    Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
+    Ptr<PowerLayer> l(power == 1.f ?
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
     l->setParamsFrom(params);
 
     return l;
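The run_parallel flag exists because for cheap functors (ReLU, or the power==1 linear case split out into PowerFunctor1) the fork/join overhead of cv::parallel_for_ can exceed the per-element work; only the transcendental functors keep the parallel path. A minimal standalone mirror of that dispatch, written with a ParallelLoopBody subclass in the same style as the layer (names are illustrative):

    #include <opencv2/core.hpp>

    // One body, two execution modes: through the thread pool, or inline on
    // the full range when the per-element cost is tiny.
    struct ScaleBody : cv::ParallelLoopBody
    {
        float* buf; float a;
        ScaleBody(float* buf_, float a_) : buf(buf_), a(a_) {}
        virtual void operator()(const cv::Range& r) const
        {
            for (int i = r.start; i < r.end; i++)
                buf[i] *= a;
        }
    };

    void scaleAll(float* buf, int n, float a, bool runParallel)
    {
        ScaleBody body(buf, a);
        if (runParallel)
            cv::parallel_for_(cv::Range(0, n), body);
        else
            body(cv::Range(0, n));   // same code path, no thread-pool dispatch
    }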
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 2e88bbbe5..945d2587f 100755
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -98,15 +98,14 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
+        Mat& output = outputs[0];
         switch (op)
         {
             case SUM:
-            {
                 CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
-                Mat& output = outputs[0];
-                output.setTo(0.);
                 if (0 < coeffs.size())
                 {
+                    output.setTo(0.);
                     for (size_t i = 0; i < inputs.size(); i++)
                     {
                         output += *inputs[i] * coeffs[i];
@@ -114,32 +113,26 @@ public:
                 }
                 else
                 {
-                    for (size_t i = 0; i < inputs.size(); i++)
+                    add(*inputs[0], *inputs[1], output);
+                    for (size_t i = 2; i < inputs.size(); i++)
                     {
                         output += *inputs[i];
                     }
                 }
-            }
             break;
             case PROD:
-            {
-                Mat& output = outputs[0];
                 output.setTo(1.);
                 for (size_t i = 0; i < inputs.size(); i++)
                 {
                     output = output.mul(*inputs[i]);
                 }
-            }
             break;
             case MAX:
-            {
-                Mat& output = outputs[0];
                 cv::max(*inputs[0], *inputs[1], output);
                 for (size_t i = 2; i < inputs.size(); i++)
                 {
                     cv::max(output, *inputs[i], output);
                 }
-            }
             break;
             default:
                 CV_Assert(0);
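For the uncoefficiented SUM the old code zeroed the output and then accumulated every input, touching the output buffer inputs.size()+1 times; seeding it with the sum of the first two inputs saves one full pass, mirroring what the MAX branch already did. A minimal sketch of the same idea, assuming equal-sized float Mats (hypothetical helper):

    #include <opencv2/core.hpp>
    #include <vector>

    // Elementwise sum without the extra zero-fill pass: the first write is
    // inputs[0]+inputs[1], then the remaining inputs fold in place.
    cv::Mat sumAll(const std::vector<cv::Mat>& inputs)
    {
        CV_Assert(inputs.size() >= 2);
        cv::Mat output;
        cv::add(inputs[0], inputs[1], output);  // writes, does not accumulate
        for (size_t i = 2; i < inputs.size(); i++)
            output += inputs[i];
        return output;
    }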
diff --git a/modules/dnn/src/layers/op_im2col.cpp b/modules/dnn/src/layers/op_im2col.cpp
index bae2011d0..690cac28d 100644
--- a/modules/dnn/src/layers/op_im2col.cpp
+++ b/modules/dnn/src/layers/op_im2col.cpp
@@ -44,3 +44,326 @@
 #include "opencl_kernels_dnn.hpp"
 #include "op_im2col.hpp"
 #include "opencl_kernels_dnn.hpp"
+
+namespace cv {
+namespace dnn {
+
+#if 0
+template<typename Dtype>
+class im2col_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    int dilation_h, dilation_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+    im2col_CpuPBody() {}
+public:
+
+    static void run(const Dtype* data_im,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    int dilation_h, int dilation_w,
+                    int height_col, int width_col,
+                    Dtype* data_col)
+    {
+        im2col_CpuPBody<Dtype> t;
+
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
+
+        t.height_col = height_col;
+        t.width_col = width_col;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        cv::parallel_for_(Range(0, t.channels_col), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int c = r.start; c < r.end; ++c)
+        {
+            int w_offset = c % kernel_w;
+            int h_offset = (c / kernel_w) % kernel_h;
+            int c_im = c / kernel_h / kernel_w;
+            for (int h = 0; h < height_col; ++h)
+            {
+                for (int w = 0; w < width_col; ++w)
+                {
+                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                        data_col[(c * height_col + h) * width_col + w] =
+                            data_im[(c_im * height + h_pad) * width + w_pad];
+                    else
+                        data_col[(c * height_col + h) * width_col + w] = 0;
+                }
+            }
+        }
+    }
+};
+#endif
+
+template<typename Dtype>
+class im2row_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    int dilation_h, dilation_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+    im2row_CpuPBody() {}
+public:
+
+    static void run(const Dtype* data_im,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    int dilation_h, int dilation_w,
+                    int height_col, int width_col,
+                    Dtype* data_col)
+    {
+        im2row_CpuPBody<Dtype> t;
+
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
+
+        t.height_col = height_col;
+        t.width_col = width_col;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        int total = t.height_col*t.width_col;
+#if 1
+        t(Range(0, total));
+#else
+        cv::parallel_for_(Range(0, total), t, 16);
+#endif
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        int dh = dilation_h, dw = dilation_w;
+        int kh = kernel_h, kw = kernel_w;
+        Dtype* data_col_ = data_col;
+        const Dtype* data_im_ = data_im;
+        int kelems = kh*kw;
+        AutoBuffer<int> ofs_(kelems);
+        int* ofs = ofs_;
+        int k = 0;
+        for( int k_r = 0; k_r < kernel_h; k_r++ )
+            for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
+                ofs[k] = k_r*dh*width + k_c*dw;
+
+        for (int row = r.start; row < r.end; ++row)
+        {
+            int out_c = row % width_col;
+            int out_r = row / width_col;
+            int out_row_offset = row*kh*kw*channels;
+
+            int start_in_r = out_r * stride_h - pad_h;
+            int start_in_c = out_c * stride_w - pad_w;
+            int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
+            int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
+            int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
+            int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
+
+            if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
+            {
+                // fast path: the receptive field lies fully inside the image
+                for( int i_c = 0; i_c < channels; i_c++ )
+                {
+                    Dtype* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
+                    const Dtype* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
+
+                    for( k = 0; k < kelems; k++ )
+                    {
+                        data_col_c[k] = data_im_c[ofs[k]];
+                    }
+                }
+            }
+            else
+            {
+                // slow path: zero this row's chunk, then copy the valid sub-window
+                memset(data_col_ + out_row_offset, 0, kw*kh*channels*sizeof(data_col_[0]));
+                for(int i_c = 0; i_c < channels; i_c++)
+                {
+                    int channels_offset = i_c * width * height;
+                    int out_ch_offset = i_c*kh*kw;
+                    int in_r = start_in_r + start_k_r*dh;
+
+                    for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
+                    {
+                        int row_offset = in_r*width;
+                        int out_col_offset = k_r*kw;
+                        int in_c = start_in_c + start_k_c*dw;
+
+                        for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
+                        {
+                            int in_index = channels_offset + row_offset + in_c;
+                            int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
+
+                            data_col_[out_index] = data_im_[in_index];
+                        }
+                    }
+                }
+            }
+        }
+    }
+};
+
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col)
+{
+    im2row_CpuPBody<float>::run(data_im, channels, height, width,
+                                kernel_h, kernel_w, pad_h, pad_w,
+                                stride_h, stride_w, dilation_h, dilation_w,
+                                height_col, width_col, data_col);
+}
+
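im2row splits each output row into a fast path (receptive field fully interior, gathered through offsets precomputed once per call) and a slow path (field overlaps the border, zero-fill then copy only the valid sub-window). A reduced standalone sketch of the offset-gather idea, simplified to one channel (hypothetical helpers):

    #include <vector>

    // ofs[k] is the distance from a patch's top-left input pixel to the k-th
    // kernel tap, so gathering an interior patch is a pure indexed copy.
    std::vector<int> makeTapOffsets(int kh, int kw, int dilation, int imgWidth)
    {
        std::vector<int> ofs(kh * kw);
        for (int r = 0, k = 0; r < kh; r++)
            for (int c = 0; c < kw; c++, k++)
                ofs[k] = r * dilation * imgWidth + c * dilation;
        return ofs;
    }

    void gatherPatch(const float* imgTopLeft, const std::vector<int>& ofs, float* dst)
    {
        for (size_t k = 0; k < ofs.size(); k++)
            dst[k] = imgTopLeft[ofs[k]];
    }

Two edits were made to the slow path relative to the raw patch text: the buffer pointers inside the template use Dtype rather than float, and the memset is offset by out_row_offset so each border row zeroes its own chunk rather than the start of the buffer.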
+
+#if 0
+template<typename Dtype>
+class col2im_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_col;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_im;
+    int height_col, width_col;
+
+    col2im_CpuPBody() {}
+
+public:
+    static void run(const Dtype* data_col,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    Dtype* data_im)
+    {
+        //TODO: single-threaded version switch
+
+        col2im_CpuPBody<Dtype> t;
+        t.data_col = data_col;
+        t.data_im = data_im;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        int img_total = channels * height * width;
+
+        cv::parallel_for_(Range(0, img_total), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        const Dtype* data_col_ = data_col;
+        Dtype* data_im_ = data_im;
+        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
+        int coeff_w_col = (1 - stride_w * height_col * width_col);
+        for (int index = r.start; index < r.end; index++)
+        {
+            Dtype val = 0;
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = index / (width * height);
+
+            // compute the start and end of the output
+            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            int w_col_end = std::min(w / stride_w + 1, width_col);
+            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            int h_col_end = std::min(h / stride_h + 1, height_col);
+
+            // equivalent implementation
+            int offset =
+                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
+
+            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+                }
+            }
+            data_im_[index] = val;
+        }
+    }
+};
+#endif
+
+//single-threaded version
+template<typename Dtype>
+void col2im_cpu(const Dtype* data_col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                int dilation_h, int dilation_w,
+                Dtype* data_im,
+                const int* ofsbuf)
+{
+    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+
+    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
+
+    for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
+    {
+        //int w_offset = c % kernel_w;
+        //int h_offset = (c / kernel_w) % kernel_h;
+        //int c_im = c / kernel_h / kernel_w;
+        int w_offset = ofsbuf[0];
+        int h_offset = ofsbuf[1];
+        int c_im = ofsbuf[2];
+
+        for (int h = 0; h < height_col; ++h)
+        {
+            for (int w = 0; w < width_col; ++w)
+            {
+                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+
+                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                    data_im[(c_im * height + h_pad) * width + w_pad] +=
+                        data_col[(c * height_col + h) * width_col + w];
+            }
+        }
+    }
+}
+
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf)
+{
+    //col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
+    col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
+               stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
+}
+
+}
+}
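col2im is the adjoint scatter of im2col: every column entry accumulates back into its source pixel, which is why the image is zero-initialized and written with +=. A minimal sketch of the accumulation for one channel, stride 1, no padding or dilation (hypothetical helper, not the module's API):

    #include <cstring>

    // Every kernel tap (c/kw, c%kw) of every output position (h,w) adds back
    // into input pixel (h + c/kw, w + c%kw). Overlaps accumulate, hence +=.
    void col2imSimple(const float* col, int H, int W, int kh, int kw, float* img)
    {
        int Hc = H - kh + 1, Wc = W - kw + 1;   // output (column) grid size
        std::memset(img, 0, H * W * sizeof(float));
        for (int c = 0; c < kh * kw; c++)
            for (int h = 0; h < Hc; h++)
                for (int w = 0; w < Wc; w++)
                    img[(h + c / kw) * W + (w + c % kw)] += col[(c * Hc + h) * Wc + w];
    }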
diff --git a/modules/dnn/src/layers/op_im2col.hpp b/modules/dnn/src/layers/op_im2col.hpp
index 488fab30f..e3685fa11 100644
--- a/modules/dnn/src/layers/op_im2col.hpp
+++ b/modules/dnn/src/layers/op_im2col.hpp
@@ -49,264 +49,15 @@ namespace cv
 namespace dnn
 {
 
-template<typename Dtype>
-class im2col_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col);
 
-    im2col_CpuPBody() {}
-public:
-
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2col_CpuPBody<Dtype> t;
-
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.channels_col), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        for (int c = r.start; c < r.end; ++c)
-        {
-            int w_offset = c % kernel_w;
-            int h_offset = (c / kernel_w) % kernel_h;
-            int c_im = c / kernel_h / kernel_w;
-            for (int h = 0; h < height_col; ++h)
-            {
-                for (int w = 0; w < width_col; ++w)
-                {
-                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                        data_col[(c * height_col + h) * width_col + w] =
-                            data_im[(c_im * height + h_pad) * width + w_pad];
-                    else
-                        data_col[(c * height_col + h) * width_col + w] = 0;
-                }
-            }
-        }
-    }
-};
-
-template<typename Dtype>
-class im2row_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
-
-    im2row_CpuPBody() {}
-public:
-
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2row_CpuPBody<Dtype> t;
-
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        int dh = dilation_h, dw = dilation_w;
-        Dtype* data_col_ = data_col;
-        const Dtype* data_im_ = data_im;
-
-        for (int row = r.start; row < r.end; ++row)
-        {
-            int out_c = row % width_col;
-            int out_r = row / width_col;
-            int out_row_offset = row*kernel_h*kernel_w*channels;
-
-            int start_in_r = out_r * stride_h - pad_h;
-            int start_in_c = out_c * stride_w - pad_w;
-            int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
-            int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
-            int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
-            int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
-
-            for(int i_c = 0; i_c < channels; i_c++)
-            {
-                int channels_offset = i_c * width * height;
-                int out_ch_offset = i_c*kernel_h*kernel_w;
-                int in_r = start_in_r + start_k_r*dilation_h;
-
-                for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
-                {
-                    int row_offset = in_r*width;
-                    int out_col_offset = k_r*kernel_w;
-                    int in_c = start_in_c + start_k_c*dilation_w;
-
-                    for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
-                    {
-                        int in_index = channels_offset + row_offset + in_c;
-
-                        int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
-
-                        data_col_[out_index] = data_im_[in_index];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template<typename Dtype>
-class col2im_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_col;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    Dtype* data_im;
-    int height_col, width_col;
-
-    col2im_CpuPBody() {}
-
-public:
-
-    static void run(const Dtype* data_col,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    Dtype* data_im)
-    {
-        //TODO: single-threaded version switch
-
-        col2im_CpuPBody<Dtype> t;
-        t.data_col = data_col;
-        t.data_im = data_im;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-        int img_total = channels * height * width;
-
-        cv::parallel_for_(Range(0, img_total), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        const Dtype* data_col_ = data_col;
-        Dtype* data_im_ = data_im;
-        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
-        int coeff_w_col = (1 - stride_w * height_col * width_col);
-        for (int index = r.start; index < r.end; index++)
-        {
-            Dtype val = 0;
-            int w = index % width + pad_w;
-            int h = (index / width) % height + pad_h;
-            int c = index / (width * height);
-
-            // compute the start and end of the output
-            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            int w_col_end = std::min(w / stride_w + 1, width_col);
-            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            int h_col_end = std::min(h / stride_h + 1, height_col);
-
-            // equivalent implementation
-            int offset =
-                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
-
-            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-                }
-            }
-            data_im_[index] = val;
-        }
-    }
-};
-
-//single-threaded version
-template<typename Dtype>
-void col2im_cpu(const Dtype* data_col,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int dilation_h, int dilation_w,
-                Dtype* data_im)
-{
-    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-
-    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
-
-    for (int c = 0; c < channels_col; ++c)
-    {
-        int w_offset = c % kernel_w;
-        int h_offset = (c / kernel_w) % kernel_h;
-        int c_im = c / kernel_h / kernel_w;
-
-        for (int h = 0; h < height_col; ++h)
-        {
-            for (int w = 0; w < width_col; ++w)
-            {
-                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                    data_im[(c_im * height + h_pad) * width + w_pad] +=
-                        data_col[(c * height_col + h) * width_col + w];
-            }
-        }
-    }
-}
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf);
 
 }
 }