mirror of
https://github.com/opencv/opencv_contrib.git
synced 2025-10-23 18:09:25 +08:00
Blobs reuse improvement (#1205)
* Reuse deep learning output blobs * Changed order for iterating through blobs while seeking memory. Refactored a little.
This commit is contained in:

committed by
Vadim Pisarevsky

parent
1c8809ff7d
commit
09b73b2dc7
@@ -369,6 +369,21 @@ namespace dnn //! This namespace is used for dnn module functionality.
|
||||
CV_WRAP void getMemoryConsumption(const int layerId,
|
||||
const MatShape& netInputShape,
|
||||
size_t& weights, size_t& blobs) const;
|
||||
|
||||
/** @brief Computes the number of bytes required to store
|
||||
* all weights and intermediate blobs for each layer.
|
||||
* @param netInputShapes vector of shapes for all net inputs.
|
||||
* @param layerIds output vector to save layer IDs.
|
||||
* @param weights output parameter to store resulting bytes for weights.
|
||||
* @param blobs output parameter to store resulting bytes for intermediate blobs.
|
||||
*/
|
||||
CV_WRAP void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
|
||||
std::vector<int>& layerIds, std::vector<size_t>& weights,
|
||||
std::vector<size_t>& blobs) const;
|
||||
/** @overload */
|
||||
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
|
||||
std::vector<int>& layerIds, std::vector<size_t>& weights,
|
||||
std::vector<size_t>& blobs) const;
|
||||
private:
|
||||
|
||||
struct Impl;
|
||||
|
@@ -2,6 +2,7 @@
|
||||
typedef dnn::DictValue LayerId;
|
||||
typedef std::vector<dnn::MatShape> vector_MatShape;
|
||||
typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;
|
||||
typedef std::vector<size_t> vector_size_t;
|
||||
|
||||
template<>
|
||||
bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
|
||||
|
@@ -55,6 +55,22 @@ using std::map;
|
||||
using std::make_pair;
|
||||
using std::set;
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef std::vector<MatShape> ShapesVec;
|
||||
|
||||
struct LayerShapes
|
||||
{
|
||||
ShapesVec in, out, internal;
|
||||
// No guarantees that layer which support in-place computations
|
||||
// will be computed in-place (input.data_ptr == output.data_ptr).
|
||||
// If layer said that it could work in-place and layers after it
|
||||
// no longer use input blob, we'll set output = input.
|
||||
bool supportInPlace;
|
||||
LayerShapes() {supportInPlace = false;}
|
||||
};
|
||||
}
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace dnn
|
||||
@@ -154,6 +170,11 @@ struct LayerPin
|
||||
{
|
||||
return (lid == r.lid && oid == r.oid);
|
||||
}
|
||||
|
||||
bool operator<(const LayerPin &r) const
|
||||
{
|
||||
return lid < r.lid || lid == r.lid && oid < r.oid;
|
||||
}
|
||||
};
|
||||
|
||||
struct LayerData
|
||||
@@ -219,16 +240,222 @@ private:
|
||||
std::vector<String> outNames;
|
||||
};
|
||||
|
||||
struct Net::Impl
|
||||
struct BlobManager
|
||||
{
|
||||
typedef std::vector<MatShape> ShapesVec;
|
||||
struct LayerShapes
|
||||
public:
|
||||
// Increase references counter to layer output.
|
||||
void addReference(const LayerPin& lp)
|
||||
{
|
||||
ShapesVec in, out, internal;
|
||||
bool inplace;
|
||||
LayerShapes() {inplace = false;}
|
||||
std::map<LayerPin, int>::iterator it = refCounter.find(lp);
|
||||
if (it == refCounter.end())
|
||||
refCounter[lp] = 1;
|
||||
else
|
||||
it->second += 1;
|
||||
}
|
||||
|
||||
void addReferences(const std::vector<LayerPin>& pins)
|
||||
{
|
||||
for (int i = 0; i < pins.size(); i++)
|
||||
{
|
||||
addReference(pins[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns number of references to allocated memory that used in specific
|
||||
// layer blob.
|
||||
int numReferences(const LayerPin& lp)
|
||||
{
|
||||
std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
|
||||
CV_Assert(mapIt != reuseMap.end());
|
||||
LayerPin memHost = mapIt->second;
|
||||
|
||||
std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
|
||||
CV_Assert(refIt != refCounter.end());
|
||||
return refIt->second;
|
||||
}
|
||||
|
||||
// Reuse data allocated in <host> inside the <user> blob.
|
||||
void reuse(const LayerPin& host, const LayerPin& user)
|
||||
{
|
||||
CV_Assert(reuseMap.find(user) == reuseMap.end());
|
||||
CV_Assert(reuseMap.find(host) != reuseMap.end());
|
||||
LayerPin memHost = reuseMap[host];
|
||||
reuseMap[user] = memHost;
|
||||
if (refCounter.find(memHost) != refCounter.end())
|
||||
{
|
||||
std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
|
||||
if (userRefIt != refCounter.end())
|
||||
{
|
||||
refCounter[memHost] += userRefIt->second;
|
||||
refCounter.erase(userRefIt);
|
||||
}
|
||||
else
|
||||
refCounter[memHost] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Decrease references counter to allocated memory inside specific blob.
|
||||
void releaseReference(const LayerPin& lp)
|
||||
{
|
||||
std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
|
||||
CV_Assert(mapIt != reuseMap.end());
|
||||
|
||||
std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
|
||||
CV_Assert(refIt != refCounter.end());
|
||||
CV_Assert(refIt->second > 0);
|
||||
refIt->second -= 1;
|
||||
}
|
||||
|
||||
void releaseReferences(const std::vector<LayerPin>& pins)
|
||||
{
|
||||
for (int i = 0; i < pins.size(); i++)
|
||||
{
|
||||
releaseReference(pins[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
|
||||
{
|
||||
std::map<LayerPin, Mat>::iterator hostIt;
|
||||
std::map<LayerPin, int>::iterator refIt;
|
||||
|
||||
const int targetTotal = total(shape);
|
||||
Mat bestBlob;
|
||||
int bestBlobTotal = INT_MAX;
|
||||
LayerPin bestBlobPin;
|
||||
for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
|
||||
{
|
||||
refIt = refCounter.find(hostIt->first);
|
||||
// Use only blobs that had references before because if not,
|
||||
// it might be used as output.
|
||||
if (refIt != refCounter.end() && refIt->second == 0)
|
||||
{
|
||||
Mat& unusedBlob = hostIt->second;
|
||||
if (unusedBlob.total() >= targetTotal &&
|
||||
unusedBlob.total() < bestBlobTotal)
|
||||
{
|
||||
bestBlobPin = hostIt->first;
|
||||
bestBlob = unusedBlob;
|
||||
bestBlobTotal = unusedBlob.total();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!bestBlob.empty())
|
||||
{
|
||||
reuse(bestBlobPin, lp);
|
||||
dst = Mat(shape, CV_32F, bestBlob.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
dst.create(shape, CV_32F);
|
||||
addHost(lp, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
|
||||
std::vector<LayerPin>& pinsForInternalBlobs)
|
||||
{
|
||||
pinsForInternalBlobs.clear();
|
||||
|
||||
std::vector<Mat>& outputBlobs = ld.outputBlobs,
|
||||
&internalBlobs = ld.internals;
|
||||
|
||||
const ShapesVec& outShapes = layerShapes.out,
|
||||
internalShapes = layerShapes.internal;
|
||||
|
||||
outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob
|
||||
internalBlobs.resize(internalShapes.size());
|
||||
|
||||
CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
|
||||
|
||||
// Check that layer could work in-place.
|
||||
bool inPlace = false;
|
||||
if (layerShapes.supportInPlace)
|
||||
{
|
||||
if (ld.inputBlobs.size() == 1)
|
||||
{
|
||||
// Get number of references to the input memory.
|
||||
int numRef = numReferences(ld.inputBlobsId[0]);
|
||||
// True if the current layer is the one and only consumer of this blob.
|
||||
inPlace = numRef == 1;
|
||||
}
|
||||
}
|
||||
|
||||
ShapesVec shapes(outShapes);
|
||||
shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
|
||||
std::vector<Mat*> blobs;
|
||||
for(int i = 0; i < outputBlobs.size(); i++)
|
||||
{
|
||||
blobs.push_back(&outputBlobs[i]);
|
||||
}
|
||||
|
||||
for(int i = 0; i < internalBlobs.size(); i++)
|
||||
{
|
||||
blobs.push_back(&internalBlobs[i]);
|
||||
if (total(internalShapes[i]))
|
||||
{
|
||||
pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
|
||||
}
|
||||
}
|
||||
|
||||
addReferences(pinsForInternalBlobs);
|
||||
|
||||
std::map<int, std::vector<int> > idxSizes;
|
||||
for(int i = 0; i < shapes.size(); i++)
|
||||
{
|
||||
idxSizes[total(shapes[i])].push_back(i);
|
||||
}
|
||||
|
||||
std::map<int, std::vector<int> >::reverse_iterator it;
|
||||
for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
|
||||
{
|
||||
for(int j = 0; j < it->second.size(); j++)
|
||||
{
|
||||
int index = it->second[j];
|
||||
if (total(shapes[index]))
|
||||
{
|
||||
LayerPin blobPin(ld.id, index);
|
||||
if (index < outShapes.size() && inPlace)
|
||||
{
|
||||
CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
|
||||
ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
|
||||
reuse(ld.inputBlobsId[0], blobPin);
|
||||
}
|
||||
else
|
||||
{
|
||||
reuseOrCreate(shapes[index], blobPin, *blobs[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clear internal state. Calls before an every reallocation.
|
||||
void reset()
|
||||
{
|
||||
refCounter.clear();
|
||||
reuseMap.clear();
|
||||
memHosts.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
// Register allocated memory.
|
||||
void addHost(const LayerPin& lp, const Mat& mat)
|
||||
{
|
||||
CV_Assert(memHosts.find(lp) == memHosts.end());
|
||||
reuseMap[lp] = lp;
|
||||
memHosts[lp] = mat;
|
||||
}
|
||||
|
||||
std::map<LayerPin, int> refCounter;
|
||||
// Maps pin to origin blob (for whom memory was allocated firstly).
|
||||
// For origin blobs key == value.
|
||||
std::map<LayerPin, LayerPin> reuseMap;
|
||||
std::map<LayerPin, Mat> memHosts;
|
||||
};
|
||||
|
||||
struct Net::Impl
|
||||
{
|
||||
typedef std::map<int, LayerShapes> LayersShapesMap;
|
||||
typedef std::map<int, LayerData> MapIdToLayerData;
|
||||
|
||||
@@ -252,6 +479,7 @@ struct Net::Impl
|
||||
|
||||
MapIdToLayerData layers;
|
||||
std::map<String, int> layerNameToId;
|
||||
BlobManager blobManager;
|
||||
|
||||
int lastLayerId;
|
||||
|
||||
@@ -469,37 +697,11 @@ struct Net::Impl
|
||||
LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
|
||||
|
||||
CV_Assert(layerShapesIt != layersShapes.end());
|
||||
const ShapesVec& outShapes = layerShapesIt->second.out;
|
||||
CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
|
||||
|
||||
ld.outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob
|
||||
for(int i = 0; i < outShapes.size(); i++)
|
||||
{
|
||||
if (shape(ld.outputBlobs[i]) != outShapes[i])
|
||||
{
|
||||
if (layerShapesIt->second.inplace)
|
||||
{
|
||||
CV_Assert(ld.inputBlobs.size() == ld.outputBlobs.size());
|
||||
CV_Assert(ld.inputBlobs[i]->total() == total(outShapes[i]));
|
||||
ld.outputBlobs[i] = ld.inputBlobs[i]->reshape(1, outShapes[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
ld.outputBlobs[i].create(outShapes[i], CV_32F);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const ShapesVec& intShapes = layerShapesIt->second.internal;
|
||||
ld.internals.resize(intShapes.size());
|
||||
for(int i = 0; i < intShapes.size(); i++)
|
||||
{
|
||||
if (shape(ld.internals[i]) != intShapes[i] && total(intShapes[i]))
|
||||
ld.internals[i].create(intShapes[i], CV_32F);
|
||||
}
|
||||
std::vector<LayerPin> pinsForInternalBlobs;
|
||||
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
|
||||
|
||||
Ptr<Layer> layerPtr = ld.getLayerInstance();
|
||||
//try
|
||||
{
|
||||
layerPtr->finalize(ld.inputBlobs, ld.outputBlobs);
|
||||
#if 0
|
||||
@@ -512,10 +714,10 @@ struct Net::Impl
|
||||
std::cout << "\n";
|
||||
#endif
|
||||
}
|
||||
/*catch (const cv::Exception &err)
|
||||
{
|
||||
CV_RETHROW_ERROR(err, format("The following error occurred while making allocate() for layer \"%s\": %s", ld.name.c_str(), err.err.c_str()));
|
||||
}*/
|
||||
|
||||
// After allocation of layer, we decrease counters to it's input blobs.
|
||||
blobManager.releaseReferences(ld.inputBlobsId);
|
||||
blobManager.releaseReferences(pinsForInternalBlobs);
|
||||
|
||||
ld.flag = 1;
|
||||
}
|
||||
@@ -536,6 +738,13 @@ struct Net::Impl
|
||||
LayersShapesMap layersShapes;
|
||||
getLayersShapes(inputShapes, layersShapes);
|
||||
|
||||
blobManager.reset();
|
||||
for (it = layers.begin(); it != layers.end(); ++it)
|
||||
{
|
||||
const LayerData& ld = it->second;
|
||||
blobManager.addReferences(ld.inputBlobsId);
|
||||
}
|
||||
|
||||
for (it = layers.begin(); it != layers.end(); it++)
|
||||
{
|
||||
int lid = it->first;
|
||||
@@ -609,7 +818,7 @@ struct Net::Impl
|
||||
ShapesVec& os = inOutShapes[id].out;
|
||||
ShapesVec& ints = inOutShapes[id].internal;
|
||||
int requiredOutputs = layers[id].requiredOutputs.size();
|
||||
inOutShapes[id].inplace =
|
||||
inOutShapes[id].supportInPlace =
|
||||
layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
|
||||
}
|
||||
|
||||
@@ -718,9 +927,13 @@ void Net::setBlob(String outputName, const Mat &blob_)
|
||||
LayerData &ld = impl->layers[pin.lid];
|
||||
ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
|
||||
MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
|
||||
bool oldShape = prevShape == shape(blob_);
|
||||
if (oldShape)
|
||||
blob_.copyTo(ld.outputBlobs[pin.oid]);
|
||||
else
|
||||
ld.outputBlobs[pin.oid] = blob_.clone();
|
||||
|
||||
impl->netWasAllocated = impl->netWasAllocated && prevShape == shape(blob_);
|
||||
impl->netWasAllocated = impl->netWasAllocated && oldShape;
|
||||
}
|
||||
|
||||
Mat Net::getBlob(String outputName)
|
||||
@@ -827,10 +1040,10 @@ std::vector<int> Net::getUnconnectedOutLayers() const
|
||||
return layersIds;
|
||||
}
|
||||
|
||||
void Net::getLayersShapes(const Net::Impl::ShapesVec& netInputShapes,
|
||||
void Net::getLayersShapes(const ShapesVec& netInputShapes,
|
||||
std::vector<int>* layersIds,
|
||||
std::vector<Net::Impl::ShapesVec>* inLayersShapes,
|
||||
std::vector<Net::Impl::ShapesVec>* outLayersShapes) const
|
||||
std::vector<ShapesVec>* inLayersShapes,
|
||||
std::vector<ShapesVec>* outLayersShapes) const
|
||||
{
|
||||
if ((layersIds || inLayersShapes || outLayersShapes) == false)
|
||||
return;
|
||||
@@ -856,29 +1069,29 @@ void Net::getLayersShapes(const Net::Impl::ShapesVec& netInputShapes,
|
||||
|
||||
void Net::getLayersShapes(const MatShape& netInputShape,
|
||||
std::vector<int>* layerIds,
|
||||
std::vector<Net::Impl::ShapesVec>* inLayersShapes,
|
||||
std::vector<Net::Impl::ShapesVec>* outLayersShapes) const
|
||||
std::vector<ShapesVec>* inLayersShapes,
|
||||
std::vector<ShapesVec>* outLayersShapes) const
|
||||
{
|
||||
getLayersShapes(Net::Impl::ShapesVec(1, netInputShape),
|
||||
getLayersShapes(ShapesVec(1, netInputShape),
|
||||
layerIds, inLayersShapes, outLayersShapes);
|
||||
}
|
||||
|
||||
void Net::getLayerShapes(const MatShape& netInputShape,
|
||||
const int layerId,
|
||||
Net::Impl::ShapesVec* inLayerShapes,
|
||||
Net::Impl::ShapesVec* outLayerShapes) const
|
||||
ShapesVec* inLayerShapes,
|
||||
ShapesVec* outLayerShapes) const
|
||||
{
|
||||
getLayerShapes(Net::Impl::ShapesVec(1, netInputShape),
|
||||
getLayerShapes(ShapesVec(1, netInputShape),
|
||||
layerId, inLayerShapes, outLayerShapes);
|
||||
|
||||
}
|
||||
|
||||
void Net::getLayerShapes(const Net::Impl::ShapesVec& netInputShapes,
|
||||
void Net::getLayerShapes(const ShapesVec& netInputShapes,
|
||||
const int layerId,
|
||||
Net::Impl::ShapesVec* inLayerShapes,
|
||||
Net::Impl::ShapesVec* outLayerShapes) const
|
||||
ShapesVec* inLayerShapes,
|
||||
ShapesVec* outLayerShapes) const
|
||||
{
|
||||
Impl::LayerShapes shapes;
|
||||
LayerShapes shapes;
|
||||
impl->getLayerShapes(netInputShapes, layerId, shapes);
|
||||
if (inLayerShapes)
|
||||
*inLayerShapes = shapes.in;
|
||||
@@ -915,7 +1128,7 @@ int64 Net::getFLOPS(const int layerId,
|
||||
Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
|
||||
CV_Assert(layer != impl->layers.end());
|
||||
|
||||
Impl::LayerShapes shapes;
|
||||
LayerShapes shapes;
|
||||
impl->getLayerShapes(netInputShapes, layerId, shapes);
|
||||
|
||||
return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
|
||||
@@ -986,26 +1199,14 @@ void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
|
||||
size_t& weights, size_t& blobs) const
|
||||
{
|
||||
std::vector<int> layerIds;
|
||||
std::vector<std::vector<MatShape> > outLayerShapes;
|
||||
|
||||
getLayersShapes(netInputShapes, &layerIds, 0, &outLayerShapes);
|
||||
std::vector<size_t> w, b;
|
||||
getMemoryConsumption(netInputShapes, layerIds, w, b);
|
||||
|
||||
weights = blobs = 0;
|
||||
for(int i = 0; i < layerIds.size(); i++)
|
||||
{
|
||||
Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
|
||||
CV_Assert(layer != impl->layers.end());
|
||||
|
||||
for(int j = 0; j < layer->second.params.blobs.size(); j++)
|
||||
{
|
||||
const Mat& weightsBlob = layer->second.params.blobs[j];
|
||||
weights += weightsBlob.total()*weightsBlob.elemSize();
|
||||
}
|
||||
|
||||
for(int j = 0; j < outLayerShapes[i].size(); j++)
|
||||
{
|
||||
blobs += total(outLayerShapes[i][j]) * sizeof(float);
|
||||
}
|
||||
weights += w[i];
|
||||
blobs += b[i];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1024,6 +1225,47 @@ void Net::getMemoryConsumption(const MatShape& netInputShape,
|
||||
weights, blobs);
|
||||
}
|
||||
|
||||
void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
|
||||
std::vector<int>& layerIds, std::vector<size_t>& weights,
|
||||
std::vector<size_t>& blobs) const
|
||||
{
|
||||
layerIds.clear();
|
||||
weights.clear();
|
||||
blobs.clear();
|
||||
|
||||
std::vector<std::vector<MatShape> > outLayerShapes;
|
||||
|
||||
getLayersShapes(netInputShapes, &layerIds, 0, &outLayerShapes);
|
||||
|
||||
for(int i = 0; i < layerIds.size(); i++)
|
||||
{
|
||||
int w = 0, b = 0;
|
||||
Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
|
||||
CV_Assert(layer != impl->layers.end());
|
||||
|
||||
for(int j = 0; j < layer->second.params.blobs.size(); j++)
|
||||
{
|
||||
const Mat& weightsBlob = layer->second.params.blobs[j];
|
||||
w += weightsBlob.total()*weightsBlob.elemSize();
|
||||
}
|
||||
|
||||
for(int j = 0; j < outLayerShapes[i].size(); j++)
|
||||
{
|
||||
b += total(outLayerShapes[i][j]) * sizeof(float);
|
||||
}
|
||||
|
||||
weights.push_back(w);
|
||||
blobs.push_back(b);
|
||||
}
|
||||
}
|
||||
|
||||
void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
|
||||
std::vector<size_t>& weights, std::vector<size_t>& blobs) const
|
||||
{
|
||||
getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
|
||||
weights, blobs);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Importer::~Importer() {}
|
||||
|
@@ -30,6 +30,15 @@ public:
|
||||
epsilon = params.get<float>("eps", 1E-5);
|
||||
}
|
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs,
|
||||
const int requiredOutputs,
|
||||
std::vector<MatShape> &outputs,
|
||||
std::vector<MatShape> &internals) const
|
||||
{
|
||||
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
|
||||
return true;
|
||||
}
|
||||
|
||||
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
|
||||
{
|
||||
CV_Assert(blobs.size() >= 2);
|
||||
|
@@ -61,7 +61,12 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) {}
|
||||
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
|
||||
{
|
||||
for (int i = 0, n = outputs.size(); i < n; ++i)
|
||||
if (outputs[i].data != inputs[i]->data)
|
||||
inputs[i]->copyTo(outputs[i]);
|
||||
}
|
||||
};
|
||||
|
||||
Ptr<BlankLayer> BlankLayer::create(const LayerParams& params)
|
||||
|
@@ -20,17 +20,17 @@ public:
|
||||
class PBody : public cv::ParallelLoopBody
|
||||
{
|
||||
Func &func;
|
||||
Dtype *data;
|
||||
Dtype *src, *dst;
|
||||
public:
|
||||
|
||||
PBody(Mat &mat, Func &func_) :
|
||||
func(func_), data(mat.ptr<Dtype>())
|
||||
PBody(Mat &src, Mat &dst, Func &func_) :
|
||||
func(func_), src(src.ptr<Dtype>()), dst(dst.ptr<Dtype>())
|
||||
{}
|
||||
|
||||
void operator()(const Range &r) const
|
||||
{
|
||||
for (int i = r.start; i < r.end; i++)
|
||||
data[i] = func(data[i]);
|
||||
dst[i] = func(src[i]);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -49,13 +49,13 @@ public:
|
||||
{
|
||||
for (size_t i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
const Mat &src = *inputs[i];
|
||||
Mat &src = *inputs[i];
|
||||
Mat &dst = outputs[i];
|
||||
CV_Assert(src.ptr() == dst.ptr() && src.isContinuous());
|
||||
CV_Assert(src.isContinuous() && dst.isContinuous());
|
||||
|
||||
Range sizeRange = Range(0, dst.total());
|
||||
CV_Assert(src.type() == CV_32F);
|
||||
PBody<float> body(dst, func);
|
||||
PBody<float> body(src, dst, func);
|
||||
if( run_parallel )
|
||||
cv::parallel_for_(sizeRange, body);
|
||||
else
|
||||
|
@@ -178,7 +178,7 @@ public:
|
||||
for (size_t i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
Mat srcBlob = *inputs[i];
|
||||
MatShape inputShape = shape(srcBlob);
|
||||
MatShape inputShape = shape(srcBlob), outShape = shape(outputs[i]);
|
||||
|
||||
if (performReordering)
|
||||
{
|
||||
@@ -204,6 +204,11 @@ public:
|
||||
}
|
||||
internals[i].copyTo(outputs[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (outputs[i].data != srcBlob.data)
|
||||
srcBlob.reshape(1, outShape).copyTo(outputs[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -27,6 +27,15 @@ public:
|
||||
hasBias = params.get<bool>("bias_term", false);
|
||||
}
|
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs,
|
||||
const int requiredOutputs,
|
||||
std::vector<MatShape> &outputs,
|
||||
std::vector<MatShape> &internals) const
|
||||
{
|
||||
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
|
||||
return true;
|
||||
}
|
||||
|
||||
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
|
||||
{
|
||||
CV_Assert(blobs.size() == 1 + hasBias);
|
||||
|
@@ -72,16 +72,16 @@ public:
|
||||
{
|
||||
CV_Assert(inputs.size() == 1);
|
||||
|
||||
outputs.resize(outputsCount >= 0 ? outputsCount : requiredOutputs,
|
||||
inputs[0]);
|
||||
|
||||
return false;
|
||||
Layer::getMemoryShapes(inputs, outputsCount >= 0 ? outputsCount : requiredOutputs,
|
||||
outputs, internals);
|
||||
return true;
|
||||
}
|
||||
|
||||
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
|
||||
{
|
||||
for (size_t i = 0; i < outputs.size(); i++)
|
||||
{
|
||||
if (outputs[i].data != inputs[0]->data)
|
||||
inputs[0]->copyTo(outputs[i]);
|
||||
}
|
||||
}
|
||||
|
@@ -121,6 +121,10 @@ TEST(Reproducibility_FCN, Accuracy)
|
||||
if (sample.size() != inputSize)
|
||||
resize(sample, sample, inputSize);
|
||||
|
||||
std::vector<int> layerIds;
|
||||
std::vector<size_t> weights, blobs;
|
||||
net.getMemoryConsumption(shape(1,3,227,227), layerIds, weights, blobs);
|
||||
|
||||
net.setBlob(".data", blobFromImage(sample, 1.));
|
||||
net.forward();
|
||||
|
||||
|
Reference in New Issue
Block a user