tinyengine/code_generator/GeneralMemoryScheduler.py

# ----------------------------------------------------------------------
# Project: TinyEngine
# Title:   GeneralMemoryScheduler.py
#
# Reference papers:
#  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
#  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
#  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
# Contact authors:
#  - Wei-Ming Chen, wmchen@mit.edu
#  - Wei-Chen Wang, wweichen@mit.edu
#  - Ji Lin, jilin@mit.edu
#  - Ligeng Zhu, ligeng@mit.edu
#  - Song Han, songhan@mit.edu
#
# Target ISA:  ARMv7E-M
# ----------------------------------------------------------------------

from .allocator.firstFit import FirstFit
from .constant import (
    FUSE_SGD_UPDATE_STR,
    FUSHION_CONFIG,
    INFERECE_WEIGHT_SIZE,
    TTYPE_INFERNECE,
    TTYPE_STATIC_WEIGHT,
    TTYPE_TRAINING_ACTIVATION,
    TTYPE_TRAINING_GRADIENT,
    TTYPE_TRAINING_WEIGHTS,
)


class GeneralMemoryScheduler:
    def __init__(
        self,
        layer,
        tflite_op=False,
        dummy_address=False,
        memory_limit=10 * 1024 * 1024,
        inplace=True,
        outputTables=None,
        mem_visual_path="codegen/allocation.png",
        VisaulizeTrainable=True,
        sort_by_lifetime=False,
    ):
        self.layer = layer
        self.heads = 0
        self.buffers = {
            "input_output": 0,
            "residual": 0,
            "im2col": 0,
            "kernel": 0,
            "feature": 0,
            "trainable": 0,
        }  # for feature pyramid
        # overall memory info
        self.peakmem = 0
        self.flash = 0
        self.bias = 0
        self.scale = 0
        self.code = 0
        self.allocator = FirstFit(memory_limit, sort_by_lifetime)
        self.outputTables = outputTables
        self.USE_INPLACE = inplace
        self.mem_visual_path = mem_visual_path
        self.tflite_op = tflite_op
        self.dummy_address = dummy_address
        self.VisaulizeTrainable = VisaulizeTrainable

        # for showing layer-wise memory usage
        self.layermem = []

    def _isTranable(self, name):
        for o in self.outputTables:
            if isinstance(name, str) and o.name in name:
                return True
        return False

    def allocateMemory(self):
        # assign the same graph index for inplace operations
        # note: we need to handle stride == 2 for int8 depthwise to save memory
        if self.USE_INPLACE:
            for i, op in enumerate(self.layer):
                if op.params["op"] == "DEPTHWISE_CONV_2D" and op.params["input_dtype"] == "int8" and not self.tflite_op:
                    # set the idx of output and next layer input
                    previous_output_idx = op.output_tensors[0].graph_idx
                    op.output_tensors[0].graph_idx = op.input_tensors[0].graph_idx
                    if (
                        i + 1 < len(self.layer)
                        and len(self.layer[i + 1].input_tensors) > 0
                        and str(self.layer[i + 1].input_tensors[0].graph_idx) == str(previous_output_idx)
                    ):
                        self.layer[i + 1].input_tensors[0].graph_idx = op.input_tensors[0].graph_idx
                    # update following ops' tensors
                    for following_idx in range(i, len(self.layer)):
                        for cnt, inp_tensor in enumerate(self.layer[following_idx].input_tensors):
                            if str(inp_tensor.graph_idx) == str(previous_output_idx):
                                inp_tensor.graph_idx = op.input_tensors[0].graph_idx
                if (
                    op.params["op"] == "TRANSPOSE_CONV_2D"
                    and op.params["group"] == op.params["input_c"]
                    and op.params["group"] == op.params["output_c"]
                    and not self.tflite_op
                    and op.params["stride_h"] == 1
                    and op.params["stride_w"] == 1
                ):
                    # set the idx of output and next layer input
                    previous_output_idx = op.output_tensors[0].graph_idx
                    op.output_tensors[0].graph_idx = op.input_tensors[0].graph_idx
                    # update following ops' tensors
                    for following_idx in range(i, len(self.layer)):
                        for cnt, inp_tensor in enumerate(self.layer[following_idx].input_tensors):
                            if inp_tensor.graph_idx == previous_output_idx:
                                inp_tensor.graph_idx = op.input_tensors[0].graph_idx
                                # set the name in params which will be used later
                                if (
                                    cnt == 1
                                    and "CONV" in self.layer[following_idx].params["op"]
                                    and isinstance(self.layer[following_idx].params["weight_value"], str)
                                ):
                                    self.layer[following_idx].params["weight_value"] = op.input_tensors[0].graph_idx

        num_layers = len(self.layer)
        # add all trainable tensors as one tensor
        length_model = len(self.layer)
        trainable = 0
        weight_size = 0
        bias_size = 0
        for out_t in self.outputTables:
            if "bias" in out_t.name:
                dtype_multiplier = 4
                trainable += int(out_t.len * dtype_multiplier)
                bias_size += int(out_t.len * dtype_multiplier)
            elif "weight" in out_t.name:
                dtype_multiplier = INFERECE_WEIGHT_SIZE
                # find the conv2d owning the tensor
                conv_2d_op = None
                for lay in self.layer:
                    if "weight_name" in lay.params and out_t.name in lay.params["weight_name"]:
                        conv_2d_op = lay
                        break
                assert conv_2d_op is not None
                # check if it is partial
                if "first_k_channel" in conv_2d_op.params and conv_2d_op.params["first_k_channel"] is not None:
                    trainable += int(
                        out_t.len
                        * dtype_multiplier
                        * conv_2d_op.params["first_k_channel"]
                        / conv_2d_op.params["input_c"]
                    )
                    weight_size += int(
                        out_t.len
                        * dtype_multiplier
                        * conv_2d_op.params["first_k_channel"]
                        / conv_2d_op.params["input_c"]
                    )
                else:
                    trainable += int(out_t.len * dtype_multiplier)
                    weight_size += int(out_t.len * dtype_multiplier)
            else:
                pass
        if self.VisaulizeTrainable:
            self.allocator.addTensor(0, length_model, trainable, type=TTYPE_STATIC_WEIGHT)

        all_t_size = 0
        # go through all tensors in the model
        for i, op in enumerate(self.layer):
            # get all unallocated tensors for this layer
            unallocated_tensors = []
            for t in op.input_tensors:
                if t.allocator_idx is None:
                    unallocated_tensors.append(t)
            for cnt, t in enumerate(op.output_tensors):
                if (
                    cnt == 0
                    and not (
                        self.USE_INPLACE
                        and op.params["op"] == "DEPTHWISE_CONV_2D"
                        and op.params["input_dtype"] == "int8"
                        and not self.tflite_op
                    )
                    and not (
                        self.USE_INPLACE
                        and op.params["op"] == "TRANSPOSE_CONV_2D"
                        and op.params["group"] == op.params["input_c"]
                        and op.params["group"] == op.params["output_c"]
                        and not self.tflite_op
                        and op.params["stride_h"] == 1
                        and op.params["stride_w"] == 1
                    )
                ):
                    if t.allocator_idx is None:
                        unallocated_tensors.append(t)
                # assume seocnd outputs will not be inplace updated
                else:
                    if t.allocator_idx is None:
                        unallocated_tensors.append(t)

            # add each tensor
            training_start_idx = _find_training_idx(layers=self.layer)
            for cnt, t in enumerate(unallocated_tensors):
                start_idx = i
                # TODO: this is temp solution
                if training_start_idx > i and "out_multiply" not in t.graph_idx:
                    end_idx = i + 1 if i == 0 else num_layers
                else:
                    end_idx = i + 1
                for idx in range(start_idx + 1, num_layers):
                    for input_t in self.layer[idx].input_tensors:
                        if str(t.graph_idx) == str(input_t.graph_idx):
                            end_idx = idx + 1
                # check if this is output
                ttype = TTYPE_INFERNECE
                if self.outputTables is not None and not FUSHION_CONFIG[FUSE_SGD_UPDATE_STR]:
                    for o in self.outputTables:
                        if o.idx in t.graph_idx:
                            end_idx = len(self.layer)
                            all_t_size += o.len
                            ttype = TTYPE_TRAINING_GRADIENT

                # for patchbased inference, we need the input tensro to be allocated in the patch inference stage
                if (
                    "is_start_of_normal_inference_block" in op.params
                    and op.params["is_start_of_normal_inference_block"]
                ):
                    if t in op.input_tensors:
                        start_idx = 0
                # add the tensor
                t.allocator_idx = self.allocator.addTensor(start_idx, end_idx, t.len(), name=t.graph_idx, type=ttype)
                # propagate the allocation to tensors with the same idx
                for j in range(i + 1, num_layers):
                    opp = self.layer[j]
                    for tt in opp.input_tensors:
                        if str(t.graph_idx) == str(tt.graph_idx):
                            tt.allocator_idx = t.allocator_idx
                    # not inplace update
                    for tt in opp.output_tensors:
                        if str(t.graph_idx) == str(tt.graph_idx):
                            tt.allocator_idx = t.allocator_idx

            # for detailed memory
            layermem = {}

            layermem["MAC"] = op.get_macs()
            layermem["activation"] = op.get_activation_size()
            layermem["scale"] = op.get_scale_size()
            layermem["runtime"] = op.get_sbuf_size()
            layermem["kernel"] = op.get_kbuf_size()
            self._enlargeBuffer("im2col", layermem["runtime"])
            self._enlargeBuffer("kernel", layermem["kernel"])

            if (
                "weight_name" in op.params
                and self._isTranable(op.params["weight_name"])
                and op.params["op"] != "TRANSPOSE_CONV_2D"
            ):
                size = int(op.get_weights_size())
                self.buffers["trainable"] += size
                layermem["trainable"] = size
                layermem["weight"] = 0
            else:
                layermem["weight"] = int(op.get_weights_size())
            if "bias_name" in op.params and self._isTranable(op.params["bias_name"]):
                size = int(op.get_bias_size())
                self.buffers["trainable"] += size
                if "trainable" in layermem:
                    layermem["trainable"] += size
                else:
                    layermem["trainable"] = size
                layermem["bias"] = 0
            else:
                layermem["bias"] = int(op.get_bias_size())
            # if it is float32 op, then their wegiths/bias should from SRAM buffers
            if op.params["input_dtype"] != "int8":
                layermem["scale"] = 0
                layermem["bias"] = 0
                layermem["weight"] = 0
            self.__increaseFlash(layermem["weight"])
            self.__increaseFlash(layermem["bias"])
            self.__increaseFlash(layermem["scale"])

            self.layermem.append(layermem)

        # assign data dtype for each tensor for visualization
        # we need to figure out training_weight and training_activation here
        # for training_weight, it should contain weights of "transpose conv"
        # then, other tensors in training can be categorized as training activation
        training_start_idx = _find_training_idx(self.layer)
        # assign every tenosrs labeled as TTYPE_INFERNECE after the index as TTYPE_TRAINING_ACTIVATION
        for r in self.allocator.rectangles:
            if r["type"] == TTYPE_INFERNECE and r["end"] > training_start_idx:
                r["type"] = TTYPE_TRAINING_ACTIVATION
        # for each tranpose conv, find it
        for i, op in enumerate(self.layer):
            if op.params["op"] == "TRANSPOSE_CONV_2D":
                # if any tenosr used by this layer
                for r in self.allocator.rectangles:
                    if r["end"] <= training_start_idx:
                        continue
                    if r["name"] == op.params["weight_name"]:
                        r["type"] = TTYPE_TRAINING_WEIGHTS

        # find out int8 inplace depthwise conv and stride == 2
        for i, op in enumerate(self.layer):
            if (
                op.params["op"] == "DEPTHWISE_CONV_2D"
                and op.params["input_dtype"] == "int8"
                and op.params["stride_h"] == op.params["stride_w"] == 2
            ):
                if op.input_tensors[0].allocator_idx == op.output_tensors[0].allocator_idx:
                    self.allocator.rectangles[op.input_tensors[0].allocator_idx]["stride2_inplace_idx"] = i

        # Reorder the rectangles to decide which tensor needs to be scheduled first
        self.allocator.sortSize()
        self.allocator.allocate()
        self.allocator.visualize(self.mem_visual_path)
        self._enlargeBuffer("input_output", self.allocator.get_peak())

        # sanity check, see if all tensors have been allocated
        for i, op in enumerate(self.layer):
            # get all unallocated tensors for this layer
            for cnt, t in enumerate(op.input_tensors):
                assert t.allocator_idx is not None
            for cnt, t in enumerate(op.output_tensors):
                assert t.allocator_idx is not None

        # assign the address according to placement
        for i, op in enumerate(self.layer):
            # get all unallocated tensors for this layer
            for cnt, t in enumerate(op.input_tensors):
                if cnt == 0:
                    op.params["input_buf_add_offset"] = self.allocator.getIdxAddress(t.allocator_idx)
                    op.params["input_buf_add"] = "front"
                elif cnt == 1:
                    op.params["input2_buf_add_offset"] = self.allocator.getIdxAddress(t.allocator_idx)
                    op.params["input2_buf_add"] = "front"
                elif cnt == 2:
                    op.params["input3_buf_add_offset"] = self.allocator.getIdxAddress(t.allocator_idx)
                    op.params["input3_buf_add"] = "front"
                op.input_tensors[cnt].buffer_name = "buffer0"
                op.input_tensors[cnt].buffer_address = self.allocator.getIdxAddress(t.allocator_idx)
            for cnt, t in enumerate(op.output_tensors):
                if cnt == 0:
                    op.params["output_buf_add_offset"] = self.allocator.getIdxAddress(t.allocator_idx)
                    op.params["output_buf_add"] = "front"
                    op.output_tensors[cnt].buffer_name = "buffer0"
                    op.output_tensors[cnt].buffer_address = self.allocator.getIdxAddress(t.allocator_idx)
                if cnt == 1:
                    op.params["output2_buf_add_offset"] = self.allocator.getIdxAddress(t.allocator_idx)
                    op.params["output2_buf_add"] = "front"
                    op.output_tensors[cnt].buffer_name = "buffer0"
                    op.output_tensors[cnt].buffer_address = self.allocator.getIdxAddress(t.allocator_idx)

        # calculate peak mem
        self.peakmem = (
            self.allocator.get_peak() + self.buffers["im2col"] + self.buffers["kernel"]  # + self.buffers["trainable"]
        )

    def dumpLayerIndex(self):
        # header
        print("-" * 14 + " Tensor Allocation Details " + "-" * 14)
        print(" #op |   operator type   | input index | output index |")
        for cnt, l in enumerate(self.layer):
            operator_num = "#" + str(cnt)
            type = str(l.params["op"])
            input_tensor = ""
            for cnt_inp, inp in enumerate(l.input_tensors):
                input_tensor += str(inp.allocator_idx)
                if cnt_inp < len(l.input_tensors) - 1:
                    input_tensor += ","
            output_tensor = str(l.output_tensors[0].allocator_idx)
            string = (
                operator_num.ljust(5)
                + "|"
                + type.ljust(19)
                + "|"
                + input_tensor.ljust(13)
                + "|"
                + output_tensor.ljust(14)
                + "|"
            )
            print(string)

    def dumpLayerMem(self):
        # header
        print(
            "----------------------------------------------------  Schedule Details ----------------------------------------------------------------"  # noqa: E501
        )
        print(
            "----------------------|                      SRAM                      ||                     Flash                      |             |"  # noqa: E501
        )
        print(
            "----------------------|  activation  |  runtime  | trainable  |  sum   ||   weight   |   bias   |  scale   |     sum     |     MAC     |"  # noqa: E501
        )

        layermem = self.layermem
        self.__dumpMemInfo(layermem)

    def __dumpMemInfo(self, layermem):
        string = "-------Schedule-------|"
        maxActive = self.buffers["input_output"]
        maxRuntime = self.buffers["im2col"] + self.buffers["kernel"]
        maxTrainable = self.buffers["trainable"]
        totalWeight = self.__sumKey(layermem, "weight")
        totalBias = self.__sumKey(layermem, "bias")
        totalScale = self.__sumKey(layermem, "scale")
        totalMAC = self.__sumKey(layermem, "MAC")
        string += str(maxActive).ljust(14) + "|"
        string += str(maxRuntime).ljust(11) + "|"
        string += str(maxTrainable).ljust(12) + "|"
        string += str(maxActive + maxRuntime + maxTrainable).ljust(8) + "||"
        string += str(totalWeight).ljust(12) + "|"
        string += str(totalBias).ljust(10) + "|"
        string += str(totalScale).ljust(10) + "|"
        string += str(totalWeight + totalBias + totalScale).ljust(13) + "|"
        string += str(totalMAC).ljust(13) + "|"
        print(string)
        for i, _ in enumerate(layermem):
            layer_info = self.layer[i].get_layer_info()
            string = ""
            string += str(i) + ":" + layer_info["op"]
            string = string.ljust(22) + "|"
            SRAM = 0
            if "activation" in layermem[i]:
                substr = (
                    str(layermem[i]["activation"]) + " (" + "{:.0%}".format(layermem[i]["activation"] / maxActive) + ")"
                )
                string += substr.ljust(14) + "|"
                SRAM += layermem[i]["activation"]
            if "runtime" in layermem[i]:
                sbuf = layermem[i]["runtime"] + layermem[i]["kernel"]
                substr = str(sbuf) + " (" + "{:.0%}".format(sbuf / maxRuntime) + ")"
                string += substr.ljust(11) + "|"
                SRAM += sbuf
            else:
                string = string.ljust(49) + "|"
            if "trainable" in layermem[i]:
                substr = (
                    str(layermem[i]["trainable"])
                    + " ("
                    + "{:.0%}".format(layermem[i]["trainable"] / maxTrainable)
                    + ")"
                )
                string += substr.ljust(12) + "|"
                SRAM += layermem[i]["trainable"]
            else:
                string = string.ljust(62) + "|"

            # SRAM end
            string += str(SRAM)
            string = string.ljust(71) + "||"
            flash = 0
            if "weight" in layermem[i]:
                substr = (
                    str(layermem[i]["weight"])
                    + " ("
                    + "{:.0%}".format(layermem[i]["weight"] / (totalWeight + 0.0001))
                    + ")"
                )
                string += str(substr).ljust(12) + "|"
                flash += layermem[i]["weight"]
            if "bias" in layermem[i]:
                substr = (
                    str(layermem[i]["bias"]) + " (" + "{:.0%}".format(layermem[i]["bias"] / (totalBias + 0.0001)) + ")"
                )
                string += str(substr).ljust(10) + "|"
                flash += layermem[i]["bias"]
            if "scale" in layermem[i]:
                substr = (
                    str(layermem[i]["scale"]) + " (" + "{:.0%}".format(layermem[i]["scale"] / totalScale + 0.0001) + ")"
                )
                string += str(substr).ljust(10) + "|"
                flash += layermem[i]["scale"]

                if flash > 0:
                    string += (
                        str(flash)
                        + " ("
                        + "{:.0%}".format(flash / (totalWeight + totalBias + totalScale + 0.0001))
                        + ")"
                    )
                    string = string.ljust(121) + "|"
            # flash end
            if "MAC" in layermem[i]:
                substr = str(layermem[i]["MAC"]) + " (" + "{:.0%}".format(layermem[i]["MAC"] / totalMAC) + ")"
                string += str(substr).ljust(13) + "|"
            print(string)

    def __sumKey(self, layers, key):
        result = 0
        for _, layer in enumerate(layers):
            if key in layer:
                result += layer[key]

        return result

    def getBuffers(self):
        return self.buffers

    # Maximum binary size: This should be updated if any change in the inference side
    # TODO: Combine with code generation to get more accurate result
    def profileResult(self):
        return self.peakmem, self.flash + self.bias + self.scale + int(self.code * 1024)

    def __increaseFlash(self, size):
        self.flash += int(size)

    def _enlargeBuffer(self, buf_str, size):
        if buf_str == "input_output" or buf_str == "residual":
            self.buffers[buf_str] = max(self.buffers[buf_str], int(size))
        else:
            if buf_str not in self.buffers:
                self.buffers[buf_str] = size
            else:
                self.buffers[buf_str] = max(self.buffers[buf_str], size)


def _find_training_idx(layers):
    idx = len(layers)
    for cnt, l in enumerate(layers):
        if l.params["op"] in ["CAST"]:
            return cnt
    return idx