# tinyengine/code_generator/codetemplate/depthwiseTemplate_bitmask.py

class depthwiseInplace_bitmask(object):
    def __init__(self, kernel_h, kernel_w, pad_h, pad_w, stride, dataflow="CHW", fp_requantize=False):
        """Record the convolution geometry and the code-generation options.

        Args:
            kernel_h: Kernel height; appears in the emitted function names and
                drives how many MAC statements are unrolled.
            kernel_w: Kernel width; used the same way as ``kernel_h``.
            pad_h: Padding along the height, baked into the emitted C as literals.
            pad_w: Padding along the width, baked into the emitted C as literals.
            stride: Convolution stride; emitted as the ``STRIDE`` macro.
            dataflow: ``"CHW"`` or ``"CWH"``; selects which generator methods
                are used when the file is rendered.
            fp_requantize: Stored unchanged; no method of this class reads it.
        """
        # Filter geometry.
        self.kernel_h, self.kernel_w = kernel_h, kernel_w
        self.pad_h, self.pad_w = pad_h, pad_w
        self.stride = stride
        # Generation options.
        self.dataflow = dataflow
        self.fp_requantize = fp_requantize
        # Target ISA written into the file banner; change it with setArch().
        self.arch = "ARMv7E-M"

    def setArch(self, arch):
        """Replace the target-ISA string that ``__genHeader`` writes into the file banner."""
        self.arch = arch

    def __genCode(self):
        """Assemble the complete text of the generated C file.

        Layout: banner/includes, prototype of the per-channel kernel, the
        entry-point function, then the per-channel kernel definition.  The
        dataflow-specific sections are only emitted for ``"CHW"`` and
        ``"CWH"``; any other ``dataflow`` value yields an entry point with an
        empty body and no kernel definition.
        """
        is_chw = self.dataflow == "CHW"
        is_cwh = self.dataflow == "CWH"

        sections = [
            self.__genHeader(),
            self.__genKernelDefine(),
            ";\n",
            self.genFuncDefine(),
            "\n{\n",
        ]

        # Body of the entry point: fill the padded buffer, then loop over channels.
        if is_chw:
            sections.append(self.__genBufferInitialization())
            sections.append(self.__genInplaceKernel())
        elif is_cwh:
            sections.append(self.__genBufferInitializationCWH())
            sections.append(self.__genInplaceKernelCWH())

        sections.append(self.__genEndStr())

        # Definition of the per-channel kernel that was prototyped above.
        if is_chw:
            sections.append("\n" + self.__genKernel())
        elif is_cwh:
            sections.append("\n" + self.__genKernelCWH())

        return "".join(sections)

    def __getFunctionName(self):
        """Return the C name of the generated entry point (also the output file stem)."""
        name_parts = (
            "depthwise_kernel",
            str(self.kernel_h),
            "x",
            str(self.kernel_w),
            "_stride",
            str(self.stride),
            "_inplace_",
            self.dataflow,
            "_fpreq_bitmask",
        )
        return "".join(name_parts)

    def __getKernelName(self):
        """Return the C name of the generated per-channel kernel function."""
        name_parts = (
            "depthwise_kernel",
            str(self.kernel_h),
            "x",
            str(self.kernel_w),
            "_stride",
            str(self.stride),
            "_inplace_kernel_",
            self.dataflow,
            "_fpreq_bitmask",
        )
        return "".join(name_parts)

    def genFuncDefine(self):
        """Return the C signature of the generated entry point.

        The text has no trailing ``;`` and no body, so callers can append
        either to obtain a prototype or a definition.
        """
        # Continuation lines of the parameter list are indented by 12 spaces.
        continuation = "\n" + " " * 12
        parameter_list = continuation.join(
            (
                "(q7_t *input, const uint16_t input_x, const uint16_t input_y,",
                "const uint16_t input_ch, const q7_t *kernel, const int32_t *bias, const int32_t *biasR,",
                "const float *scales,",
                "const int32_t output_offset, const int32_t input_offset,",
                "const int32_t output_activation_min,",
                "const int32_t output_activation_max, q7_t *output, q7_t *output_mask,",
                "const uint16_t output_x, const uint16_t output_y,",
                "const uint16_t output_ch, q15_t *runtime_buf, q7_t pad_value)",
            )
        )
        return "tinyengine_status " + self.__getFunctionName() + parameter_list

    def __genKernelDefine(self):
        """Return the C signature of the per-channel kernel (no ``;``, no body)."""
        # Each parameter line is indented by four spaces under the opening "(".
        parameter_lines = (
            "(",
            "    const uint16_t output_y, const uint16_t output_x,",
            "    const int32_t *bias, const int32_t *biasR, const q7_t *ksrc, const float *scales,",
            "    q7_t *output, q7_t *output_mask, const int mask_idx, const int32_t output_offset,",
            "    const int32_t activation_min, const int32_t activation_max,",
            "    q7_t *cols_8b_iterptr, const uint16_t column_x, int channel_offset)",
        )
        return "void " + self.__getKernelName() + "\n".join(parameter_lines)

    def __genHeader(self):
        """Return the banner comment and ``#include`` lines that open the generated file.

        The banner embeds the kernel size and ``self.arch``.
        """
        kernel_dims = str(self.kernel_h) + "x" + str(self.kernel_w)
        banner_lines = [
            "/* This file is automatically generated */",
            "/* ----------------------------------------------------------------------",
            " * Project:      TinyEngine",
            " * Description:  for sparse in-place " + kernel_dims + " depth-wise convolution (HWC->CHW->HWC)",
            " * Target ISA:  " + self.arch,
            " * Author: wmchen@mit.edu",
            " * -------------------------------------------------------------------- */",
            '#include "arm_nnsupportfunctions.h" //TODO: remove this in the future for self-contained',
            '#include "tinyengine_function.h"',
        ]
        # The header ends with a newline so the next section starts on its own line.
        return "\n".join(banner_lines) + "\n"

    def __genBufferInitialization(self):
        """Return the C prologue of the CHW entry point.

        The emitted code declares the loop counters, points ``cols_8b`` at
        ``runtime_buf`` and writes ``pad_value`` into the border of the padded
        single-channel buffer: ``pad_h`` rows at the top and bottom and
        ``pad_w`` cells on each side of every input row.  The stores are
        unrolled at generation time, one statement per padding cell.
        """
        # Unrolled stores, repeated once per padding row / column.
        border_fill = "\n        *cols_8b++ = PAD8;" * self.pad_h
        left_fill = "\n        *cols_8b++ = PAD8;//left" * self.pad_w
        right_fill = "\n        *cols_8b++ = PAD8;//right" * self.pad_w
        # Top and bottom borders share the same loop header.
        border_loop = "    for(i = 0; i < input_x + " + str(self.pad_w * 2) + "; i++){"

        return "".join(
            (
                "\n    uint16_t c,i,j;\n",
                "    q7_t *cols_8b_start = (q7_t *)runtime_buf;\n",
                "    q7_t* cols_8b = (q7_t* )cols_8b_start;\n",
                "\n",
                "    //Set padding value\n",
                "    q7_t PAD8 = pad_value;\n",
                "    /* setup the padding regions for Im2col buffers */\n",
                "    //top region: 8bit x (input_x + pad_w * 2) x pad_h: unroll by pad value\n",
                border_loop,
                border_fill,
                "\n    }\n",
                "\n",
                "    //middle regions: left and right regions\n",
                "    for(i = 0; i < input_y; i++){",
                left_fill,
                "\n        cols_8b += input_x; //skip middle",
                right_fill,
                "\n    }\n",
                "\n",
                "    //bottom region: 8bit x (input_x + pad_w * 2) x pad_h: unroll by pad value\n",
                border_loop,
                border_fill,
                "\n    }\n",
                "\n",
                "    const q7_t *src;\n",
                "    const q7_t *ksrc = kernel;\n",
            )
        )

    def __genBufferInitializationCWH(self):
        """Return the C prologue of the CWH entry point.

        Same structure as the CHW prologue with the roles of the two spatial
        axes swapped: the border loops run over ``input_y + 2 * pad_h`` and
        unroll ``pad_w`` stores, while the middle loop runs over ``input_x``
        and unrolls ``pad_h`` stores on each side of ``input_y`` cells.
        """
        border_header = "    for(i = 0; i < input_y + " + str(self.pad_h * 2) + "; i++){"
        border_stores = "\n        *cols_8b++ = PAD8;" * self.pad_w

        out = []
        emit = out.append

        emit("\n    uint16_t c,i,j;\n")
        emit("    q7_t *cols_8b_start = (q7_t *)runtime_buf;\n")
        emit("    q7_t* cols_8b = (q7_t* )cols_8b_start;\n")
        emit("\n")
        emit("    //HWC to CWH data flow\n")
        emit("    //Set padding value\n")
        emit("    q7_t PAD8 = pad_value;\n")
        emit("    /* setup the padding regions for Im2col buffers */\n")

        # Leading border.
        emit("    //top region: 8bit x (input_y + pad_h * 2) x pad_w: unroll by pad value\n")
        emit(border_header)
        emit(border_stores)
        emit("\n    }\n\n")

        # Padding before and after every line of real data.
        emit("    //middle regions: left and right regions\n")
        emit("    for(i = 0; i < input_x; i++){")
        emit("\n        *cols_8b++ = PAD8;//left" * self.pad_h)
        emit("\n        cols_8b += input_y; //skip middle")
        emit("\n        *cols_8b++ = PAD8;//right" * self.pad_h)
        emit("\n    }\n\n")

        # Trailing border.
        emit("    //bottom region: 8bit x (input_y + pad_h * 2) x pad_w: unroll by pad value\n")
        emit(border_header)
        emit(border_stores)
        emit("\n    }\n\n")

        emit("    const q7_t *src;\n")
        emit("    const q7_t *ksrc = kernel;\n")
        return "".join(out)

    def __genInplaceKernelCWH(self):
        """Return the per-channel loop of the CWH entry point.

        For every channel the emitted C copies the channel into the padded
        buffer, points ``inplace_out`` at ``input`` and calls the per-channel
        kernel, then advances the kernel weights and the input pointer.

        The per-channel call is emitted here directly: the class has no
        ``__genHandle1CH_CWH`` helper, so delegating to it raised
        ``AttributeError`` for every CWH generation.  The call mirrors the CHW
        one except that ``column_x`` receives ``input_y``, because in the CWH
        buffer each line holds ``input_y + 2 * pad_h`` cells and the CWH MAC
        code steps between lines by ``column_x + 2 * pad_h``.
        """
        indent = "    " * 2

        retString = "\n    for (c = 0; c < input_ch; c++){"
        retString += self.__genFixedLoadHWC2CWH()
        retString += self.__assignInplaceOut(2, "input")

        # One mask byte covers eight channels: move on whenever c reaches a multiple of 8.
        retString += indent + "if (c % 8 == 0 && c > 1) output_mask++;\n"
        retString += (
            indent
            + self.__getKernelName()
            + "(output_y, output_x, bias++, biasR++, ksrc, scales++, inplace_out, "
            + "output_mask, c % 8, output_offset,output_activation_min, output_activation_max,cols_8b_start, input_y, "
            + "input_ch"
            + ");\n"
        )
        # Next channel: skip this channel's weights and step to the next interleaved input value.
        retString += indent + "ksrc += " + str(self.kernel_h * self.kernel_w) + ";\n"
        retString += indent + "input++;\n"

        retString += "\n    }\n"
        return retString

    def __genFixedLoadHWC2CWH(self):
        """Return the C code that copies one channel into the padded CWH buffer.

        The emitted code skips the leading border (``pad_w`` lines of
        ``input_y + 2 * pad_h`` cells), then for each ``i < input_x`` copies
        ``input_y`` strided source values between ``pad_h`` skipped cells.
        """
        border_lines = str(self.pad_w)
        border_corners = str(self.pad_h * self.pad_w * 2)
        line_pad = str(self.pad_h)

        lines = (
            "",
            "        cols_8b = (q7_t*)(cols_8b_start + "
            + border_lines
            + " * (input_y) + "
            + border_corners
            + "); //skip "
            + border_lines
            + " rows",
            "        src = input;",
            "        for(i = 0; i < input_x; i++){",
            "            cols_8b += " + line_pad + ";//skip front",
            "            for(j = 0; j < input_y; j++){",
            "                *cols_8b++ = src[input_x * j * input_ch];// + input_offset;",
            "            }",
            "            src += input_ch;",
            "            cols_8b += " + line_pad + ";//skip end",
            "        }",
            "",
        )
        return "\n".join(lines)

    def __genInplaceKernel(self):
        """Return the per-channel loop of the CHW entry point.

        Inside the loop the emitted C loads the channel into the padded
        buffer, aliases ``inplace_out`` to ``input`` and invokes the
        per-channel kernel.
        """
        loop_body = (
            self.__genFixedLoadHWC2CHW()
            + self.__assignInplaceOut(2, "input")
            + self.__genHandle1CH(2, "input_ch")
        )
        return "\n    for (c = 0; c < input_ch; c++){" + loop_body + "\n    }\n"

    def __genFixedLoadHWC2CHW(self):
        """Return the C code that copies one channel into the padded CHW buffer.

        The emitted code skips the top border (``pad_h`` rows of
        ``input_x + 2 * pad_w`` cells), then for each input row copies
        ``input_x`` values, stepping the source by ``input_ch`` per value and
        skipping ``pad_w`` cells before and after the row.
        """
        top_rows = str(self.pad_h)
        corner_cells = str(self.pad_h * self.pad_w * 2)
        side_pad = str(self.pad_w)

        lines = (
            "",
            "        cols_8b = (q7_t*)(cols_8b_start + "
            + top_rows
            + " * (input_x) + "
            + corner_cells
            + "); //skip "
            + top_rows
            + " rows",
            "        src = input;",
            "        for(i = 0; i < input_y; i++){",
            "            cols_8b += " + side_pad + ";//skip front",
            "            for(j = 0; j < input_x; j++){",
            "                *cols_8b++ = *src;// + input_offset;",
            "                src += input_ch;",
            "            }",
            "            cols_8b += " + side_pad + ";//skip end",
            "        }",
            "",
        )
        return "\n".join(lines)

    def __genEndStr(self):
        """Return the text that closes the body of the generated entry point."""
        return "\n}"

    def __assignInplaceOut(self, pre_indent, out_str):
        """Return a C statement declaring ``inplace_out`` and initialising it to ``out_str``.

        Args:
            pre_indent: Number of four-space indentation levels to prepend.
            out_str: C expression assigned to the new pointer.
        """
        indentation = "    " * pre_indent
        return "".join((indentation, "q7_t *inplace_out = ", out_str, ";\n"))

    def __genHandle1CH(self, pre_indent, out_offset_str):
        """Return the C statements that process one channel in the CHW entry point.

        The emitted code advances ``output_mask`` whenever ``c`` reaches a
        non-zero multiple of 8, calls the per-channel kernel with ``c % 8`` as
        the bit index and ``input_x`` as the buffer row length, then steps the
        weight and input pointers to the next channel.

        Args:
            pre_indent: Number of four-space indentation levels per statement.
            out_offset_str: C expression passed as the kernel's
                ``channel_offset`` argument.
        """
        indent = "    " * pre_indent
        kernel_call = (
            self.__getKernelName()
            + "(output_y, output_x, bias++, biasR++, ksrc, scales++, inplace_out, "
            + "output_mask, c % 8, output_offset,output_activation_min, output_activation_max,cols_8b_start, input_x, "
            + out_offset_str
            + ");\n"
        )
        statements = (
            "if (c % 8 == 0 && c > 1) output_mask++;\n",
            kernel_call,
            "ksrc += " + str(self.kernel_h * self.kernel_w) + ";\n",
            "input++;\n",
        )
        return "".join(indent + statement for statement in statements)

    def genFile(self, path):
        """Write the generated C source to ``<path>/<function name>.c``.

        Args:
            path: Existing directory that receives the file.

        The source text is rendered before the file is opened, so a failure
        during generation cannot leave an empty or truncated file behind, and
        the ``with`` block closes the handle even if the write fails.
        """
        import os

        code = self.__genCode()
        outpath = os.path.join(path, self.__getFunctionName() + ".c")
        with open(outpath, "w") as outf:
            outf.write(code)

    # ********************* KERNEL ********************* #
    def __genMACStr(self, input_cnt, kernel_cnt, stride):
        """Return two C multiply-accumulate statements sharing one kernel weight.

        ``sum0`` reads ``cols_8b[input_cnt]`` and ``sum1`` reads
        ``cols_8b[input_cnt + stride]``; both multiply by ``ksrc[kernel_cnt]``.
        """
        template = "            {acc} += cols_8b[{src}]*ksrc[{weight}];\n"
        first = template.format(acc="sum0", src=str(input_cnt), weight=str(kernel_cnt))
        second = template.format(acc="sum1", src=str(input_cnt + stride), weight=str(kernel_cnt))
        return first + second

    def __genConvString(self):
        """Return the unrolled MAC statements for a pair of outputs (CHW).

        One pair of statements is emitted per kernel tap, row by row; between
        kernel rows ``cols_8b`` is advanced by one padded buffer row
        (``column_x + 2 * pad_w``).  Weights are consumed in emission order.
        """
        next_row = "            cols_8b += column_x + " + str(self.pad_w * 2) + ";\n"
        statements = []
        for row in range(self.kernel_h):
            if row:
                statements.append(next_row)
            first_weight = row * self.kernel_w
            for col in range(self.kernel_w):
                statements.append(self.__genMACStr(col, first_weight + col, self.stride))
        return "".join(statements)

    def __genConvStringCWH(self):
        """Return the unrolled MAC statements for a pair of outputs (CWH).

        The outer unrolling runs over ``kernel_w`` and the inner one over
        ``kernel_h``; between outer steps ``cols_8b`` is advanced by
        ``column_x + 2 * pad_h``.  Weights are consumed in emission order.
        """
        advance = "            cols_8b += column_x + " + str(self.pad_h * 2) + ";\n"
        pieces = []
        weight_index = 0
        for line in range(self.kernel_w):
            if line != 0:
                pieces.append(advance)
            for offset in range(self.kernel_h):
                pieces.append(self.__genMACStr(offset, weight_index, self.stride))
                weight_index += 1
        return "".join(pieces)

    def __genConvLeftStringCWH(self):
        """Return the unrolled MAC statements for the single left-over output (CWH).

        Same traversal as the paired CWH variant, but each tap produces one
        statement that accumulates into ``sum``.
        """
        advance = "            cols_8b += column_x + " + str(self.pad_h * 2) + ";\n"
        pieces = []
        weight_index = 0
        for line in range(self.kernel_w):
            if line != 0:
                pieces.append(advance)
            for offset in range(self.kernel_h):
                pieces.append(self.__genLeftMACStr(offset, weight_index, self.stride))
                weight_index += 1
        return "".join(pieces)

    def __genLeftMACStr(self, input_cnt, kernel_cnt, stride):
        """Return one C multiply-accumulate statement for the left-over output.

        The statement adds ``cols_8b[input_cnt] * ksrc[kernel_cnt]`` to
        ``sum``.  ``stride`` is accepted for signature parity with
        ``__genMACStr`` and is not used.
        """
        return "            sum += cols_8b[{}]*ksrc[{}];\n".format(str(input_cnt), str(kernel_cnt))

    def __genConvLeftString(self):
        """Return the unrolled MAC statements for the single left-over output (CHW).

        Same traversal as the paired CHW variant, but each tap produces one
        statement that accumulates into ``sum``.
        """
        next_row = "            cols_8b += column_x + " + str(self.pad_w * 2) + ";\n"
        statements = []
        for row in range(self.kernel_h):
            if row:
                statements.append(next_row)
            first_weight = row * self.kernel_w
            for col in range(self.kernel_w):
                statements.append(self.__genLeftMACStr(col, first_weight + col, self.stride))
        return "".join(statements)

    def __genKernel(self):
        """Return the C definition of the per-channel kernel for the CHW dataflow.

        The emitted function walks the output row by row and computes two
        horizontally adjacent outputs per inner iteration, with a single
        left-over output when ``output_x`` is odd.  Each accumulator starts at
        ``bias[0] + biasR[0]``, is scaled by ``*scales``, offset and clamped;
        the bit ``mask_idx`` of the matching mask byte is set when no clamping
        happened and cleared otherwise.
        """

        def requantize(acc, position):
            # Scale, offset, clamp and store one accumulator, then record in the
            # bit mask whether the value stayed inside the activation range.
            cell = position + " * channel_offset"
            lines = (
                acc + " = (float) " + acc + " * *scales;",
                acc + " += output_offset;",
                "mask_value = 1;",
                "if (" + acc + " < activation_min){",
                "    " + acc + " = activation_min;",
                "    mask_value = 0;",
                "}",
                "if (" + acc + " > activation_max){",
                "    " + acc + " = activation_max;",
                "    mask_value = 0;",
                "}",
                "output[" + cell + "] = " + acc + ";",
                "if (mask_value == 1)",
                "    BIT_SET(output_mask[" + cell + " / 8], mask_idx);",
                "else",
                "    BIT_CLEAR(output_mask[" + cell + " / 8], mask_idx);",
            )
            return "".join("            " + line + "\n" for line in lines)

        # Pointer adjustment emitted at the end of every output row.
        pad_twice = str(self.pad_w) + " * 2"
        if self.stride != 1:
            row_tail = (
                "cols_8b_iterptr += " + pad_twice + " - (column_x & 1);\n"
                + "        cols_8b_iterptr += (STRIDE - 1) * (column_x + " + pad_twice + ");\n"
            )
        else:
            row_tail = "cols_8b_iterptr += " + pad_twice + ";\n"

        sections = [
            self.__genKernelDefine(),
            "\n{\n",
            "    #define STRIDE " + str(self.stride) + "\n",
            "    int i, j;\n",
            "    q7_t mask_value;\n",
            # Paired outputs.
            "    /* MACs for each output */\n",
            "    for (i = 0; i < output_y; i++) {\n",
            "        for (j = 0; j < output_x / 2; j++) {\n",
            "            q7_t *cols_8b = cols_8b_iterptr;\n",
            "\n",
            "            q31_t sum0 = bias[0] + biasR[0];\n",
            "            q31_t sum1 = bias[0] + biasR[0];\n",
            "            \n",
            "            /* computation */\n",
            self.__genConvString(),
            "\n",
            "            /* requantize */\n",
            requantize("sum0", "(i * output_x + j * 2)"),
            "\n",
            requantize("sum1", "(i * output_x + (j * 2 + 1))"),
            "\n",
            "            cols_8b_iterptr += STRIDE * 2;\n",
            "        }\n",
            # Left-over output for odd widths.
            "        if (output_x & 1) {\n",
            "            q7_t * cols_8b = cols_8b_iterptr;\n",
            "            q31_t sum = bias[0] + biasR[0];\n",
            self.__genConvLeftString(),
            "\n",
            requantize("sum", "(i * output_x + output_x - 1)"),
            "\n",
            "            cols_8b_iterptr += STRIDE;\n",
            "        }\n",
            "        " + row_tail,
            # End of function.
            "    }\n",
            "}\n",
        ]
        return "".join(sections)

    def __genKernelCWH(self):
        """Return the C definition of the per-channel kernel for the CWH dataflow.

        The emitted function walks the output column by column (outer ``j``)
        and computes two vertically adjacent outputs per inner iteration
        (rows ``2 * i`` and ``2 * i + 1``), with a single left-over output in
        the last row when ``output_y`` is odd.

        Compared with the previous template this fixes three defects in the
        emitted C:

        * ``mask_value`` is now declared (it was used without a declaration,
          so the generated file did not compile);
        * the output and mask indices follow the loop structure above instead
          of the row-major pairing copied from the CHW kernel, which placed
          results at the wrong positions;
        * the left-over branch updates the bit mask with ``BIT_SET`` /
          ``BIT_CLEAR`` like every other path instead of storing a whole byte
          at an index that is not divided by 8.
        """

        def requantize(acc, position):
            # Scale, offset, clamp and store one accumulator, then record in the
            # bit mask whether the value stayed inside the activation range.
            cell = position + " * channel_offset"
            lines = (
                acc + " = (float) " + acc + " * *scales;",
                acc + " += output_offset;",
                "mask_value = 1;",
                "if (" + acc + " < activation_min){",
                "    " + acc + " = activation_min;",
                "    mask_value = 0;",
                "}",
                "if (" + acc + " > activation_max){",
                "    " + acc + " = activation_max;",
                "    mask_value = 0;",
                "}",
                "output[" + cell + "] = " + acc + ";",
                "if (mask_value == 1)",
                "    BIT_SET(output_mask[" + cell + " / 8], mask_idx);",
                "else",
                "    BIT_CLEAR(output_mask[" + cell + " / 8], mask_idx);",
            )
            return "".join("            " + line + "\n" for line in lines)

        # Pointer adjustment emitted at the end of every output column.
        pad_twice = str(self.pad_h) + " * 2"
        if self.stride != 1:
            line_tail = (
                "cols_8b_iterptr += " + pad_twice + " - (column_x & 1);\n"
                + "        cols_8b_iterptr += (STRIDE - 1) * (column_x + " + pad_twice + ");\n"
            )
        else:
            line_tail = "cols_8b_iterptr += " + pad_twice + ";\n"

        sections = [
            self.__genKernelDefine(),
            "\n{\n",
            "    #define STRIDE " + str(self.stride) + "\n",
            "    int i, j;\n",
            "    q7_t mask_value;\n",
            # Paired outputs: sum0 is row 2*i, sum1 is row 2*i + 1, both in column j.
            "    /* MACs for each output */\n",
            "    for (j = 0; j < output_x; j++) {\n",
            "        for (i = 0; i < output_y / 2; i++) {\n",
            "            q7_t *cols_8b = cols_8b_iterptr;\n",
            "\n",
            "            q31_t sum0 = bias[0] + biasR[0];\n",
            "            q31_t sum1 = bias[0] + biasR[0];\n",
            "            \n",
            "            /* computation */\n",
            self.__genConvStringCWH(),
            "\n",
            "            /* requantize */\n",
            requantize("sum0", "(i * 2 * output_x + j)"),
            "\n",
            requantize("sum1", "((i * 2 + 1) * output_x + j)"),
            "\n",
            "            cols_8b_iterptr += STRIDE * 2;\n",
            "        }\n",
            # Left-over output: the last row when output_y is odd.
            "        if (output_y & 1) {\n",
            "            q7_t * cols_8b = cols_8b_iterptr;\n",
            "            q31_t sum = bias[0] + biasR[0];\n",
            self.__genConvLeftStringCWH(),
            "\n",
            requantize("sum", "((output_y - 1) * output_x + j)"),
            "\n",
            "            cols_8b_iterptr += STRIDE;\n",
            "        }\n",
            "        " + line_tail,
            # End of function.
            "    }\n",
            "}\n",
        ]
        return "".join(sections)

    # ********************* unit test ********************* #
    def genTestUint(self, dirpath, height, width, channel):
        """Write a C/C++ test-harness header for this layer.

        The file is created as ``test_<function name>.h`` inside ``dirpath``.
        It defines ramp-filled input arrays, all-ones kernels, and a
        ``test_<function name>()`` routine that runs
        ``arm_depthwise_conv_s8_opt`` and the generated function, compares
        results and prints timing over UART.

        Args:
            dirpath: Existing directory that receives the header.
            height: Input height used for array sizes and literals.
            width: Input width used for array sizes and literals.
            channel: Channel count used for array sizes and literals.

        The path is built with ``os.path.join``; the previous hard-coded
        ``"\\"`` separator produced a file outside ``dirpath`` on non-Windows
        hosts.  The text is rendered before the file is opened and written
        inside a ``with`` block, so the handle is always closed.
        """
        import os

        func_name = str(self.__getFunctionName())
        outpath = os.path.join(dirpath, "test_" + func_name + ".h")

        elem_count = str(width * height * channel)
        kernel_elems = self.kernel_h * self.kernel_w * channel
        scratch_len = str(
            max((width + self.pad_w * 2) * (height + self.pad_h * 2), self.kernel_h * self.kernel_w * channel * 2)
        )

        # Initialiser lists; every entry keeps its trailing comma as before.
        ramp = "".join(str(i % 128) + "," for i in range(width * height * channel))
        ones = "1," * kernel_elems
        bias_values = "".join(str(i * 100) + "," for i in range(channel))
        pair_count = int(channel / 2)
        multiplier_values = "1706270591,1380135680," * pair_count
        shift_values = "-6,-5," * pair_count

        # kernel_w, kernel_h, pad_w, pad_h, stride_x, stride_y for the reference call.
        conv_args = ",".join(
            str(value)
            for value in (self.kernel_w, self.kernel_h, self.pad_w, self.pad_h, self.stride, self.stride)
        )

        # NOTE(review): the calls to the generated function below pass 18
        # arguments, while genFuncDefine() declares 19 parameters (biasR/scales
        # and output_mask have no counterpart here) -- confirm whether this
        # harness is still meant to compile against the bitmask kernel.
        string = (
            """#include "stdio.h"
#include "scratch_buffer.h"
extern "C"
{
#include "kernel_buffer.h"
#include "tinyengine_function.h"
}
UART_HandleTypeDef UART;
void printLog(const char* s) {
  static bool is_initialized = false;
  if (!is_initialized) {
        UART.Instance = USART1;
        UART.Init.BaudRate = 115200;
        UART.Init.WordLength = UART_WORDLENGTH_8B;
        UART.Init.StopBits = UART_STOPBITS_1;
        UART.Init.Parity = UART_PARITY_NONE;
        UART.Init.Mode = UART_MODE_TX_RX;
        UART.Init.HwFlowCtl = UART_HWCONTROL_NONE;
        UART.Init.OverSampling = UART_OVERSAMPLING_16;
        UART.Init.OneBitSampling = UART_ONE_BIT_SAMPLE_DISABLE;
        UART.AdvancedInit.AdvFeatureInit = UART_ADVFEATURE_NO_INIT;
        if (HAL_UART_Init(&UART) != HAL_OK)
        {
            //Error handling
        }
        is_initialized = true;
  }
  HAL_UART_Transmit(&UART, (uint8_t *)s, strlen(s), 10);
}
q7_t input["""
            + elem_count
            + """] = {"""
            + ramp
            + """};
q7_t input1["""
            + elem_count
            + """] = {"""
            + ramp
            + """};
q7_t output1["""
            + elem_count
            + """];
q7_t output2["""
            + elem_count
            + """];
void test_"""
            + func_name
            + """()\n{
    printLog("test layer """
            + func_name
            + """'s depthwise convolution, input_y:"""
            + str(height)
            + """,input_x:"""
            + str(width)
            + """,input_ch:"""
            + str(channel)
            + """\\r\\n");
    int input_x = """
            + str(width)
            + ", input_y = "
            + str(height)
            + ", input_ch = "
            + str(channel)
            + ", output_ch = "
            + str(channel)
            + ";\n"
            + """
    int output_x = """
            + str(width // self.stride)
            + ", output_y = "
            + str(height // self.stride)
            + ";\n"
            + """
    int32_t kbuf[1];
    set_kernel_buffer((int32_t*)kbuf, 1);
    int16_t sbuf["""
            + scratch_len
            + """];
    set_scratch_buffer((int16_t*)sbuf, """
            + scratch_len
            + """);
    q7_t kernelchw["""
            + str(kernel_elems)
            + "] = {"
            + ones
            + """};
    q7_t kernelhwc["""
            + str(kernel_elems)
            + "] = {"
            + ones
            + """};
    int32_t bias["""
            + str(channel)
            + "] = {"
            + bias_values
            + """};
    int32_t multiplier["""
            + str(channel)
            + "] = {"
            + multiplier_values
            + """};
    int32_t shift["""
            + str(channel)
            + "] = {"
            + shift_values
            + """};

    arm_depthwise_conv_s8_opt(input, input_x, input_y, input_ch, kernelhwc, output_ch, """
            + conv_args
            + """, bias, output1, shift, multiplier, output_x, output_y, -128, 0, -128, 127, 1, 1, sbuf);
    """
            + func_name
            + """(input, input_x, input_y, input_ch, kernelchw, bias, shift, multiplier, -128, 0,-128, 127,"""
            + """ output2, output_x, output_y, output_ch, sbuf, 0);
    bool correct = true;
    int i;
    for (i = 0; i < output_x * output_y * output_ch; i++){
        if(output1[i] != input[i]){
            correct = false;
            break;
        }
    }

    char buf[100];
    if (correct)
        printLog("Results match\\r\\n");
    else{
        printLog("Error results\\r\\n");
        sprintf(buf,"First inconsistency: %d\\r\\n", i);
        printLog(buf);
    }

    printLog("Test speed now\\r\\n");
    # define BATCHNUM 100
    float fmac = output_x * output_y * input_ch * """
            + str(self.kernel_h * self.kernel_w)
            + """;
    int opstart = HAL_GetTick();
    for(i = 0; i < BATCHNUM; i++){
        arm_depthwise_conv_s8_opt(input, input_x, input_y, input_ch, kernelhwc, output_ch, """
            + conv_args
            + """, bias, output1, shift, multiplier, output_x, output_y, -128, 128, -128, 127, 1, 1, sbuf);
    }
    int opend = HAL_GetTick();

    float time = opend - opstart;
    time /= BATCHNUM;//ms
    float mac_pers = fmac / time / 1000;//*1000000/1000 > /1000
    sprintf(buf,"%.2f|%.3f|", time, mac_pers);
    printLog(buf);

    opstart = HAL_GetTick();
    for(int i = 0; i < BATCHNUM; i++){
        """
            + func_name
            + """(input, input_x, input_y, input_ch, kernelchw, bias, shift, multiplier, -128, 128,-128, 127,"""
            + """ output2, output_x, output_y, output_ch, sbuf, -128);
    }
    opend = HAL_GetTick();

    time = opend - opstart;
    time /= BATCHNUM;//ms
    mac_pers = fmac / time / 1000;//*1000000/1000 > /1000
    sprintf(buf,"%.2f|%.3f|\\r\\n", time, mac_pers);
    printLog(buf);
}\n
"""
        )

        with open(outpath, "w") as outf:
            outf.write(string)


class includeFile(object):
    def __init__(self, path):
        """Remember the output directory and start with no accumulated text.

        Args:
            path: Directory in which the include file is later written.
        """
        self.defstring = ""
        self.path = path

    def addDefine(self, defstr):
        """Append ``defstr`` followed by ``;`` and a newline to the accumulated text."""
        terminated = defstr + ";\n"
        self.defstring = self.defstring + terminated

    def writeFile(self):
        import os

        outpath = os.path.join(self.path, "genInclude.h")
        outf = open(outpath, "w")
        outf.write(self.defstring)