# tinyengine/code_generator/CodeGenerator.py
# (repository metadata: last modified 2022-08-26 17:42:09 +00:00; 668 lines; 24 KiB; Python)
# ----------------------------------------------------------------------
# Project: TinyEngine
# Title: CodeGenerator.py
#
# Reference papers:
# - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
# - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
# - MCUNetV3: On-Device Training Under 256KB Memory, arXiv:2206.15472
# Contact authors:
# - Wei-Ming Chen, wmchen@mit.edu
# - Wei-Chen Wang, wweichen@mit.edu
# - Ji Lin, jilin@mit.edu
# - Ligeng Zhu, ligeng@mit.edu
# - Song Han, songhan@mit.edu
#
# Target ISA: ARMv7E-M
# ----------------------------------------------------------------------
import os
from .OpGenerator import OpGenerator
# Output locations for the generated C sources and headers.
Codegen_root = "./codegen/"
include_path = Codegen_root + "Include/"
source_path = Codegen_root + "Source/"
# Code-generation options passed down to the op generators.
use_hard_switsh = False  # use hard-swish kernel variants; name typo kept for compatibility — TODO confirm intent
gen_kernels = True  # also emit per-op kernel sources via OpGenerator
use_aggressive_unroll = True  # enable aggressively unrolled conv kernels
class CodeGenerator:
    """Provide utilities to generate C code for a given model and memory schedule."""

    # Running index of layers whose trainable tensors have been parsed;
    # these class-level defaults are shadowed by instance state set in __init__.
    parse_count = 0
    header_handle = None  # file handle for codegen/Include/genModel.h
    source_handle = None  # file handle for codegen/Source/genModel.c
def __init__(
self,
memsche,
inplace,
precision=8,
unsigned_input=False,
patch_params=None,
FP_output=False,
profile_mode=False,
fp_requantize=False,
tflite_op=False,
dummy_address=False,
outputTables=None,
detectionUtils=None,
):
self.MemSche = memsche
# Check if path exists, create it if not
if not os.path.exists(include_path):
os.makedirs(include_path)
if not os.path.exists(source_path):
os.makedirs(source_path)
self.header_handle = open(include_path + "genModel.h", "w")
self.source_handle = open(source_path + "genModel.c", "w")
self.inplace = inplace
self.BIT = precision
self.unsigned_input = unsigned_input
self.patch_params = patch_params
self.FP_output = FP_output
self.profile_mode = profile_mode
self.fp_requantize = fp_requantize
self.tflite_op = tflite_op
self.dummy_address = dummy_address
self.trainSRAMTable = []
self.outputTables = outputTables
self.detectionUtils = detectionUtils
def _readOnly(self, name):
if self.outputTables is None or name is None:
return True
else:
for o in self.outputTables:
if o.name in name:
return False
return True
    def codeGeneration(self):
        """Drive the full generation pipeline for genModel.h / genModel.c.

        Order matters: buffers and trainables go into the header before the
        source-file preamble and the invoke() functions reference them.
        """
        # buffer in SRAM
        self._genMemBuffer()
        # parse trainable parameters & assign the corresponding buffers for layers
        self._parseTrainable()
        # include all headers
        self._includeHeaders()
        # generate detection output if any
        self._genDetprocessing()
        # generate patch-based
        self._genPatchInference()
        # generate invoke function
        self._genInvoke()
        self._closefp()
        # generate operator kernels
        if gen_kernels:
            op_gen = OpGenerator(include_path, source_path, self.MemSche.layer, self.fp_requantize)
            op_gen.genOpcode()
def _genDetprocessing(self):
if self.detectionUtils is not None:
fp = self.source_handle
fp.write(self.detectionUtils.genPostProcessing())
def _genOpstr(self, op, *args):
if self.profile_mode:
if len(args) > 0:
return op.generate_profiling_str(*args)
else:
return op.generate_profiling_str()
else:
if len(args) > 0:
return op.generate_inference_str(*args)
else:
return op.generate_inference_str()
    def _genPatchInference(self):
        """Emit patch-based inference C code (MCUNetV2 style) when the model starts with patch layers.

        When the first layer is marked `is_patch`, this writes:
        - end2endinference(): splits the input image into an n_patch x n_patch
          grid of padded patches, runs invoke_1patch() on each, and concatenates
          the per-patch outputs into buffer1 before running the remaining
          network via invoke().
        - invoke_1patch(): the per-patch layer calls. The consumed patch layers
          are then removed from the schedule so the later invoke() generation
          only sees the non-patch remainder.
        Otherwise it writes a trivial end2endinference() that forwards to
        invoke(NULL).
        """
        schedule = self.MemSche
        layer_info = schedule.layer[0].get_layer_info()
        if "is_patch" in layer_info and layer_info["is_patch"]:
            fp = self.source_handle
            string = ""
            first_height = layer_info["input_h"]
            first_width = layer_info["input_w"]
            # full image width reconstructed from per-patch width minus paddings
            img_w = (first_width - self.patch_params["pad_l"] - self.patch_params["pad_r"]) * self.patch_params[
                "n_patch"
            ]
            # by default, we go three stride 2 conv in the patch-based inference
            patch_out_w = int((first_width - self.patch_params["pad_l"]) / 8)
            # by default, we go three stride 2 conv in the patch-based inference
            # NOTE(review): uses pad_l for the height as well — presumably assumes
            # symmetric padding; confirm against the patch scheduler.
            patch_out_h = int((first_height - self.patch_params["pad_l"]) / 8)
            out_w = self.patch_params["output_w"]
            # generate code for testing whole inference time
            string += (
                """void end2endinference(q7_t* img){
//stage 1
int i, j, h, w, c;
for (i = 0; i < """
                + str(self.patch_params["n_patch"])
                + """; i++){
uint16_t pad_t=0,pad_b=0;
if (i == 0){
pad_t = """
                + str(self.patch_params["pad_l"])
                + """;
}
else if (i == """
                + str(self.patch_params["n_patch"] - 1)
                + """){
pad_b = """
                + str(self.patch_params["pad_r"])
                + """;
}
for (j = 0; j < """
                + str(self.patch_params["n_patch"])
                + """; j++){
uint16_t pad_l=0,pad_r=0;
if (j == 0){
pad_l = """
                + str(self.patch_params["pad_l"])
                + """;
}
else if (j == """
                + str(self.patch_params["n_patch"] - 1)
                + """){
pad_r = """
                + str(self.patch_params["pad_r"])
                + """;
}
/* load partial input from the img */
q7_t* patch_input = &buffer0[0]; // for partial input
int start_x = MAX("""
                + str(first_width - self.patch_params["pad_l"])
                + """ * j - """
                + str(self.patch_params["pad_l"])
                + """,0);
int start_y = MAX("""
                + str(first_height - self.patch_params["pad_l"])
                + """ * i - """
                + str(self.patch_params["pad_l"])
                + """,0);
q7_t* img_ptr = &img[(start_x + start_y * """
                + str(img_w)
                + """) * 3];
//skip top
patch_input += pad_t * """
                + str(first_width)
                + """ * 3;
for (h = pad_t; h < """
                + str(first_height)
                + """ - pad_b; h++){
//skip left
patch_input += pad_l * 3;
//fill middle
int bytes = ("""
                + str(first_width)
                + """ - (pad_l + pad_r)) * 3;
memcpy (patch_input, img_ptr, bytes);
img_ptr += """
                + str(img_w)
                + """ * 3;
patch_input += bytes;
//skip right
patch_input += pad_r * 3;
}
invoke_1patch(pad_t,pad_b,pad_l,pad_r);
/* concat the output from buffer0 (this is set manually for now) */
q7_t* output_ptr = buffer1 + (i * """
                + str(patch_out_w)
                + """ * """
                + str(out_w)
                + """ + j * """
                + str(patch_out_w)
                + """) * """
                + str(self.patch_params["output_c"])
                + """ ;
for (h = 0; h < """
                + str(patch_out_h)
                + """; h++){
for (w = 0; w < """
                + str(patch_out_w)
                + """; w++){
for (c = 0; c < """
                + str(self.patch_params["output_c"])
                + """; c++){
output_ptr[(w + h * """
                + str(out_w)
                + """) * """
                + str(self.patch_params["output_c"])
                + """ + c] = buffer0[(w + h * """
                + str(patch_out_w)
                + """) * """
                + str(self.patch_params["output_c"])
                + """ + c];
}
}
}
}
}
//stage 2
invoke();
}"""
            )
            string += """
void invoke_1patch(uint16_t pad_t, uint16_t pad_b, uint16_t pad_l ,uint16_t pad_r){
"""
            fp.write(string)
            # gen patch-based inference code
            patch_layers = []
            layercnt = 0
            for i, op in enumerate(schedule.layer):
                layer_info = op.get_layer_info()
                if "is_patch" not in layer_info or not layer_info["is_patch"]:
                    break  # end of patch-based
                string = "/* layer " + str(layercnt) + ":" + layer_info["op"] + " */\n"
                layercnt += 1
                fp.write(string)
                if layer_info["op"] == "CONV_2D":
                    # hardcode this memory schedule for quick implementation
                    # TODO: adjust this according to model architecture and split index
                    next_layer_info = schedule.layer[i + 1].get_layer_info()
                    if "is_patch" not in next_layer_info or not next_layer_info["is_patch"]:
                        # last patch layer: route its output to the front of buffer0
                        layer_info["output_buf_add"] = "front"
                        layer_info["output_buf_add_offset"] = 0
                    if self.unsigned_input:
                        raise Exception("unsigned input is not supported by patch-based yet")
                    string = self._genOpstr(
                        op,
                        False,
                        self.FP_output,
                        use_aggressive_unroll,
                        use_hard_switsh,
                        self.fp_requantize,
                    )
                    fp.write(string)
                elif layer_info["op"] == "DEPTHWISE_CONV_2D":
                    string = self._genOpstr(op, self.fp_requantize)
                    fp.write(string)
                elif layer_info["op"] == "ADD":
                    string = self._genOpstr(op)
                    fp.write(string)
                patch_layers.append(schedule.layer[i])
            # remove these layers for patching for the following code gen
            for layer in patch_layers:
                schedule.layer.remove(layer)
            string = "}\n\n"
            fp.write(string)
        else:  # not patch-based
            string = """void end2endinference(q7_t* img){
invoke(NULL);
}
"""
            fp = self.source_handle
            fp.write(string)
    def _genInvoke(self):
        """Emit the invoke() C function: one generated kernel call per remaining layer.

        CONV_2D and DEPTHWISE_CONV_2D get their specialized argument lists;
        every other op uses the default generator.
        """
        fp = self.source_handle
        string = "void invoke(float* labels){\n"
        fp.write(string)
        schedule = self.MemSche
        for i, op in enumerate(schedule.layer):
            layer_info = op.get_layer_info()
            string = "/* layer " + str(i) + ":" + layer_info["op"] + " */\n"
            fp.write(string)
            if layer_info["op"] == "CONV_2D":
                # floating-point output is only emitted when both scales are present
                if (
                    self.FP_output
                    and "effective_scale" in layer_info
                    and layer_info["output_scale"] is not None
                    and layer_info["effective_scale"] is not None
                ):
                    use_fp = True
                else:
                    use_fp = False
                string = self._genOpstr(
                    op,
                    self.unsigned_input,
                    use_fp,
                    use_aggressive_unroll,
                    use_hard_switsh,
                    self.fp_requantize,
                    self.tflite_op,
                    self.dummy_address,
                )
                fp.write(string)
            elif layer_info["op"] == "DEPTHWISE_CONV_2D":
                string = self._genOpstr(op, self.fp_requantize)
                fp.write(string)
            else:
                string = self._genOpstr(op)
                fp.write(string)
        string = "}\n"
        fp.write(string)
def _getBufferIndex(self, location):
if location == "front":
return 0
elif location == "end":
return 0
elif location == "residual":
return 1
return None
def _genMemBuffer(self):
schedule = self.MemSche
# define output tensor
string = "#define NNoutput &buffer0[" + str(_findtheinferenceOutput(schedule.layer)) + "];"
fp = self.header_handle
fp.write("\n" + string + "\n")
# activation buffers
string = "\n/* sram:" + str(schedule.peakmem) + ", flash:" + str(schedule.flash) + " */\n"
fp.write(string + "\n")
string = "static signed char buffer[" + str(schedule.peakmem) + "];\n"
fp.write(string)
accumulate_ptr = 0
string = "static signed char *buffer0 = &buffer[" + str(accumulate_ptr) + "];\n"
accumulate_ptr += int(schedule.buffers["input_output"])
fp.write(string)
string = "static signed char *buffer1 = &buffer[" + str(accumulate_ptr) + "];\n"
accumulate_ptr += int(schedule.buffers["residual"])
fp.write(string)
string = "static int16_t *sbuf = (int16_t *)&buffer[" + str(accumulate_ptr) + "];\n"
accumulate_ptr += int(schedule.buffers["im2col"])
fp.write(string)
string = "static int32_t *kbuf = (int32_t *)&buffer[" + str(accumulate_ptr) + "];\n"
accumulate_ptr += int(schedule.buffers["kernel"])
fp.write(string)
string = "const int SBuffer_size = " + str(int(schedule.buffers["im2col"])) + ";\n"
fp.write(string)
string = "const int KBuffer_size = " + str(int(schedule.buffers["kernel"])) + ";\n"
fp.write(string + "\n")
def _includeHeaders(self):
include_string = """/* Automatically generated source file */
#include <float.h>
#include "arm_nnfunctions.h"
#include "genNN.h"
#include "genModel.h"
#include "tinyengine_function.h"
//#include "tinyengine_function_fp.h"
"""
if self.profile_mode:
include_string += '#include "profile.h"\n'
include_string += """
/* Variables used by all ops */
ADD_params add_params;
//Conv_Params conv_params;
//Depthwise_Params dpconv_params;
int i;
int8_t *int8ptr;
float *fptr,*fptr2,*fptr3;
signed char* getInput() {
return &buffer0[""" + f"{self.MemSche.layer[0].params['input_buf_add_offset']}" + """];
}
signed char* getOutput() {
return NNoutput;
}\n"""
fp = self.source_handle
fp.write(include_string)
    def _parseTrainable(self):
        """Walk the schedule and emit every layer's trainable tensors as C arrays.

        For each CONV_2D / DEPTHWISE_CONV_2D / FULLY_CONNECTED layer this
        writes weights, (offset-fused) biases, effective scales and
        requantization tables into the header, tags the layer dict with its
        `parsed_trainable` index, and advances `parse_count`.
        """
        schedule = self.MemSche
        for i, op in enumerate(schedule.layer):
            layer_info = op.get_layer_info()
            if layer_info["op"] == "CONV_2D":
                self._parseWeight(
                    self.parse_count,
                    layer_info["weight_value"].flatten(),
                    layer_info["weight_name"],
                    self._readOnly(layer_info["weight_name"]),
                )
                # named biases carry const-ness derived from the output tables
                if "bias_name" in layer_info:
                    self._parseBias(
                        self.parse_count,
                        layer_info["bias"].flatten(),
                        layer_info["bias_name"],
                        self._readOnly(layer_info["bias_name"]),
                    )
                else:
                    self._parseBias(self.parse_count, layer_info["bias"].flatten())
                self._parseEffectivescales(self.parse_count, layer_info["effective_scale"].flatten())
                self._parseRequantize(
                    self.parse_count,
                    layer_info["shift"].flatten(),
                    layer_info["multiplier"].flatten(),
                )
                layer_info["parsed_trainable"] = self.parse_count
                self.parse_count += 1
            elif layer_info["op"] == "DEPTHWISE_CONV_2D":
                # kernels taller than wide are stored transposed (CWH) for the kernel impl
                if layer_info["kernel_h"] > layer_info["kernel_w"]:
                    self._parseCWHWeight(
                        self.parse_count,
                        layer_info["weight_value"].flatten(),
                        layer_info["kernel_h"],
                        layer_info["kernel_w"],
                        layer_info["input_c"],
                    )
                else:
                    # NOTE(review): both branches below are identical; the presence
                    # of "weight_name" does not change CHW parsing here.
                    if "weight_name" in layer_info:
                        self._parseCHWWeight(
                            self.parse_count,
                            layer_info["weight_value"].flatten(),
                            layer_info["input_c"],
                        )
                    else:
                        self._parseCHWWeight(
                            self.parse_count,
                            layer_info["weight_value"].flatten(),
                            layer_info["input_c"],
                        )
                # depthwise biases get the negated input zero-point folded in
                if "bias_name" in layer_info:
                    self._parseoffsetBias(
                        self.parse_count,
                        layer_info["bias"].flatten(),
                        layer_info["input_zero_point"] * -1,
                        layer_info["weight_value"].flatten(),
                        layer_info["input_c"],
                        layer_info["bias_name"],
                        self._readOnly(layer_info["bias_name"]),
                    )
                else:
                    self._parseoffsetBias(
                        self.parse_count,
                        layer_info["bias"].flatten(),
                        layer_info["input_zero_point"] * -1,
                        layer_info["weight_value"].flatten(),
                        layer_info["input_c"],
                    )
                self._parseEffectivescales(self.parse_count, layer_info["effective_scale"].flatten())
                self._parseRequantize(
                    self.parse_count,
                    layer_info["shift"].flatten(),
                    layer_info["multiplier"].flatten(),
                )
                layer_info["parsed_trainable"] = self.parse_count
                self.parse_count += 1
            elif layer_info["op"] == "FULLY_CONNECTED":
                self._parseWeight(
                    self.parse_count,
                    layer_info["weight_value"].flatten(),
                    layer_info["weight_name"],
                    self._readOnly(layer_info["weight_name"]),
                )
                self._parseBias(self.parse_count, layer_info["bias"].flatten())
                layer_info["parsed_trainable"] = self.parse_count
                self.parse_count += 1
            elif layer_info["op"] == "SOFTMAX":
                # softmax has no trainable parameters
                pass
def _parseCWHWeight(self, Lindex, weight, height, width, channel):
fp = self.header_handle
# 8bit implementation
if self.BIT == 8:
string = "const unsigned char CWHweight" + str(Lindex) + "[" + str(len(weight)) + "] = {"
fp.write(string)
for j in range(channel):
for w in range(width):
for h in range(height):
value = weight[(h * width + w) * channel + j]
if value < 0:
value += 256
fp.write(str(format(value, "#04x")) + ", ")
else:
raise NotImplementedError
fp.write("};\n")
def _parseCHWWeight(self, Lindex, weight, channel):
fp = self.header_handle
kernelsize = int(len(weight) / channel)
# 8bit implementation
if self.BIT == 8:
string = "const unsigned char CHWweight" + str(Lindex) + "[" + str(len(weight)) + "] = {"
fp.write(string)
for j in range(channel):
for i in range(kernelsize):
value = int(weight[i * channel + j])
if value < 0:
value += 256
fp.write(str(format(value, "#04x")) + ", ")
else:
raise NotImplementedError
fp.write("};\n")
def _parseEffectivescales(self, Lindex, scales):
fp = self.header_handle
string = "const float scales" + str(Lindex) + "[" + str(len(scales)) + "] = {"
fp.write(string)
for _, value in enumerate(scales):
fp.write(str(value) + ", ")
fp.write("};\n")
def _parseWeight(self, Lindex, weight, weight_name=None, is_const=True):
fp = self.header_handle
const_str = "const " if is_const else ""
string = f"{const_str}unsigned char weight" + str(Lindex) + "[" + str(len(weight)) + "] = {"
fp.write(string)
for _, value in enumerate(weight):
value = int(value)
if value < 0:
value += 256
fp.write(str(format(value, "#04x")) + ", ")
fp.write("};\n")
if weight_name is not None:
for r in self.trainSRAMTable:
if r.name == weight_name:
return
self.trainSRAMTable.append(tensorRecorder(weight_name, len(weight), "unknown"))
if weight.dtype == "int8":
string = f"{const_str}unsigned char* {weight_name}=weight" + str(Lindex) + ";\n"
else:
raise NotImplementedError
fp.write(string)
def _parseoffsetBias(self, Lindex, bias, input_offset, weight, channel, bias_name=None, is_const=True):
fp = self.header_handle
const_str = "const " if is_const else ""
string = f"{const_str}int32_t offsetBias" + str(Lindex) + "[" + str(len(bias)) + "] = {"
fp.write(string)
kernelsize = int(len(weight) / channel)
# fuse the offset into bias
for i in range(channel):
tmpW = 0
for j in range(kernelsize):
tmpW += weight[j * channel + i]
fp.write(str(self.int32_clip(bias[i] + tmpW * input_offset)) + ", ")
fp.write("};\n")
string = f"{const_str}int32_t offsetRBias" + str(Lindex) + "[" + str(len(bias)) + "] = {"
fp.write(string)
kernelsize = int(len(weight) / channel)
for i in range(channel):
tmpW = 0
for j in range(kernelsize):
tmpW += weight[j * channel + i]
fp.write(str(bias[i] + tmpW * input_offset - self.int32_clip(bias[i] + tmpW * input_offset)) + ", ")
fp.write("};\n")
def _parseBias(self, Lindex, bias, bias_name=None, is_const=True):
fp = self.header_handle
const_str = "const " if is_const else ""
string = f"{const_str}int32_t bias" + str(Lindex) + "[" + str(len(bias)) + "] = {"
fp.write(string)
for _, value in enumerate(bias):
value = int(value)
fp.write(str(value) + ", ")
fp.write("};\n")
def _parseRequantize(self, Lindex, shift, multiplier):
fp = self.header_handle
string = "const int32_t shift" + str(Lindex) + "[" + str(len(shift)) + "] = {"
fp.write(string)
for _, value in enumerate(shift):
fp.write(str(value) + ", ")
fp.write("};\n")
string = "const int32_t multiplier" + str(Lindex) + "[" + str(len(multiplier)) + "] = {"
fp.write(string)
for _, value in enumerate(multiplier):
fp.write(str(value) + ", ")
fp.write("};\n")
def int32_clip(self, a):
if a < -(2**31):
return -(2**31)
elif a > 2**31 - 1:
return 2**31 - 1
return a.astype(int)
def _closefp(self):
self.header_handle.close()
self.source_handle.close()
def _findtheinferenceOutput(layers):
    """Return the buffer0 offset of the inference output.

    Scans the schedule in order; the output of the layer immediately before
    the first non-int8 layer is taken as the (last int8) inference output.
    Falls back to the final layer's offset when every output is int8.
    Note: if the very first layer is non-int8 this wraps to layers[-1].
    """
    for idx, layer in enumerate(layers):
        if layer.params["output_dtype"] != "int8":
            return layers[idx - 1].params["output_buf_add_offset"]
    return layers[-1].params["output_buf_add_offset"]
class tensorRecorder:
    """Lightweight record of a tensor emitted to SRAM: its name, length and dtype."""

    def __init__(self, name, len, dtype):
        # `len` shadows the builtin but is kept to preserve the public signature.
        self.dtype = dtype
        self.len = len
        self.name = name