class depthwiseInplace_bitmask(object):
    """Generate C source for an in-place depth-wise convolution with a bitmask.

    The emitted C consists of an entry-point function (see ``genFuncDefine``)
    and a per-channel kernel.  For every output element the kernel stores the
    clamped value and sets or clears one bit of ``output_mask`` depending on
    whether the value had to be clamped to the activation range.

    Args:
        kernel_h, kernel_w: Kernel size; used to unroll the MAC statements.
        pad_h, pad_w: Padding; used to unroll the padding initialisation.
        stride: Convolution stride, emitted as the C macro ``STRIDE``.
        dataflow: ``"CHW"`` or ``"CWH"``; selects which generators are used.
        fp_requantize: Stored on the instance; not read by the code shown here.
    """

    def __init__(self, kernel_h, kernel_w, pad_h, pad_w, stride, dataflow="CHW", fp_requantize=False):
        self.pad_h = pad_h
        self.pad_w = pad_w
        self.kernel_h = kernel_h
        self.kernel_w = kernel_w
        self.stride = stride
        self.arch = "ARMv7E-M"  # by default
        self.dataflow = dataflow
        self.fp_requantize = fp_requantize

    def setArch(self, arch):
        """Set the target ISA string printed in the generated file header."""
        self.arch = arch

    def __genCode(self):
        """Assemble the whole generated C file and return it as a string.

        Order: header, kernel prototype, entry-point definition (buffer
        initialisation and per-channel loop), then the kernel definition.
        A dataflow other than "CHW"/"CWH" contributes no body, as before.
        """
        retString = self.__genHeader()
        retString += self.__genKernelDefine() + ";\n"
        retString += self.genFuncDefine() + "\n{\n"
        if self.dataflow == "CHW":
            retString += self.__genBufferInitialization()
            retString += self.__genInplaceKernel()
        elif self.dataflow == "CWH":
            retString += self.__genBufferInitializationCWH()
            retString += self.__genInplaceKernelCWH()
        retString += self.__genEndStr()
        if self.dataflow == "CHW":
            retString += "\n" + self.__genKernel()
        elif self.dataflow == "CWH":
            retString += "\n" + self.__genKernelCWH()
        return retString

    def __getFunctionName(self):
        """Return the C name of the generated entry-point function."""
        return (
            "depthwise_kernel"
            + str(self.kernel_h)
            + "x"
            + str(self.kernel_w)
            + "_stride"
            + str(self.stride)
            + "_inplace_"
            + self.dataflow
            + "_fpreq_bitmask"
        )

    def __getKernelName(self):
        """Return the C name of the generated per-channel kernel function."""
        return (
            "depthwise_kernel"
            + str(self.kernel_h)
            + "x"
            + str(self.kernel_w)
            + "_stride"
            + str(self.stride)
            + "_inplace_kernel_"
            + self.dataflow
            + "_fpreq_bitmask"
        )

    def genFuncDefine(self):
        """Return the C prototype of the entry point, without a trailing ``;``."""
        return (
            "tinyengine_status "
            + self.__getFunctionName()
            + """(q7_t *input, const uint16_t input_x, const uint16_t input_y,
                const uint16_t input_ch, const q7_t *kernel, const int32_t *bias, const int32_t *biasR,
                const float *scales,
                const int32_t output_offset, const int32_t input_offset,
                const int32_t output_activation_min,
                const int32_t output_activation_max, q7_t *output, q7_t *output_mask,
                const uint16_t output_x, const uint16_t output_y,
                const uint16_t output_ch, q15_t *runtime_buf, q7_t pad_value)"""
        )

    def __genKernelDefine(self):
        """Return the C prototype of the kernel, without a trailing ``;``."""
        return (
            "void "
            + self.__getKernelName()
            + """(
        const uint16_t output_y, const uint16_t output_x,
        const int32_t *bias, const int32_t *biasR, const q7_t *ksrc, const float *scales,
        q7_t *output, q7_t *output_mask, const int mask_idx, const int32_t output_offset,
        const int32_t activation_min, const int32_t activation_max,
        q7_t *cols_8b_iterptr, const uint16_t column_x, int channel_offset)"""
        )

    def __genHeader(self):
        """Return the banner comment and ``#include`` lines of the C file."""
        # Each #include must start its own line, and the // comment after the
        # first include must end before the second one begins.
        return (
            """/* This file is automatically generated */
/* ----------------------------------------------------------------------
 * Project: TinyEngine
 * Description: for sparse in-place """
            + str(self.kernel_h)
            + "x"
            + str(self.kernel_w)
            + """ depth-wise convolution (HWC->CHW->HWC)
 * Target ISA: """
            + self.arch
            + """
 * Author: wmchen@mit.edu
 * -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h" //TODO: remove this in the future for self-contained
#include "tinyengine_function.h"\n"""
        )

    def __genBufferInitialization(self):
        """Return C code that writes the pad value into the CHW buffer borders."""
        retString = (
            """
    uint16_t c,i,j;
    q7_t *cols_8b_start = (q7_t *)runtime_buf;
    q7_t* cols_8b = (q7_t* )cols_8b_start;

    //Set padding value
    q7_t PAD8 = pad_value;
    /* setup the padding regions for Im2col buffers */
    //top region: 8bit x (input_x + pad_w * 2) x pad_h: unroll by pad value
    for(i = 0; i < input_x + """
            + str(self.pad_w * 2)
            + """; i++){"""
        )
        # The stores are unrolled at generation time, one per padded row/column.
        retString += "\n        *cols_8b++ = PAD8;" * self.pad_h
        retString += """
    }

    //middle regions: left and right regions
    for(i = 0; i < input_y; i++){"""
        retString += "\n        *cols_8b++ = PAD8;//left" * self.pad_w
        retString += "\n        cols_8b += input_x; //skip middle"
        retString += "\n        *cols_8b++ = PAD8;//right" * self.pad_w
        retString += (
            """
    }

    //bottom region: 8bit x (input_x + pad_w * 2) x pad_h: unroll by pad value
    for(i = 0; i < input_x + """
            + str(self.pad_w * 2)
            + """; i++){"""
        )
        retString += "\n        *cols_8b++ = PAD8;" * self.pad_h
        retString += """
    }

    const q7_t *src;
    const q7_t *ksrc = kernel;
"""
        return retString

    def __genBufferInitializationCWH(self):
        """Return C code that writes the pad value into the CWH buffer borders.

        Mirrors ``__genBufferInitialization`` with the roles of x/y and of
        pad_w/pad_h exchanged.
        """
        retString = (
            """
    uint16_t c,i,j;
    q7_t *cols_8b_start = (q7_t *)runtime_buf;
    q7_t* cols_8b = (q7_t* )cols_8b_start;

    //HWC to CWH data flow
    //Set padding value
    q7_t PAD8 = pad_value;
    /* setup the padding regions for Im2col buffers */
    //top region: 8bit x (input_y + pad_h * 2) x pad_w: unroll by pad value
    for(i = 0; i < input_y + """
            + str(self.pad_h * 2)
            + """; i++){"""
        )
        retString += "\n        *cols_8b++ = PAD8;" * self.pad_w
        retString += """
    }

    //middle regions: left and right regions
    for(i = 0; i < input_x; i++){"""
        retString += "\n        *cols_8b++ = PAD8;//left" * self.pad_h
        retString += "\n        cols_8b += input_y; //skip middle"
        retString += "\n        *cols_8b++ = PAD8;//right" * self.pad_h
        retString += (
            """
    }

    //bottom region: 8bit x (input_y + pad_h * 2) x pad_w: unroll by pad value
    for(i = 0; i < input_y + """
            + str(self.pad_h * 2)
            + """; i++){"""
        )
        retString += "\n        *cols_8b++ = PAD8;" * self.pad_w
        retString += """
    }

    const q7_t *src;
    const q7_t *ksrc = kernel;
"""
        return retString

    def __genInplaceKernelCWH(self):
        """Return the per-channel C loop of the entry point for CWH dataflow."""
        retString = """
    for (c = 0; c < input_ch; c++){"""
        retString += self.__genFixedLoadHWC2CWH()
        retString += self.__assignInplaceOut(2, "input")
        # NOTE(review): no __genHandle1CH_CWH method is defined in this class as
        # shown, so this call raises AttributeError and the "CWH" dataflow
        # cannot generate a file until that helper is supplied.
        retString += self.__genHandle1CH_CWH(2, "input_ch")
        retString += """
    }\n"""
        return retString

    def __genFixedLoadHWC2CWH(self):
        """Return C code copying one input channel into the padded CWH buffer."""
        return (
            """
        cols_8b = (q7_t*)(cols_8b_start + """
            + str(self.pad_w)
            + """ * (input_y) + """
            + str(self.pad_h * self.pad_w * 2)
            + """); //skip """
            + str(self.pad_w)
            + """ rows
        src = input;
        for(i = 0; i < input_x; i++){
            cols_8b += """
            + str(self.pad_h)
            + """;//skip front
            for(j = 0; j < input_y; j++){
                *cols_8b++ = src[input_x * j * input_ch];// + input_offset;
            }
            src += input_ch;
            cols_8b += """
            + str(self.pad_h)
            + """;//skip end
        }\n"""
        )

    def __genInplaceKernel(self):
        """Return the per-channel C loop of the entry point for CHW dataflow."""
        retString = """
    for (c = 0; c < input_ch; c++){"""
        retString += self.__genFixedLoadHWC2CHW()
        retString += self.__assignInplaceOut(2, "input")
        retString += self.__genHandle1CH(2, "input_ch")
        retString += """
    }\n"""
        return retString

    def __genFixedLoadHWC2CHW(self):
        """Return C code copying one input channel into the padded CHW buffer."""
        return (
            """
        cols_8b = (q7_t*)(cols_8b_start + """
            + str(self.pad_h)
            + """ * (input_x) + """
            + str(self.pad_h * self.pad_w * 2)
            + """); //skip """
            + str(self.pad_h)
            + """ rows
        src = input;
        for(i = 0; i < input_y; i++){
            cols_8b += """
            + str(self.pad_w)
            + """;//skip front
            for(j = 0; j < input_x; j++){
                *cols_8b++ = *src;// + input_offset;
                src += input_ch;
            }
            cols_8b += """
            + str(self.pad_w)
            + """;//skip end
        }\n"""
        )

    def __genEndStr(self):
        """Return the closing brace of the entry-point function."""
        return "\n}"

    def __assignInplaceOut(self, pre_indent, out_str):
        """Return the C declaration of ``inplace_out`` initialised to *out_str*."""
        return " " * pre_indent + "q7_t *inplace_out = " + out_str + ";\n"

    def __genHandle1CH(self, pre_indent, out_offset_str):
        """Return the C statements that process one channel (CHW dataflow).

        The emitted code advances ``output_mask`` when ``c`` is a non-zero
        multiple of 8, calls the kernel with ``c % 8`` as the mask bit index,
        then advances ``ksrc`` by one kernel and ``input`` by one element.
        """
        indent = " " * pre_indent
        retString = indent + "if (c % 8 == 0 && c > 1) output_mask++;\n"
        retString += (
            indent
            + self.__getKernelName()
            + "(output_y, output_x, bias++, biasR++, ksrc, scales++, inplace_out, "
            + "output_mask, c % 8, output_offset,output_activation_min, output_activation_max,cols_8b_start, input_x, "
            + out_offset_str
            + ");\n"
        )
        retString += indent + "ksrc += " + str(self.kernel_h * self.kernel_w) + ";\n"
        retString += indent + "input++;\n"
        return retString

    def genFile(self, path):
        """Write the generated C file into directory *path*.

        The file is named after the entry-point function with a ``.c`` suffix.
        """
        import os

        outpath = os.path.join(path, self.__getFunctionName() + ".c")
        # Build the text first so a generation error does not leave an empty file.
        code = self.__genCode()
        with open(outpath, "w") as outf:
            outf.write(code)

    # ********************* KERNEL ********************* #
    def __genMACStr(self, input_cnt, kernel_cnt, stride):
        """Return two MAC statements: ``sum0`` and ``sum1`` (input shifted by *stride*)."""
        weight = "ksrc[" + str(kernel_cnt) + "]"
        ret = " sum0 += cols_8b[" + str(input_cnt) + "]*" + weight + ";\n"
        ret += " sum1 += cols_8b[" + str(input_cnt + stride) + "]*" + weight + ";\n"
        return ret

    def __genLeftMACStr(self, input_cnt, kernel_cnt, stride):
        """Return one MAC statement for the left-over output (``stride`` unused)."""
        return " sum += cols_8b[" + str(input_cnt) + "]*ksrc[" + str(kernel_cnt) + "];\n"

    def __genConvRows(self, n_rows, n_taps, row_pad, gen_mac):
        """Return unrolled MACs for an ``n_rows`` x ``n_taps`` kernel.

        Between rows the emitted code advances ``cols_8b`` by
        ``column_x + 2 * row_pad``.  *gen_mac* produces the statement(s) for
        one tap; kernel weights are numbered consecutively across rows.
        """
        chunks = []
        tap = 0
        for row in range(n_rows):
            if row > 0:
                chunks.append(" cols_8b += column_x + " + str(row_pad * 2) + ";\n")
            for col in range(n_taps):
                chunks.append(gen_mac(col, tap, self.stride))
                tap += 1
        return "".join(chunks)

    def __genConvString(self):
        """Return the unrolled paired MACs for CHW dataflow."""
        return self.__genConvRows(self.kernel_h, self.kernel_w, self.pad_w, self.__genMACStr)

    def __genConvStringCWH(self):
        """Return the unrolled paired MACs for CWH dataflow."""
        return self.__genConvRows(self.kernel_w, self.kernel_h, self.pad_h, self.__genMACStr)

    def __genConvLeftStringCWH(self):
        """Return the unrolled single MACs for the CWH left-over output."""
        return self.__genConvRows(self.kernel_w, self.kernel_h, self.pad_h, self.__genLeftMACStr)

    def __genConvLeftString(self):
        """Return the unrolled single MACs for the CHW left-over output."""
        return self.__genConvRows(self.kernel_h, self.kernel_w, self.pad_w, self.__genLeftMACStr)

    def __genPairRequantize(self):
        """Return C code that scales, clamps and stores ``sum0``/``sum1``.

        A mask bit is set when the value was not clamped and cleared
        otherwise.  The same text is used by the CHW and the CWH kernel.
        """
        return """
            /* requantize */
            sum0 = (float) sum0 * *scales;
            sum0 += output_offset;
            mask_value = 1;
            if (sum0 < activation_min){
                sum0 = activation_min;
                mask_value = 0;
            }
            if (sum0 > activation_max){
                sum0 = activation_max;
                mask_value = 0;
            }
            output[(i * output_x + j * 2) * channel_offset] = sum0;
            if (mask_value == 1)
                BIT_SET(output_mask[(i * output_x + j * 2) * channel_offset / 8], mask_idx);
            else
                BIT_CLEAR(output_mask[(i * output_x + j * 2) * channel_offset / 8], mask_idx);

            sum1 = (float) sum1 * *scales;
            sum1 += output_offset;
            mask_value = 1;
            if (sum1 < activation_min){
                sum1 = activation_min;
                mask_value = 0;
            }
            if (sum1 > activation_max){
                sum1 = activation_max;
                mask_value = 0;
            }
            output[(i * output_x + (j * 2 + 1)) * channel_offset] = sum1;
            if (mask_value == 1)
                BIT_SET(output_mask[(i * output_x + (j * 2 + 1)) * channel_offset / 8], mask_idx);
            else
                BIT_CLEAR(output_mask[(i * output_x + (j * 2 + 1)) * channel_offset / 8], mask_idx);

            cols_8b_iterptr += STRIDE * 2;
        }
        """

    def __genLeftRequantize(self):
        """Return C code that scales, clamps and stores the left-over ``sum``."""
        return """
            sum = (float) sum * *scales;
            sum += output_offset;
            mask_value = 1;
            if (sum < activation_min){
                sum = activation_min;
                mask_value = 0;
            }
            if (sum > activation_max){
                sum = activation_max;
                mask_value = 0;
            }
            output[(i * output_x + output_x - 1) * channel_offset] = sum;
"""

    def __genKernel(self):
        """Return the C definition of the per-channel kernel (CHW dataflow)."""
        retString = self.__genKernelDefine() + "\n{\n"
        retString += " #define STRIDE " + str(self.stride) + "\n"
        retString += " int i, j;\n"
        retString += " q7_t mask_value;\n"
        # initialize accumulators as bias
        retString += """    /* MACs for each output */
    for (i = 0; i < output_y; i++) {
        for (j = 0; j < output_x / 2; j++) {
            q7_t *cols_8b = cols_8b_iterptr;

            q31_t sum0 = bias[0] + biasR[0];
            q31_t sum1 = bias[0] + biasR[0];
"""
        retString += "            /* computation */\n"
        retString += self.__genConvString()
        retString += self.__genPairRequantize()
        # left over for odd dimension
        retString += """if (output_x & 1) {
            q7_t * cols_8b = cols_8b_iterptr;
            q31_t sum = bias[0] + biasR[0];\n"""
        retString += self.__genConvLeftString()
        retString += self.__genLeftRequantize()
        retString += """            if (mask_value == 1)
                BIT_SET(output_mask[(i * output_x + output_x - 1) * channel_offset / 8], mask_idx);
            else
                BIT_CLEAR(output_mask[(i * output_x + output_x - 1) * channel_offset / 8], mask_idx);

            cols_8b_iterptr += STRIDE;
        }
        """
        if self.stride != 1:
            retString += "cols_8b_iterptr += " + str(self.pad_w) + " * 2 - (column_x & 1);\n"
            retString += " cols_8b_iterptr += (STRIDE - 1) * (column_x + " + str(self.pad_w) + " * 2);\n"
        else:
            retString += "cols_8b_iterptr += " + str(self.pad_w) + " * 2;\n"
        # end of function
        retString += "    }\n}\n"
        return retString

    def __genKernelCWH(self):
        """Return the C definition of the per-channel kernel (CWH dataflow)."""
        retString = self.__genKernelDefine() + "\n{\n"
        retString += " #define STRIDE " + str(self.stride) + "\n"
        retString += " int i, j;\n"
        # The emitted body assigns mask_value, so it has to be declared here
        # just as in the CHW kernel; without it the generated C does not compile.
        retString += " q7_t mask_value;\n"
        # initialize accumulators as bias
        retString += """    /* MACs for each output */
    for (j = 0; j < output_x; j++) {
        for (i = 0; i < output_y / 2; i++) {
            q7_t *cols_8b = cols_8b_iterptr;

            q31_t sum0 = bias[0] + biasR[0];
            q31_t sum1 = bias[0] + biasR[0];
"""
        retString += "            /* computation */\n"
        retString += self.__genConvStringCWH()
        retString += self.__genPairRequantize()
        # left over for odd dimension
        retString += """if (output_y & 1) {
            q7_t * cols_8b = cols_8b_iterptr;
            q31_t sum = bias[0] + biasR[0];\n"""
        retString += self.__genConvLeftStringCWH()
        retString += self.__genLeftRequantize()
        # NOTE(review): unlike every other store in this file, this one writes
        # mask_value as a whole element (no "/ 8", no BIT_SET/BIT_CLEAR) --
        # confirm whether the CWH left-over path should use the bit macros too.
        retString += """            output_mask[(i * output_x + output_x - 1) * channel_offset] = mask_value;

            cols_8b_iterptr += STRIDE;
        }
        """
        if self.stride != 1:
            retString += "cols_8b_iterptr += " + str(self.pad_h) + " * 2 - (column_x & 1);\n"
            retString += " cols_8b_iterptr += (STRIDE - 1) * (column_x + " + str(self.pad_h) + " * 2);\n"
        else:
            retString += "cols_8b_iterptr += " + str(self.pad_h) + " * 2;\n"
        # end of function
        retString += "    }\n}\n"
        return retString

    # ********************* unit test ********************* #
    def genTestUint(self, dirpath, height, width, channel):
        """Write a C test/benchmark harness for this layer into *dirpath*.

        The file is named ``test_<function name>.h``.  It defines input and
        output arrays of ``width * height * channel`` elements and a test
        function that calls ``arm_depthwise_conv_s8_opt`` and the generated
        entry point, then times both over ``BATCHNUM`` iterations.
        """
        import os

        # NOTE(review): the harness passes the generated function a different
        # argument list (shift/multiplier, no biasR/scales/output_mask) than
        # the prototype from genFuncDefine -- confirm before relying on it.
        fn = str(self.__getFunctionName())
        n_elems = str(width * height * channel)
        ramp = "".join(str(i % 128) + "," for i in range(width * height * channel))
        n_weights = self.kernel_h * self.kernel_w * channel
        sbuf_len = str(max((width + self.pad_w * 2) * (height + self.pad_h * 2), n_weights * 2))
        conv_geometry = ",".join(
            str(v) for v in (self.kernel_w, self.kernel_h, self.pad_w, self.pad_h, self.stride, self.stride)
        )

        string = (
            """#include "stdio.h"
#include "scratch_buffer.h"
extern "C" {
#include "kernel_buffer.h"
#include "tinyengine_function.h"
}
UART_HandleTypeDef UART;

void printLog(const char* s) {
  static bool is_initialized = false;
  if (!is_initialized) {
    UART.Instance = USART1;
    UART.Init.BaudRate = 115200;
    UART.Init.WordLength = UART_WORDLENGTH_8B;
    UART.Init.StopBits = UART_STOPBITS_1;
    UART.Init.Parity = UART_PARITY_NONE;
    UART.Init.Mode = UART_MODE_TX_RX;
    UART.Init.HwFlowCtl = UART_HWCONTROL_NONE;
    UART.Init.OverSampling = UART_OVERSAMPLING_16;
    UART.Init.OneBitSampling = UART_ONE_BIT_SAMPLE_DISABLE;
    UART.AdvancedInit.AdvFeatureInit = UART_ADVFEATURE_NO_INIT;
    if (HAL_UART_Init(&UART) != HAL_OK) {
      //Error handling
    }
    is_initialized = true;
  }

  HAL_UART_Transmit(&UART, (uint8_t *)s, strlen(s), 10);
}

q7_t input["""
            + n_elems
            + """] = {"""
            + ramp
            + """};
q7_t input1["""
            + n_elems
            + """] = {"""
            + ramp
            + """};
q7_t output1["""
            + n_elems
            + """];
q7_t output2["""
            + n_elems
            + """];

void test_"""
            + fn
            + """()\n{
    printLog("test layer """
            + fn
            + """'s depthwise convolution, input_y:"""
            + str(height)
            + """,input_x:"""
            + str(width)
            + """,input_ch:"""
            + str(channel)
            + """\\r\\n");
    int input_x = """
            + str(width)
            + ", input_y = "
            + str(height)
            + ", input_ch = "
            + str(channel)
            + ", output_ch = "
            + str(channel)
            + ";\n"
            + """    int output_x = """
            + str(width // self.stride)
            + ", output_y = "
            + str(height // self.stride)
            + ";\n"
            + """    int32_t kbuf[1];
    set_kernel_buffer((int32_t*)kbuf, 1);
    int16_t sbuf["""
            + sbuf_len
            + """];
    set_scratch_buffer((int16_t*)sbuf, """
            + sbuf_len
            + """);
    q7_t kernelchw["""
            + str(n_weights)
            + "] = {"
            + "1," * n_weights
            + """};
    q7_t kernelhwc["""
            + str(n_weights)
            + "] = {"
            + "1," * n_weights
            + """};
    int32_t bias["""
            + str(channel)
            + "] = {"
            + "".join(str(i * 100) + "," for i in range(channel))
            + """};
    int32_t multiplier["""
            + str(channel)
            + "] = {"
            + "1706270591,1380135680," * (channel // 2)
            + """};
    int32_t shift["""
            + str(channel)
            + "] = {"
            + "-6,-5," * (channel // 2)
            + """};

    arm_depthwise_conv_s8_opt(input, input_x, input_y, input_ch, kernelhwc, output_ch, """
            + conv_geometry
            + """, bias, output1, shift, multiplier, output_x, output_y, -128, 0, -128, 127, 1, 1, sbuf);
    """
            + fn
            + """(input, input_x, input_y, input_ch, kernelchw, bias, shift, multiplier, -128, 0,-128, 127,"""
            + """ output2, output_x, output_y, output_ch, sbuf, 0);

    bool correct = true;
    int i;
    for (i = 0; i < output_x * output_y * output_ch; i++){
        if(output1[i] != input[i]){
            correct = false;
            break;
        }
    }

    char buf[100];
    if (correct)
        printLog("Results match\\r\\n");
    else{
        printLog("Error results\\r\\n");
        sprintf(buf,"First inconsistency: %d\\r\\n", i);
        printLog(buf);
    }

    printLog("Test speed now\\r\\n");
    # define BATCHNUM 100
    float fmac = output_x * output_y * input_ch * """
            + str(self.kernel_h * self.kernel_w)
            + """;
    int opstart = HAL_GetTick();
    for(i = 0; i < BATCHNUM; i++){
        arm_depthwise_conv_s8_opt(input, input_x, input_y, input_ch, kernelhwc, output_ch, """
            + conv_geometry
            + """, bias, output1, shift, multiplier, output_x, output_y, -128, 128, -128, 127, 1, 1, sbuf);
    }
    int opend = HAL_GetTick();
    float time = opend - opstart;
    time /= BATCHNUM;//ms
    float mac_pers = fmac / time / 1000;//*1000000/1000 > /1000
    sprintf(buf,"%.2f|%.3f|", time, mac_pers);
    printLog(buf);

    opstart = HAL_GetTick();
    for(int i = 0; i < BATCHNUM; i++){
        """
            + fn
            + """(input, input_x, input_y, input_ch, kernelchw, bias, shift, multiplier, -128, 128,-128, 127,"""
            + """ output2, output_x, output_y, output_ch, sbuf, -128);
    }
    opend = HAL_GetTick();
    time = opend - opstart;
    time /= BATCHNUM;//ms
    mac_pers = fmac / time / 1000;//*1000000/1000 > /1000
    sprintf(buf,"%.2f|%.3f|\\r\\n", time, mac_pers);
    printLog(buf);
}\n"""
        )

        # os.path.join instead of a hard-coded "\\" so the file lands inside
        # dirpath on every platform.
        outpath = os.path.join(dirpath, "test_" + fn + ".h")
        with open(outpath, "w") as outf:
            outf.write(string)


class includeFile(object):
    """Accumulate C declarations and write them to ``genInclude.h``."""

    def __init__(self, path):
        self.path = path  # directory that receives genInclude.h
        self.defstring = ""  # declarations collected so far

    def addDefine(self, defstr):
        """Append *defstr* followed by ``;`` and a newline."""
        self.defstring += defstr + ";\n"

    def writeFile(self):
        """Write all collected declarations to ``<path>/genInclude.h``."""
        import os

        outpath = os.path.join(self.path, "genInclude.h")
        # The context manager closes (and flushes) the file deterministically.
        with open(outpath, "w") as outf:
            outf.write(self.defstring)