diff --git a/README.md b/README.md index 9584020..8874071 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ TinyEngine is a part of MCUNet, which also consists of TinyNAS. MCUNet is a syst **If you are interested in getting updates, please sign up [here](https://forms.gle/UW1uUmnfk1k6UJPPA) to get notified!** +- **(2023/02)** We release the source code of the [training demo](examples/openmv_training_sparse) on OpenMV Cam H7. - **(2022/12)** We update the [measured results](README.md#measured-results) on STM32H743 with the new versions of the inference libraries. - **(2022/12)** We release the source code for patch-based inference and update the [tutorial of our inference demo](tutorial/inference/README.md) to provide option that generates patch-based inference code for the visual wake words (VWW) demo. - **(2022/11)** We release the source code of Tiny Training Engine, and include the [tutorial of our training demo](tutorial/training) for training a visual wake words (VWW) model on microcontrollers. diff --git a/TinyEngine/include/genNN.h b/TinyEngine/include/genNN.h index 135445b..91e88ee 100644 --- a/TinyEngine/include/genNN.h +++ b/TinyEngine/include/genNN.h @@ -28,8 +28,8 @@ signed char* getInput(); signed char* getOutput(); float* getOutput_fp(); int32_t* getOutput_int32(); -static float lr = 0.0008; -static float blr = 0.0004; +static float lr __attribute__((unused)) = 0.0008; // To suppress warning +static float blr __attribute__((unused)) = 0.0004; // To suppress warning void setupBuffer(); void invoke(float* labels); diff --git a/TinyEngine/include/img2col_element.h b/TinyEngine/include/img2col_element.h index 6eca017..7cdf5b0 100644 --- a/TinyEngine/include/img2col_element.h +++ b/TinyEngine/include/img2col_element.h @@ -1,546 +1,482 @@ -/* ---------------------------------------------------------------------- - * Project: TinyEngine - * Title: img2col_element.h - * - * Reference papers: - * - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020 - * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021 - * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022 - * Contact authors: - * - Wei-Ming Chen, wmchen@mit.edu - * - Wei-Chen Wang, wweichen@mit.edu - * - Ji Lin, jilin@mit.edu - * - Ligeng Zhu, ligeng@mit.edu - * - Song Han, songhan@mit.edu - * - * Target ISA: ARMv7E-M - * -------------------------------------------------------------------- */ - -#ifndef ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ -#define ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ - -#include "arm_nnsupportfunctions.h" -#include "arm_math_memory.h" - -#define b2_q7_q15_offset_ele(src,dst) \ -/* convert from q7 to q15 and then store the results in the destination buffer */ \ -/*in_q7x4 = b2_nn_read_q7x4_ia((const q7_t **)&src); \ -in_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ -in_q15x2_2 = __SXTB16(in_q7x4); */ \ -in_q15x2_1 = ((src[0] & 0x0C) >> 2) + ((src[0] & 0xC0) << 10);\ -in_q15x2_2 = (src[0] & 0x03) + ((src[0] & 0x30) << 12);\ -src +=1;\ -out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ -/* Maximum of 9 bits from the addition is expected */ \ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ - \ -out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ - \ -write_q15x2_ia(&dst, out_q15x2_1); \ -write_q15x2_ia(&dst, out_q15x2_2); - -#define b4_q7_q15_offset_ele(src,dst) \ -/* convert from q7 to q15 and then store the results in the destination buffer */ \ -/*in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \ 
-in_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ -in_q15x2_2 = __SXTB16(in_q7x4); */ \ -in_q15x2_1 = ((src[0] & 0xF0) >> 4) + ((src[1] & 0xF0) << 12);\ -in_q15x2_2 = (src[0] & 0x0F) + ((src[1] & 0x0F) << 16);\ -src +=2;\ -out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ -/* Maximum of 9 bits from the addition is expected */ \ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ - \ -out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ - \ -write_q15x2_ia(&dst, out_q15x2_1); \ -write_q15x2_ia(&dst, out_q15x2_2); - -#define q7_q15_offset_ele(src,dst) \ -/* convert from q7 to q15 and then store the results in the destination buffer */ \ -in_q7x4 = arm_nn_read_q7x4_ia((const q7_t **)&src); \ -/* Extract and sign extend each of the four q7 values to q15 */ \ -in_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ -in_q15x2_2 = __SXTB16(in_q7x4); \ - \ -out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ -/* Maximum of 9 bits from the addition is expected */ \ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ - \ -out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ - \ -write_q15x2_ia(&dst, out_q15x2_1); \ -write_q15x2_ia(&dst, out_q15x2_2); - -#define q8_q15_offset_ele(src,dst) \ -/* convert from q8 to q15 and then store the results in the destination buffer */ \ -in_q7x4 = arm_nn_read_q7x4_ia((const q8_t **)&src); \ -/* Extend each of the four q8 values to q15 */ \ -in_q15x2_1 = __UXTB16(__ROR(in_q7x4, 8)); \ -in_q15x2_2 = __UXTB16(in_q7x4); \ - \ -out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ -/* Maximum of 9 bits from the addition is expected */ \ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ - \ -out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ - \ -write_q15x2_ia(&dst, out_q15x2_1); \ -write_q15x2_ia(&dst, out_q15x2_2); - -#define b4_q15_offset_reordered_ele(src,dst)\ -/* convert from q7 to q15 and then store the results in the destination buffer */\ -in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src);\ -\ -/* Extract and sign extend each of the four q7 values to q15 */\ -out_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8));\ -out_q15x2_2 = __SXTB16(in_q7x4);\ -\ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2);\ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2);\ -\ -write_q15x2_ia(&dst, out_q15x2_2);\ -write_q15x2_ia(&dst, out_q15x2_1); - -#define b2_q15_offset_reordered_ele(src,dst)\ -/* convert from q7 to q15 and then store the results in the destination buffer */\ -in_q7x4 = b2_nn_read_q7x4_ia(&src);\ -\ -/* Extract and sign extend each of the four q7 values to q15 */\ -out_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8));\ -out_q15x2_2 = __SXTB16(in_q7x4);\ -\ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2);\ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2);\ -\ -write_q15x2_ia(&dst, out_q15x2_2);\ -write_q15x2_ia(&dst, out_q15x2_1); - -#define q7_q15_offset_reordered_ele(src,dst)\ -/* convert from q7 to q15 and then store the results in the destination buffer */\ -in_q7x4 = arm_nn_read_q7x4_ia((const q7_t **)&src);\ -\ -/* Extract and sign extend each of the four q7 values to q15 */\ -out_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8));\ -out_q15x2_2 = __SXTB16(in_q7x4);\ -\ -out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2);\ -out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2);\ -\ -write_q15x2_ia(&dst, out_q15x2_2);\ -write_q15x2_ia(&dst, out_q15x2_1); - -#define q31_assign2(src,dst) \ -*dst++ = *src++; \ -*dst++ = *src++; - 
-#define q31_assign4(src,dst) \ -q31_assign2(src,dst) \ -q31_assign2(src,dst) \ - -#define q31_assign6(src,dst) \ -q31_assign4(src,dst) \ -q31_assign2(src,dst) \ - -#define q31_assign8(src,dst) \ -q31_assign4(src,dst) \ -q31_assign4(src,dst) \ - -#define q31_assign10(src,dst) \ -q31_assign8(src,dst) \ -q31_assign2(src,dst) \ - -#define q31_assign12(src,dst) \ -q31_assign10(src,dst) \ -q31_assign2(src,dst) \ - -#define q31_pad2(dst,padvalue) \ -*dst++ = padvalue; \ -*dst++ = padvalue; \ - -#define q31_pad4(dst,padvalue) \ -q31_pad2(dst,padvalue) \ -q31_pad2(dst,padvalue) \ - -#define q31_pad6(dst,padvalue) \ -q31_pad4(dst,padvalue) \ -q31_pad2(dst,padvalue) \ - -#define q31_pad10(dst,padvalue) \ -q31_pad6(dst,padvalue) \ -q31_pad4(dst,padvalue) \ - -#define q31_pad14(dst,padvalue) \ -q31_pad6(dst,padvalue) \ -q31_pad6(dst,padvalue) \ -q31_pad2(dst,padvalue) \ - - -#define assignq31toq15()\ -dst = (q15_t*)dst_31;\ -dst2 = (q15_t*)dst2_31;\ -dst3 = (q15_t*)dst3_31;\ -dst4 = (q15_t*)dst4_31;\ -dst5 = (q15_t*)dst5_31;\ -dst6 = (q15_t*)dst6_31;\ -dst7 = (q15_t*)dst7_31;\ - -#define assignq15toq31()\ -dst_31 = (q31_t*)dst;\ -dst2_31 = (q31_t*)dst2;\ -dst3_31 = (q31_t*)dst3;\ -dst4_31 = (q31_t*)dst4;\ -dst5_31 = (q31_t*)dst5;\ -dst6_31 = (q31_t*)dst6;\ -dst7_31 = (q31_t*)dst7;\ - -/* ---------------------------------- Pad ---------------------------------- */ -#define basic_pad_1row(col,dst_31,pad_out_q15x2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0)\ -{ \ - q31_pad2(dst_31,pad_out_q15x2) \ - block_cnt--; \ -} - -#define basic_pad_2row(col,dst_31,dst2_31,pad_out_q15x2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0)\ -{ \ - q31_pad2(dst_31,pad_out_q15x2) \ - q31_pad2(dst2_31,pad_out_q15x2) \ - block_cnt--; \ -} - -#define basic_pad_3row(col,dst_31,dst2_31,dst3_31,pad_out_q15x2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0)\ -{ \ - q31_pad2(dst_31,pad_out_q15x2) \ - q31_pad2(dst2_31,pad_out_q15x2) \ - q31_pad2(dst3_31,pad_out_q15x2) \ - block_cnt--; \ -} - -#define basic_pad_4row(col,dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0)\ -{ \ - q31_pad2(dst_31,pad_out_q15x2) \ - q31_pad2(dst2_31,pad_out_q15x2) \ - q31_pad2(dst3_31,pad_out_q15x2) \ - q31_pad2(dst4_31,pad_out_q15x2) \ - block_cnt--; \ -} - -#define basic_pad_5row(col,dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0)\ -{ \ - q31_pad2(dst_31,pad_out_q15x2) \ - q31_pad2(dst2_31,pad_out_q15x2) \ - q31_pad2(dst3_31,pad_out_q15x2) \ - q31_pad2(dst4_31,pad_out_q15x2) \ - q31_pad2(dst5_31,pad_out_q15x2) \ - block_cnt--; \ -} - -#define pad_1row_1col(dst_31,pad_out_q15x2) basic_pad_1row(1,dst_31,pad_out_q15x2) -#define pad_1row_2col(dst_31,pad_out_q15x2) basic_pad_1row(2,dst_31,pad_out_q15x2) -#define pad_1row_3col(dst_31,pad_out_q15x2) basic_pad_1row(3,dst_31,pad_out_q15x2) -#define pad_2row_1col(dst_31,dst2_31,pad_out_q15x2) basic_pad_2row(1,dst_31,dst2_31,pad_out_q15x2) -#define pad_2row_2col(dst_31,dst2_31,pad_out_q15x2) basic_pad_2row(2,dst_31,dst2_31,pad_out_q15x2) -#define pad_2row_3col(dst_31,dst2_31,pad_out_q15x2) basic_pad_2row(3,dst_31,dst2_31,pad_out_q15x2) -#define pad_2row_4col(dst_31,dst2_31,pad_out_q15x2) basic_pad_2row(4,dst_31,dst2_31,pad_out_q15x2) -#define pad_2row_5col(dst_31,dst2_31,pad_out_q15x2) basic_pad_2row(5,dst_31,dst2_31,pad_out_q15x2) -#define pad_3row_1col(dst_31,dst2_31,dst3_31,pad_out_q15x2) basic_pad_3row(1,dst_31,dst2_31,dst3_31,pad_out_q15x2) -#define 
pad_3row_2col(dst_31,dst2_31,dst3_31,pad_out_q15x2) basic_pad_3row(2,dst_31,dst2_31,dst3_31,pad_out_q15x2) -#define pad_3row_3col(dst_31,dst2_31,dst3_31,pad_out_q15x2) basic_pad_3row(3,dst_31,dst2_31,dst3_31,pad_out_q15x2) -#define pad_4row_1col(dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2) basic_pad_4row(1,dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2) -#define pad_4row_2col(dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2) basic_pad_4row(2,dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2) -#define pad_4row_3col(dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2) basic_pad_4row(3,dst_31,dst2_31,dst3_31,dst4_31,pad_out_q15x2) -#define pad_5row_1col(dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2) basic_pad_5row(1,dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2) -#define pad_5row_2col(dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2) basic_pad_5row(2,dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2) -#define pad_5row_3col(dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2) basic_pad_5row(3,dst_31,dst2_31,dst3_31,dst4_31,dst5_31,pad_out_q15x2) - -/* ---------------------------------- Load ---------------------------------- */ -#define basic_load_1row(col,src,dst)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - q7_q15_offset_ele(src,dst)\ - block_cnt--;\ -} -#define basic_load_2row(col,src,src2,dst,dst2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - q7_q15_offset_ele(src,dst)\ - q7_q15_offset_ele(src2,dst2)\ - block_cnt--;\ -} -#define basic_load_3row(col,src,src2,src3,dst,dst2,dst3)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - q7_q15_offset_ele(src,dst)\ - q7_q15_offset_ele(src2,dst2)\ - q7_q15_offset_ele(src3,dst3)\ - block_cnt--;\ -} -#define basic_load_4row(col,src,src2,src3,src4,dst,dst2,dst3,dst4)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - q7_q15_offset_ele(src,dst)\ - q7_q15_offset_ele(src2,dst2)\ - q7_q15_offset_ele(src3,dst3)\ - q7_q15_offset_ele(src4,dst4)\ - block_cnt--;\ -} -#define basic_load_5row(col,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - q7_q15_offset_ele(src,dst)\ - q7_q15_offset_ele(src2,dst2)\ - q7_q15_offset_ele(src3,dst3)\ - q7_q15_offset_ele(src4,dst4)\ - q7_q15_offset_ele(src5,dst5)\ - block_cnt--;\ -} - -///////////////////////// 4bit ////////////////////////// -#define b4_load_1row(col,src,dst)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b4_q7_q15_offset_ele(src,dst)\ - block_cnt--;\ -} -#define b4_load_2row(col,src,src2,dst,dst2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b4_q7_q15_offset_ele(src,dst)\ - b4_q7_q15_offset_ele(src2,dst2)\ - block_cnt--;\ -} -#define b4_load_3row(col,src,src2,src3,dst,dst2,dst3)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b4_q7_q15_offset_ele(src,dst)\ - b4_q7_q15_offset_ele(src2,dst2)\ - b4_q7_q15_offset_ele(src3,dst3)\ - block_cnt--;\ -} -#define b4_load_4row(col,src,src2,src3,src4,dst,dst2,dst3,dst4)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b4_q7_q15_offset_ele(src,dst)\ - b4_q7_q15_offset_ele(src2,dst2)\ - b4_q7_q15_offset_ele(src3,dst3)\ - b4_q7_q15_offset_ele(src4,dst4)\ - block_cnt--;\ -} -#define b4_load_5row(col,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b4_q7_q15_offset_ele(src,dst)\ - b4_q7_q15_offset_ele(src2,dst2)\ - b4_q7_q15_offset_ele(src3,dst3)\ - b4_q7_q15_offset_ele(src4,dst4)\ - 
b4_q7_q15_offset_ele(src5,dst5)\ - block_cnt--;\ -} -///////////////////////// 2bit ////////////////////////// -#define b2_load_1row(col,src,dst)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b2_q7_q15_offset_ele(src,dst)\ - block_cnt--;\ -} -#define b2_load_2row(col,src,src2,dst,dst2)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b2_q7_q15_offset_ele(src,dst)\ - b2_q7_q15_offset_ele(src2,dst2)\ - block_cnt--;\ -} -#define b2_load_3row(col,src,src2,src3,dst,dst2,dst3)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b2_q7_q15_offset_ele(src,dst)\ - b2_q7_q15_offset_ele(src2,dst2)\ - b2_q7_q15_offset_ele(src3,dst3)\ - block_cnt--;\ -} -#define b2_load_4row(col,src,src2,src3,src4,dst,dst2,dst3,dst4)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b2_q7_q15_offset_ele(src,dst)\ - b2_q7_q15_offset_ele(src2,dst2)\ - b2_q7_q15_offset_ele(src3,dst3)\ - b2_q7_q15_offset_ele(src4,dst4)\ - block_cnt--;\ -} -#define b2_load_5row(col,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5)\ -block_cnt = channel_div4 * col; \ -while (block_cnt > 0) \ -{\ - b2_q7_q15_offset_ele(src,dst)\ - b2_q7_q15_offset_ele(src2,dst2)\ - b2_q7_q15_offset_ele(src3,dst3)\ - b2_q7_q15_offset_ele(src4,dst4)\ - b2_q7_q15_offset_ele(src5,dst5)\ - block_cnt--;\ -} - -#define b4_load_1row_1col(src,dst) b4_load_1row(1,src,dst) -#define b4_load_1row_2col(src,dst) b4_load_1row(2,src,dst) -#define b4_load_1row_3col(src,dst) b4_load_1row(3,src,dst) -#define b4_load_1row_4col(src,dst) b4_load_1row(4,src,dst) -#define b4_load_2row_1col(src,src2,dst,dst2) b4_load_2row(1,src,src2,dst,dst2) -#define b4_load_2row_2col(src,src2,dst,dst2) b4_load_2row(2,src,src2,dst,dst2) -#define b4_load_2row_3col(src,src2,dst,dst2) b4_load_2row(3,src,src2,dst,dst2) -#define b4_load_2row_4col(src,src2,dst,dst2) b4_load_2row(4,src,src2,dst,dst2) -#define b4_load_3row_1col(src,src2,src3,dst,dst2,dst3) b4_load_3row(1,src,src2,src3,dst,dst2,dst3) -#define b4_load_3row_2col(src,src2,src3,dst,dst2,dst3) b4_load_3row(2,src,src2,src3,dst,dst2,dst3) -#define b4_load_3row_3col(src,src2,src3,dst,dst2,dst3) b4_load_3row(3,src,src2,src3,dst,dst2,dst3) -#define b4_load_3row_4col(src,src2,src3,dst,dst2,dst3) b4_load_3row(4,src,src2,src3,dst,dst2,dst3) -#define b4_load_4row_1col(src,src2,src3,src4,dst,dst2,dst3,dst4) b4_load_4row(1,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b4_load_4row_2col(src,src2,src3,src4,dst,dst2,dst3,dst4) b4_load_4row(2,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b4_load_4row_3col(src,src2,src3,src4,dst,dst2,dst3,dst4) b4_load_4row(3,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b4_load_4row_4col(src,src2,src3,src4,dst,dst2,dst3,dst4) b4_load_4row(4,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b4_load_5row_1col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b4_load_5row(1,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define b4_load_5row_2col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b4_load_5row(2,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define b4_load_5row_3col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b4_load_5row(3,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define b4_load_5row_4col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b4_load_5row(4,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) - -#define b2_load_1row_1col(src,dst) b2_load_1row(1,src,dst) -#define b2_load_1row_2col(src,dst) b2_load_1row(2,src,dst) -#define b2_load_1row_3col(src,dst) b2_load_1row(3,src,dst) -#define b2_load_1row_4col(src,dst) 
b2_load_1row(4,src,dst) -#define b2_load_2row_1col(src,src2,dst,dst2) b2_load_2row(1,src,src2,dst,dst2) -#define b2_load_2row_2col(src,src2,dst,dst2) b2_load_2row(2,src,src2,dst,dst2) -#define b2_load_2row_3col(src,src2,dst,dst2) b2_load_2row(3,src,src2,dst,dst2) -#define b2_load_2row_4col(src,src2,dst,dst2) b2_load_2row(4,src,src2,dst,dst2) -#define b2_load_3row_1col(src,src2,src3,dst,dst2,dst3) b2_load_3row(1,src,src2,src3,dst,dst2,dst3) -#define b2_load_3row_2col(src,src2,src3,dst,dst2,dst3) b2_load_3row(2,src,src2,src3,dst,dst2,dst3) -#define b2_load_3row_3col(src,src2,src3,dst,dst2,dst3) b2_load_3row(3,src,src2,src3,dst,dst2,dst3) -#define b2_load_3row_4col(src,src2,src3,dst,dst2,dst3) b2_load_3row(4,src,src2,src3,dst,dst2,dst3) -#define b2_load_4row_1col(src,src2,src3,src4,dst,dst2,dst3,dst4) b2_load_4row(1,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b2_load_4row_2col(src,src2,src3,src4,dst,dst2,dst3,dst4) b2_load_4row(2,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b2_load_4row_3col(src,src2,src3,src4,dst,dst2,dst3,dst4) b2_load_4row(3,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b2_load_4row_4col(src,src2,src3,src4,dst,dst2,dst3,dst4) b2_load_4row(4,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define b2_load_5row_1col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b2_load_5row(1,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define b2_load_5row_2col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b2_load_5row(2,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define b2_load_5row_3col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b2_load_5row(3,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define b2_load_5row_4col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) b2_load_5row(4,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) - -#define load_1row_1col(src,dst) basic_load_1row(1,src,dst) -#define load_1row_2col(src,dst) basic_load_1row(2,src,dst) -#define load_1row_3col(src,dst) basic_load_1row(3,src,dst) -#define load_1row_4col(src,dst) basic_load_1row(4,src,dst) -#define load_2row_1col(src,src2,dst,dst2) basic_load_2row(1,src,src2,dst,dst2) -#define load_2row_2col(src,src2,dst,dst2) basic_load_2row(2,src,src2,dst,dst2) -#define load_2row_3col(src,src2,dst,dst2) basic_load_2row(3,src,src2,dst,dst2) -#define load_2row_4col(src,src2,dst,dst2) basic_load_2row(4,src,src2,dst,dst2) -#define load_3row_1col(src,src2,src3,dst,dst2,dst3) basic_load_3row(1,src,src2,src3,dst,dst2,dst3) -#define load_3row_2col(src,src2,src3,dst,dst2,dst3) basic_load_3row(2,src,src2,src3,dst,dst2,dst3) -#define load_3row_3col(src,src2,src3,dst,dst2,dst3) basic_load_3row(3,src,src2,src3,dst,dst2,dst3) -#define load_3row_4col(src,src2,src3,dst,dst2,dst3) basic_load_3row(4,src,src2,src3,dst,dst2,dst3) -#define load_4row_1col(src,src2,src3,src4,dst,dst2,dst3,dst4) basic_load_4row(1,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define load_4row_2col(src,src2,src3,src4,dst,dst2,dst3,dst4) basic_load_4row(2,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define load_4row_3col(src,src2,src3,src4,dst,dst2,dst3,dst4) basic_load_4row(3,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define load_4row_4col(src,src2,src3,src4,dst,dst2,dst3,dst4) basic_load_4row(4,src,src2,src3,src4,dst,dst2,dst3,dst4) -#define load_5row_1col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) basic_load_5row(1,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define load_5row_2col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) basic_load_5row(2,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define 
load_5row_3col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) basic_load_5row(3,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) -#define load_5row_4col(src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) basic_load_5row(4,src,src2,src3,src4,src5,dst,dst2,dst3,dst4,dst5) - -/* ---------------------------------- Reuse ---------------------------------- */ -#define basic_reuse_1row(col,src_31,dst_31)\ -block_cnt = channel_div4 * col;\ -while (block_cnt > 0)\ -{\ - q31_assign2(src_31,dst_31)\ - block_cnt--;\ -} -#define basic_reuse_2row(col,src_31,src2_31,dst_31,dst2_31)\ -block_cnt = channel_div4 * col;\ -while (block_cnt > 0)\ -{\ - q31_assign2(src_31,dst_31)\ - q31_assign2(src2_31,dst2_31)\ - block_cnt--;\ -} -#define basic_reuse_3row(col,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31)\ -block_cnt = channel_div4 * col;\ -while (block_cnt > 0)\ -{\ - q31_assign2(src_31,dst_31)\ - q31_assign2(src2_31,dst2_31)\ - q31_assign2(src3_31,dst3_31)\ - block_cnt--;\ -} -#define basic_reuse_4row(col,src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31)\ -block_cnt = channel_div4 * col;\ -while (block_cnt > 0)\ -{\ - q31_assign2(src_31,dst_31)\ - q31_assign2(src2_31,dst2_31)\ - q31_assign2(src3_31,dst3_31)\ - q31_assign2(src4_31,dst4_31)\ - block_cnt--;\ -} -#define basic_reuse_5row(col,src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31)\ -block_cnt = channel_div4 * col;\ -while (block_cnt > 0)\ -{\ - q31_assign2(src_31,dst_31)\ - q31_assign2(src2_31,dst2_31)\ - q31_assign2(src3_31,dst3_31)\ - q31_assign2(src4_31,dst4_31)\ - q31_assign2(src5_31,dst5_31)\ - block_cnt--;\ -} - -#define reuse_1row_1col(src_31,dst_31) basic_reuse_1row(1,src_31,dst_31) -#define reuse_1row_2col(src_31,dst_31) basic_reuse_1row(2,src_31,dst_31) -#define reuse_1row_3col(src_31,dst_31) basic_reuse_1row(3,src_31,dst_31) -#define reuse_1row_4col(src_31,dst_31) basic_reuse_1row(4,src_31,dst_31) -#define reuse_1row_5col(src_31,dst_31) basic_reuse_1row(5,src_31,dst_31) -#define reuse_1row_6col(src_31,dst_31) basic_reuse_1row(6,src_31,dst_31) -#define reuse_2row_1col(src_31,src2_31,dst_31,dst2_31) basic_reuse_2row(1,src_31,src2_31,dst_31,dst2_31) -#define reuse_2row_2col(src_31,src2_31,dst_31,dst2_31) basic_reuse_2row(2,src_31,src2_31,dst_31,dst2_31) -#define reuse_2row_3col(src_31,src2_31,dst_31,dst2_31) basic_reuse_2row(3,src_31,src2_31,dst_31,dst2_31) -#define reuse_2row_4col(src_31,src2_31,dst_31,dst2_31) basic_reuse_2row(4,src_31,src2_31,dst_31,dst2_31) -#define reuse_2row_5col(src_31,src2_31,dst_31,dst2_31) basic_reuse_2row(5,src_31,src2_31,dst_31,dst2_31) -#define reuse_2row_6col(src_31,src2_31,dst_31,dst2_31) basic_reuse_2row(6,src_31,src2_31,dst_31,dst2_31) -#define reuse_3row_1col(src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) basic_reuse_3row(1,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) -#define reuse_3row_2col(src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) basic_reuse_3row(2,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) -#define reuse_3row_3col(src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) basic_reuse_3row(3,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) -#define reuse_3row_4col(src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) basic_reuse_3row(4,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) -#define reuse_3row_5col(src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) basic_reuse_3row(5,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) -#define reuse_3row_6col(src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) basic_reuse_3row(6,src_31,src2_31,src3_31,dst_31,dst2_31,dst3_31) -#define 
reuse_4row_3col(src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) basic_reuse_4row(3,src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) -#define reuse_4row_4col(src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) basic_reuse_4row(4,src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) -#define reuse_4row_5col(src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) basic_reuse_4row(5,src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) -#define reuse_4row_6col(src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) basic_reuse_4row(6,src_31,src2_31,src3_31,src4_31,dst_31,dst2_31,dst3_31,dst4_31) -#define reuse_5row_3col(src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) basic_reuse_5row(3,src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) -#define reuse_5row_4col(src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) basic_reuse_5row(4,src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) -#define reuse_5row_5col(src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) basic_reuse_5row(5,src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) -#define reuse_5row_6col(src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) basic_reuse_5row(6,src_31,src2_31,src3_31,src4_31,src5_31,dst_31,dst2_31,dst3_31,dst4_31,dst5_31) -#endif /* ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ */ +/* ---------------------------------------------------------------------- + * Project: TinyEngine + * Title: img2col_element.h + * + * Reference papers: + * - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020 + * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021 + * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022 + * Contact authors: + * - Wei-Ming Chen, wmchen@mit.edu + * - Wei-Chen Wang, wweichen@mit.edu + * - Ji Lin, jilin@mit.edu + * - Ligeng Zhu, ligeng@mit.edu + * - Song Han, songhan@mit.edu + * + * Target ISA: ARMv7E-M + * -------------------------------------------------------------------- */ + +#ifndef ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ +#define ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ + +#include "arm_nnsupportfunctions.h" + +#define b2_q7_q15_offset_ele(src, dst) \ + /* convert from q7 to q15 and then store the results in the destination buffer */ \ + /*in_q7x4 = b2_nn_read_q7x4_ia((const q7_t **)&src); \ + in_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ + in_q15x2_2 = __SXTB16(in_q7x4); */ \ + in_q15x2_1 = ((src[0] & 0x0C) >> 2) + ((src[0] & 0xC0) << 10); \ + in_q15x2_2 = (src[0] & 0x03) + ((src[0] & 0x30) << 12); \ + src += 1; \ + out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ + /* Maximum of 9 bits from the addition is expected */ \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_1); \ + write_q15x2_ia(&dst, out_q15x2_2); + +#define b4_q7_q15_offset_ele(src, dst) \ + /* convert from q7 to q15 and then store the results in the destination buffer */ \ + /*in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \ + in_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ + in_q15x2_2 = __SXTB16(in_q7x4); */ \ + in_q15x2_1 = ((src[0] & 0xF0) >> 4) + ((src[1] & 0xF0) << 12); \ + in_q15x2_2 = (src[0] & 0x0F) + ((src[1] & 0x0F) << 16); \ + src += 2; \ + out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 
16); \ + /* Maximum of 9 bits from the addition is expected */ \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_1); \ + write_q15x2_ia(&dst, out_q15x2_2); + +#define q7_q15_offset_ele(src, dst) \ + /* convert from q7 to q15 and then store the results in the destination buffer */ \ + in_q7x4 = arm_nn_read_q7x4_ia((const q7_t **)&src); \ + /* Extract and sign extend each of the four q7 values to q15 */ \ + in_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ + in_q15x2_2 = __SXTB16(in_q7x4); \ + \ + out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ + /* Maximum of 9 bits from the addition is expected */ \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_1); \ + write_q15x2_ia(&dst, out_q15x2_2); + +#define q8_q15_offset_ele(src, dst) \ + /* convert from q8 to q15 and then store the results in the destination buffer */ \ + in_q7x4 = arm_nn_read_q7x4_ia((const q8_t **)&src); \ + /* Extend each of the four q8 values to q15 */ \ + in_q15x2_1 = __UXTB16(__ROR(in_q7x4, 8)); \ + in_q15x2_2 = __UXTB16(in_q7x4); \ + \ + out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \ + /* Maximum of 9 bits from the addition is expected */ \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_1); \ + write_q15x2_ia(&dst, out_q15x2_2); + +#define b4_q15_offset_reordered_ele(src, dst) \ + /* convert from q7 to q15 and then store the results in the destination buffer */ \ + in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \ + \ + /* Extract and sign extend each of the four q7 values to q15 */ \ + out_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ + out_q15x2_2 = __SXTB16(in_q7x4); \ + \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_2); \ + write_q15x2_ia(&dst, out_q15x2_1); + +#define b2_q15_offset_reordered_ele(src, dst) \ + /* convert from q7 to q15 and then store the results in the destination buffer */ \ + in_q7x4 = b2_nn_read_q7x4_ia(&src); \ + \ + /* Extract and sign extend each of the four q7 values to q15 */ \ + out_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ + out_q15x2_2 = __SXTB16(in_q7x4); \ + \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_2); \ + write_q15x2_ia(&dst, out_q15x2_1); + +#define q7_q15_offset_reordered_ele(src, dst) \ + /* convert from q7 to q15 and then store the results in the destination buffer */ \ + in_q7x4 = arm_nn_read_q7x4_ia((const q7_t **)&src); \ + \ + /* Extract and sign extend each of the four q7 values to q15 */ \ + out_q15x2_1 = __SXTB16(__ROR(in_q7x4, 8)); \ + out_q15x2_2 = __SXTB16(in_q7x4); \ + \ + out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \ + out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \ + \ + write_q15x2_ia(&dst, out_q15x2_2); \ + write_q15x2_ia(&dst, out_q15x2_1); + +#define q31_assign2(src, dst) \ + *dst++ = *src++; \ + *dst++ = *src++; + +#define q31_assign4(src, dst) q31_assign2(src, dst) q31_assign2(src, dst) + +#define q31_assign6(src, dst) q31_assign4(src, dst) q31_assign2(src, dst) + +#define 
q31_assign8(src, dst) q31_assign4(src, dst) q31_assign4(src, dst) + +#define q31_assign10(src, dst) q31_assign8(src, dst) q31_assign2(src, dst) + +#define q31_assign12(src, dst) q31_assign10(src, dst) q31_assign2(src, dst) + +#define q31_pad2(dst, padvalue) \ + *dst++ = padvalue; \ + *dst++ = padvalue; + +#define q31_pad4(dst, padvalue) q31_pad2(dst, padvalue) q31_pad2(dst, padvalue) + +#define q31_pad6(dst, padvalue) q31_pad4(dst, padvalue) q31_pad2(dst, padvalue) + +#define q31_pad10(dst, padvalue) q31_pad6(dst, padvalue) q31_pad4(dst, padvalue) + +#define q31_pad14(dst, padvalue) q31_pad6(dst, padvalue) q31_pad6(dst, padvalue) q31_pad2(dst, padvalue) + +#define assignq31toq15() \ + dst = (q15_t *)dst_31; \ + dst2 = (q15_t *)dst2_31; \ + dst3 = (q15_t *)dst3_31; \ + dst4 = (q15_t *)dst4_31; \ + dst5 = (q15_t *)dst5_31; \ + dst6 = (q15_t *)dst6_31; \ + dst7 = (q15_t *)dst7_31; + +#define assignq15toq31() \ + dst_31 = (q31_t *)dst; \ + dst2_31 = (q31_t *)dst2; \ + dst3_31 = (q31_t *)dst3; \ + dst4_31 = (q31_t *)dst4; \ + dst5_31 = (q31_t *)dst5; \ + dst6_31 = (q31_t *)dst6; \ + dst7_31 = (q31_t *)dst7; + +/* ---------------------------------- Pad ---------------------------------- */ +#define basic_pad_1row(col, dst_31, pad_out_q15x2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_pad2(dst_31, pad_out_q15x2) block_cnt--; \ + } + +#define basic_pad_2row(col, dst_31, dst2_31, pad_out_q15x2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) block_cnt--; \ + } + +#define basic_pad_3row(col, dst_31, dst2_31, dst3_31, pad_out_q15x2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) block_cnt--; \ + } + +#define basic_pad_4row(col, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) \ + q31_pad2(dst4_31, pad_out_q15x2) block_cnt--; \ + } + +#define basic_pad_5row(col, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) \ + q31_pad2(dst4_31, pad_out_q15x2) q31_pad2(dst5_31, pad_out_q15x2) block_cnt--; \ + } + +#define pad_1row_1col(dst_31, pad_out_q15x2) basic_pad_1row(1, dst_31, pad_out_q15x2) +#define pad_1row_2col(dst_31, pad_out_q15x2) basic_pad_1row(2, dst_31, pad_out_q15x2) +#define pad_1row_3col(dst_31, pad_out_q15x2) basic_pad_1row(3, dst_31, pad_out_q15x2) +#define pad_2row_1col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(1, dst_31, dst2_31, pad_out_q15x2) +#define pad_2row_2col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(2, dst_31, dst2_31, pad_out_q15x2) +#define pad_2row_3col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(3, dst_31, dst2_31, pad_out_q15x2) +#define pad_2row_4col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(4, dst_31, dst2_31, pad_out_q15x2) +#define pad_2row_5col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(5, dst_31, dst2_31, pad_out_q15x2) +#define pad_3row_1col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \ + basic_pad_3row(1, dst_31, dst2_31, dst3_31, pad_out_q15x2) +#define pad_3row_2col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \ + basic_pad_3row(2, dst_31, dst2_31, dst3_31, pad_out_q15x2) +#define 
pad_3row_3col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \ + basic_pad_3row(3, dst_31, dst2_31, dst3_31, pad_out_q15x2) +#define pad_4row_1col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \ + basic_pad_4row(1, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) +#define pad_4row_2col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \ + basic_pad_4row(2, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) +#define pad_4row_3col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \ + basic_pad_4row(3, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) +#define pad_5row_1col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \ + basic_pad_5row(1, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) +#define pad_5row_2col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \ + basic_pad_5row(2, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) +#define pad_5row_3col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \ + basic_pad_5row(3, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) + +/* ---------------------------------- Load ---------------------------------- */ +#define basic_load_1row(col, src, dst) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q7_q15_offset_ele(src, dst) block_cnt--; \ + } +#define basic_load_2row(col, src, src2, dst, dst2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) block_cnt--; \ + } +#define basic_load_3row(col, src, src2, src3, dst, dst2, dst3) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) block_cnt--; \ + } +#define basic_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) \ + q7_q15_offset_ele(src4, dst4) block_cnt--; \ + } +#define basic_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) \ + q7_q15_offset_ele(src4, dst4) q7_q15_offset_ele(src5, dst5) block_cnt--; \ + } + +///////////////////////// 4bit ////////////////////////// +#define b4_load_1row(col, src, dst) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b4_q7_q15_offset_ele(src, dst) block_cnt--; \ + } +#define b4_load_2row(col, src, src2, dst, dst2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) block_cnt--; \ + } +#define b4_load_3row(col, src, src2, src3, dst, dst2, dst3) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) block_cnt--; \ + } +#define b4_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) \ + b4_q7_q15_offset_ele(src4, dst4) block_cnt--; \ + } +#define b4_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) \ + 
b4_q7_q15_offset_ele(src4, dst4) b4_q7_q15_offset_ele(src5, dst5) block_cnt--; \ + } +///////////////////////// 2bit ////////////////////////// +#define b2_load_1row(col, src, dst) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b2_q7_q15_offset_ele(src, dst) block_cnt--; \ + } +#define b2_load_2row(col, src, src2, dst, dst2) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) block_cnt--; \ + } +#define b2_load_3row(col, src, src2, src3, dst, dst2, dst3) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) block_cnt--; \ + } +#define b2_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) \ + b2_q7_q15_offset_ele(src4, dst4) block_cnt--; \ + } +#define b2_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) \ + b2_q7_q15_offset_ele(src4, dst4) b2_q7_q15_offset_ele(src5, dst5) block_cnt--; \ + } + +#define b4_load_1row_1col(src, dst) b4_load_1row(1, src, dst) +#define b4_load_1row_2col(src, dst) b4_load_1row(2, src, dst) +#define b4_load_1row_3col(src, dst) b4_load_1row(3, src, dst) +#define b4_load_1row_4col(src, dst) b4_load_1row(4, src, dst) +#define b4_load_2row_1col(src, src2, dst, dst2) b4_load_2row(1, src, src2, dst, dst2) +#define b4_load_2row_2col(src, src2, dst, dst2) b4_load_2row(2, src, src2, dst, dst2) +#define b4_load_2row_3col(src, src2, dst, dst2) b4_load_2row(3, src, src2, dst, dst2) +#define b4_load_2row_4col(src, src2, dst, dst2) b4_load_2row(4, src, src2, dst, dst2) +#define b4_load_3row_1col(src, src2, src3, dst, dst2, dst3) b4_load_3row(1, src, src2, src3, dst, dst2, dst3) +#define b4_load_3row_2col(src, src2, src3, dst, dst2, dst3) b4_load_3row(2, src, src2, src3, dst, dst2, dst3) +#define b4_load_3row_3col(src, src2, src3, dst, dst2, dst3) b4_load_3row(3, src, src2, src3, dst, dst2, dst3) +#define b4_load_3row_4col(src, src2, src3, dst, dst2, dst3) b4_load_3row(4, src, src2, src3, dst, dst2, dst3) +#define b4_load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b4_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b4_load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b4_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b4_load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b4_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b4_load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b4_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b4_load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b4_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define b4_load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b4_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define b4_load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b4_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define b4_load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ 
+ b4_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) + +#define b2_load_1row_1col(src, dst) b2_load_1row(1, src, dst) +#define b2_load_1row_2col(src, dst) b2_load_1row(2, src, dst) +#define b2_load_1row_3col(src, dst) b2_load_1row(3, src, dst) +#define b2_load_1row_4col(src, dst) b2_load_1row(4, src, dst) +#define b2_load_2row_1col(src, src2, dst, dst2) b2_load_2row(1, src, src2, dst, dst2) +#define b2_load_2row_2col(src, src2, dst, dst2) b2_load_2row(2, src, src2, dst, dst2) +#define b2_load_2row_3col(src, src2, dst, dst2) b2_load_2row(3, src, src2, dst, dst2) +#define b2_load_2row_4col(src, src2, dst, dst2) b2_load_2row(4, src, src2, dst, dst2) +#define b2_load_3row_1col(src, src2, src3, dst, dst2, dst3) b2_load_3row(1, src, src2, src3, dst, dst2, dst3) +#define b2_load_3row_2col(src, src2, src3, dst, dst2, dst3) b2_load_3row(2, src, src2, src3, dst, dst2, dst3) +#define b2_load_3row_3col(src, src2, src3, dst, dst2, dst3) b2_load_3row(3, src, src2, src3, dst, dst2, dst3) +#define b2_load_3row_4col(src, src2, src3, dst, dst2, dst3) b2_load_3row(4, src, src2, src3, dst, dst2, dst3) +#define b2_load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b2_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b2_load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b2_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b2_load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b2_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b2_load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + b2_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define b2_load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b2_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define b2_load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b2_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define b2_load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b2_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define b2_load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + b2_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) + +#define load_1row_1col(src, dst) basic_load_1row(1, src, dst) +#define load_1row_2col(src, dst) basic_load_1row(2, src, dst) +#define load_1row_3col(src, dst) basic_load_1row(3, src, dst) +#define load_1row_4col(src, dst) basic_load_1row(4, src, dst) +#define load_2row_1col(src, src2, dst, dst2) basic_load_2row(1, src, src2, dst, dst2) +#define load_2row_2col(src, src2, dst, dst2) basic_load_2row(2, src, src2, dst, dst2) +#define load_2row_3col(src, src2, dst, dst2) basic_load_2row(3, src, src2, dst, dst2) +#define load_2row_4col(src, src2, dst, dst2) basic_load_2row(4, src, src2, dst, dst2) +#define load_3row_1col(src, src2, src3, dst, dst2, dst3) basic_load_3row(1, src, src2, src3, dst, dst2, dst3) +#define load_3row_2col(src, src2, src3, dst, dst2, dst3) basic_load_3row(2, src, src2, src3, dst, dst2, dst3) +#define load_3row_3col(src, src2, src3, dst, dst2, dst3) basic_load_3row(3, src, src2, src3, dst, dst2, dst3) +#define load_3row_4col(src, src2, src3, dst, dst2, dst3) basic_load_3row(4, src, src2, src3, dst, dst2, dst3) +#define load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + basic_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define load_4row_2col(src, src2, 
src3, src4, dst, dst2, dst3, dst4) \ + basic_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + basic_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \ + basic_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4) +#define load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + basic_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + basic_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + basic_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) +#define load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \ + basic_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) + +/* ---------------------------------- Reuse ---------------------------------- */ +#define basic_reuse_1row(col, src_31, dst_31) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_assign2(src_31, dst_31) block_cnt--; \ + } +#define basic_reuse_2row(col, src_31, src2_31, dst_31, dst2_31) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) block_cnt--; \ + } +#define basic_reuse_3row(col, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) block_cnt--; \ + } +#define basic_reuse_4row(col, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) \ + q31_assign2(src4_31, dst4_31) block_cnt--; \ + } +#define basic_reuse_5row(col, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \ + block_cnt = channel_div4 * col; \ + while (block_cnt > 0) { \ + q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) \ + q31_assign2(src4_31, dst4_31) q31_assign2(src5_31, dst5_31) block_cnt--; \ + } + +#define reuse_1row_1col(src_31, dst_31) basic_reuse_1row(1, src_31, dst_31) +#define reuse_1row_2col(src_31, dst_31) basic_reuse_1row(2, src_31, dst_31) +#define reuse_1row_3col(src_31, dst_31) basic_reuse_1row(3, src_31, dst_31) +#define reuse_1row_4col(src_31, dst_31) basic_reuse_1row(4, src_31, dst_31) +#define reuse_1row_5col(src_31, dst_31) basic_reuse_1row(5, src_31, dst_31) +#define reuse_1row_6col(src_31, dst_31) basic_reuse_1row(6, src_31, dst_31) +#define reuse_2row_1col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(1, src_31, src2_31, dst_31, dst2_31) +#define reuse_2row_2col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(2, src_31, src2_31, dst_31, dst2_31) +#define reuse_2row_3col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(3, src_31, src2_31, dst_31, dst2_31) +#define reuse_2row_4col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(4, src_31, src2_31, dst_31, dst2_31) +#define reuse_2row_5col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(5, src_31, src2_31, dst_31, dst2_31) +#define reuse_2row_6col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(6, src_31, src2_31, dst_31, 
dst2_31) +#define reuse_3row_1col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + basic_reuse_3row(1, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) +#define reuse_3row_2col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + basic_reuse_3row(2, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) +#define reuse_3row_3col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + basic_reuse_3row(3, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) +#define reuse_3row_4col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + basic_reuse_3row(4, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) +#define reuse_3row_5col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + basic_reuse_3row(5, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) +#define reuse_3row_6col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \ + basic_reuse_3row(6, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) +#define reuse_4row_3col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \ + basic_reuse_4row(3, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) +#define reuse_4row_4col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \ + basic_reuse_4row(4, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) +#define reuse_4row_5col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \ + basic_reuse_4row(5, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) +#define reuse_4row_6col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \ + basic_reuse_4row(6, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) +#define reuse_5row_3col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \ + basic_reuse_5row(3, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) +#define reuse_5row_4col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \ + basic_reuse_5row(4, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) +#define reuse_5row_5col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \ + basic_reuse_5row(5, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) +#define reuse_5row_6col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \ + basic_reuse_5row(6, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) +#endif /* ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ */ diff --git a/examples/openmv_training/codegen_ffc_only.zip b/examples/openmv_training/codegen_ffc_only.zip deleted file mode 100644 index 39ff7ce..0000000 Binary files a/examples/openmv_training/codegen_ffc_only.zip and /dev/null differ diff --git a/examples/openmv_training_sparse/README.md b/examples/openmv_training_sparse/README.md new file mode 100644 index 0000000..48164e6 --- /dev/null +++ b/examples/openmv_training_sparse/README.md @@ -0,0 +1,85 @@ +# Training on OpenMV Cam H7 + +This is an example showing how to train a model using a predefined sparse update schema with TinyEngine. + +## Install build dependencies on Linux + +Note: This section is basically from https://github.com/openmv/openmv/blob/master/src/README.md. Please refer to the OpenMV's repo for more details or steps on different env. 
+
+```
+sudo apt-get update
+sudo apt-get install git build-essential
+```
+
+## Install GNU ARM toolchain
+
+```
+# Install the GNU Arm toolchain
+TOOLCHAIN_PATH=/usr/local/arm-none-eabi
+TOOLCHAIN_URL="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu-rm/10-2020q4/gcc-arm-none-eabi-10-2020-q4-major-x86_64-linux.tar.bz2"
+sudo mkdir ${TOOLCHAIN_PATH}
+wget --no-check-certificate -O - ${TOOLCHAIN_URL} | sudo tar --strip-components=1 -jx -C ${TOOLCHAIN_PATH}
+export PATH=${TOOLCHAIN_PATH}/bin:${PATH}
+```
+
+## Clone the OpenMV source
+
+```
+cd tinyengine/examples/openmv_training_sparse/
+git clone https://github.com/openmv/openmv.git
+```
+
+Currently, we don't have compatibility tests for the OpenMV source, so let's use the version that has been manually tested before.
+
+```
+cd openmv
+git checkout 918ccb937730cc759ee5709df089d9de516dc7bf
+git submodule update --init --recursive
+```
+
+## Build the source
+
+Let's first build the firmware from source to make sure all required dependencies are correctly installed. `TARGET` is set to `OPENMV4` for the OpenMV Cam H7.
+
+```
+make -j4 -C src/micropython/mpy-cross
+make -j4 TARGET=OPENMV4 -C src
+```
+
+You should see the compiled binary at `openmv/src/build/bin/firmware.bin`.
+
+## Apply the customized patch
+
+The patch does the following:
+
+1. disables some features in the firmware to free up SRAM and flash space
+1. sets up the TinyEngine source
+1. adds the application code for training in `examplemodule.c`
+
+```
+git apply ../openmv_sparse_training.patch
+```
+
+## Generate model-specific code and recompile the firmware with TinyEngine
+
+```
+cd ..
+sh gen_code.sh
+cd openmv
+make -j4 TARGET=OPENMV4 -C src
+```
+
+Flash the binary `openmv/src/build/bin/firmware.bin` onto your OpenMV Cam. Please refer to the official [instructions](https://github.com/openmv/openmv/blob/master/src/README.md#flashing-the-firmware).
+
+## Connect two buttons to your board
+
+Connect two buttons with jumper wires to pin 1 and pin 4. Please refer to the [pinout](http://wiki.amperka.ru/_media/products:openmv-cam-h7:openmv-cam-h7-pinout.pdf).
+
+These two buttons will be used to label images captured by the camera.
+![image](https://user-images.githubusercontent.com/17592131/217367877-6a500f31-be3b-4258-a86e-4eabbb947a7e.png)
+
+## Start the demo
+
+1. Open OpenMV IDE
+1. Connect your OpenMV Cam to the PC
+1. Run the Python script `tinyengine/examples/openmv_vww/vww_openmv_demo.py` in OpenMV IDE (a minimal sketch of the demo loop is shown below).
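+
+The demo loop reads the two buttons to decide whether to label the current frame and take a training step, or simply run inference. The snippet below is only a minimal sketch of that loop; the `cexample` module name, the `train`/`predict` functions, and the `P1`/`P4` pin assignments are assumptions and must be adapted to whatever the patched `examplemodule.c` actually exposes.
+
+```
+# Minimal illustrative sketch -- module/function names are placeholders.
+import sensor
+from pyb import Pin
+import cexample  # C module added by the patch (name assumed)
+
+sensor.reset()
+sensor.set_pixformat(sensor.RGB565)
+sensor.set_framesize(sensor.QVGA)
+sensor.skip_frames(time=2000)
+
+# Buttons wired to ground; with a pull-up, a pressed button reads 0.
+btn_class0 = Pin("P1", Pin.IN, Pin.PULL_UP)
+btn_class1 = Pin("P4", Pin.IN, Pin.PULL_UP)
+
+while True:
+    img = sensor.snapshot()
+    if btn_class0.value() == 0:        # label the frame as class 0 and train
+        cexample.train(img, 0)
+    elif btn_class1.value() == 0:      # label the frame as class 1 and train
+        cexample.train(img, 1)
+    else:                              # otherwise just run inference
+        print("predicted class:", cexample.predict(img))
+```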
diff --git a/examples/openmv_training_sparse/gen_code.sh b/examples/openmv_training_sparse/gen_code.sh new file mode 100644 index 0000000..392b143 --- /dev/null +++ b/examples/openmv_training_sparse/gen_code.sh @@ -0,0 +1,7 @@ +cd ../../ +export PYTHONPATH=${PYTHONPATH}:$(pwd) +cp -r TinyEngine examples/openmv_training_sparse/openmv/src/omv/modules/ +cd examples/openmv_training_sparse +mkdir codegen +python ../tiny_training.py -f ../../assets/49kb-int8-graph.json -D ../../assets/full-int8-params.pkl -QAS ../../assets/scale.json -m -g -d -FR +mv codegen openmv/src/omv/modules/TinyEngine/ diff --git a/examples/openmv_training/openmv_training.patch b/examples/openmv_training_sparse/openmv_sparse_training.patch similarity index 73% rename from examples/openmv_training/openmv_training.patch rename to examples/openmv_training_sparse/openmv_sparse_training.patch index 3fd52b1..f456f78 100644 --- a/examples/openmv_training/openmv_training.patch +++ b/examples/openmv_training_sparse/openmv_sparse_training.patch @@ -150,10 +150,10 @@ index 84601904..abc6fe04 100644 * @brief defition to adding rouding offset */ diff --git a/src/omv/Makefile b/src/omv/Makefile -index 159d07a5..6bdfd47a 100644 +index 159d07a5..239fa50a 100644 --- a/src/omv/Makefile +++ b/src/omv/Makefile -@@ -96,6 +96,25 @@ SRCS += $(addprefix imlib/, \ +@@ -96,6 +96,50 @@ SRCS += $(addprefix imlib/, \ zbar.c \ ) @@ -162,18 +162,43 @@ index 159d07a5..6bdfd47a 100644 + codegen/Source/depthwise_kernel3x3_stride1_inplace_CHW_fpreq.c \ + codegen/Source/depthwise_kernel3x3_stride2_inplace_CHW_fpreq.c \ + codegen/Source/depthwise_kernel5x5_stride1_inplace_CHW_fpreq.c \ -+ codegen/Source/depthwise_kernel5x5_stride2_inplace_CHW_fpreq.c \ + codegen/Source/depthwise_kernel7x7_stride1_inplace_CHW_fpreq.c \ + codegen/Source/depthwise_kernel7x7_stride2_inplace_CHW_fpreq.c \ ++ codegen/Source/depthwise_kernel3x3_stride1_inplace_CHW_fpreq_bitmask.c \ ++ codegen/Source/depthwise_kernel3x3_stride2_inplace_CHW_fpreq_bitmask.c \ ++ codegen/Source/depthwise_kernel5x5_stride1_inplace_CHW_fpreq_bitmask.c \ ++ codegen/Source/depthwise_kernel7x7_stride1_inplace_CHW_fpreq_bitmask.c \ ++ codegen/Source/depthwise_kernel7x7_stride2_inplace_CHW_fpreq_bitmask.c \ + src/kernels/fp_requantize_op/add_fpreq.c \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch8_fpreq.c \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch16_fpreq.c \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch24_fpreq.c \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch48_fpreq.c \ + src/kernels/fp_requantize_op/convolve_1x1_s8_fpreq.c \ -+ src/kernels/int_only/avgpooling.c \ ++ src/kernels/int_forward_op/avgpooling.c \ + src/kernels/fp_requantize_op/convolve_s8_kernel3_inputch3_stride2_pad1_fpreq.c \ + src/kernels/fp_requantize_op/mat_mul_kernels_fpreq.c \ ++ src/kernels/fp_requantize_op/convolve_1x1_s8_fpreq_mask.c \ ++ src/kernels/fp_requantize_op/convolve_1x1_s8_fpreq_mask_partialCH.c \ ++ src/kernels/fp_backward_op/sum_4D_exclude_fp.c \ ++ src/kernels/fp_backward_op/where_fp.c \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel3_stride1_inpad1_outpad0.c \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel3_stride2_inpad1_outpad1.c \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel5_stride1_inpad2_outpad0.c \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel5_stride2_inpad2_outpad1.c \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel7_stride1_inpad3_outpad0.c \ ++ 
src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel7_stride2_inpad3_outpad1.c \ ++ src/kernels/fp_backward_op/tte_exp_fp.c \ ++ src/kernels/fp_backward_op/sub_fp.c \ ++ src/kernels/fp_backward_op/mul_fp.c \ ++ src/kernels/fp_backward_op/pointwise_conv_fp.c \ ++ src/kernels/fp_backward_op/group_pointwise_conv_fp.c \ ++ src/kernels/fp_backward_op/group_conv_fp_kernel4_stride1_pad0.c \ ++ src/kernels/fp_backward_op/group_conv_fp_kernel8_stride1_pad0.c \ ++ src/kernels/fp_backward_op/strided_slice_4Dto4D_fp.c \ ++ src/kernels/fp_backward_op/sum_3D_fp.c \ ++ src/kernels/fp_backward_op/nll_loss_fp.c \ ++ src/kernels/fp_backward_op/log_softmax_fp.c \ + ) + SRCS += $(wildcard ports/$(PORT)/*.c) @@ -406,10 +431,10 @@ index 412de472..f7da2c03 100644 // Domain 2 DMA buffers region. diff --git a/src/omv/modules/examplemodule.c b/src/omv/modules/examplemodule.c -index 37e2b4f4..1f6ce7d4 100644 +index 37e2b4f4..52d1bda2 100644 --- a/src/omv/modules/examplemodule.c +++ b/src/omv/modules/examplemodule.c -@@ -1,17 +1,277 @@ +@@ -1,17 +1,81 @@ // Include MicroPython API. #include "py/runtime.h" +#include "genNN.h" @@ -417,224 +442,27 @@ index 37e2b4f4..1f6ce7d4 100644 +#include +#include "py_image.h" -+#define TEST_SIZE 1 * 1024 -+// signed char w[TEST_SIZE]; - // This is the function which will be called from Python as cexample.add_ints(a, b). - STATIC mp_obj_t example_add_ints(mp_obj_t a_obj, mp_obj_t b_obj) { +-// This is the function which will be called from Python as cexample.add_ints(a, b). +-STATIC mp_obj_t example_add_ints(mp_obj_t a_obj, mp_obj_t b_obj) { - // Extract the ints from the micropython input objects. - int a = mp_obj_get_int(a_obj); - int b = mp_obj_get_int(b_obj); -+ invoke(NULL); -+ return mp_obj_new_int(999); -+} -+ ++#define TEST_SIZE 1 * 1024 +#define TN_MAX(A,B) ((A) > (B) ? (A) : (B)) +#define TN_MIN(A,B) ((A) < (B) ? 
(A) : (B))`` + +// for fc only -+#define ORIGIN_H 72 -+#define ORIGIN_W 88 -+#define IMAGE_H 80 -+#define IMAGE_W 80 -+#define INPUT_CH 160 -+#define OUTPUT_CH 2 -+#define IMAGES 6 -+ -+float feat_fp[INPUT_CH]; -+int8_t feat[INPUT_CH]; -+float w[INPUT_CH * OUTPUT_CH]; -+float b[OUTPUT_CH]; -+float out[OUTPUT_CH]; -+float dw[OUTPUT_CH*INPUT_CH]; -+float lr = 0.1; -+const signed char zero_x = 6; -+const float scale_x = 0.060486205; -+ -+void fully_connected_fp( -+ const float *input, const uint16_t input_x, const uint16_t input_y, -+ const uint16_t input_ch, const uint16_t output_ch, const float *bias, -+ const float *weights, float *output) -+{ -+ int h, w, out_c, in_c; -+ for (h = 0; h < input_y; h++){ -+ for (w = 0; w < input_x; w++){ -+ int pixel_cnt = w + input_x * h; -+ for (out_c = 0; out_c < output_ch; out_c++){ -+ float intermediate = bias[out_c]; -+ const float *start_weight = weights + out_c * input_ch; -+ const float *start_input = input + input_ch * pixel_cnt; -+ float *start_out = output + output_ch * pixel_cnt; -+ for (in_c = 0; in_c < input_ch; in_c++){ -+ intermediate += start_weight[in_c] * start_input[in_c]; -+ } -+ start_out[out_c] = intermediate; -+ } -+ } -+ } -+} -+ -+void mat_mul_fp( -+ const float *matA, const uint16_t matA_row, const uint16_t matA_col, -+ const float* matB, const uint16_t matB_col, float* output) -+{ -+ int m, n, i; -+ for (n = 0; n < matA_row; n++){ -+ for (m = 0; m < matB_col; m++){ -+ float sum = 0; -+ for (i = 0; i < matA_col; i++){ -+ sum += matA[i + n * matA_col] * matB[m + i * matA_col]; -+ } -+ output[m + n * matB_col] = sum; -+ } -+ } -+} -+ -+void statble_softmax_inplace(float *input, const uint16_t length) -+{ -+ float max = FLT_MIN; -+ float exp_sum = 0; -+ uint16_t i; -+ for (i = 0; i < length; i++){ -+ if (input[i] > max) max = input[i]; -+ } -+ -+ // inplace update -+ for (i = 0; i < length; i++){ -+ input[i] = exp(input[i] - max); -+ exp_sum += input[i]; -+ } -+ for (i = 0; i < length; i++){ -+ input[i] = input[i] / exp_sum; -+ } -+} -+ -+ -+void invoke_new_weights(const int8_t* img, float *out){ -+ int i; -+ signed char *input = getInput(); -+ const int8_t* image = img; -+ for (i = 0; i < IMAGE_H * IMAGE_W * 3; i++){ -+ input[i] = *image++; -+ } -+ invoke(NULL); -+ signed char *output = getOutput(); -+ for (i = 0; i < INPUT_CH; i++){ -+ feat_fp[i] = (output[i] - zero_x)*scale_x; -+ } -+ -+ // out = new_w @ feat + new_b -+ fully_connected_fp(feat_fp, 1, 1, INPUT_CH, OUTPUT_CH, b, w, out); -+} -+ -+void invoke_new_weights_givenimg(float *out){ -+ int i; -+ invoke(NULL); -+ signed char *output = getOutput(); -+ for (i = 0; i < INPUT_CH; i++){ -+ feat_fp[i] = (output[i] - zero_x)*scale_x; -+ } -+ -+ // out = new_w @ feat + new_b -+ fully_connected_fp(feat_fp, 1, 1, INPUT_CH, OUTPUT_CH, b, w, out); -+} -+ -+void train_one_img(const int8_t* img, int cls) -+{ -+ int i; -+ signed char *input = getInput(); -+ const int8_t* image = img; -+ for (i = 0; i < IMAGE_H * IMAGE_W * 3; i++){ -+ input[i] = *image++; -+ } -+ invoke(NULL); -+ signed char *output = getOutput(); -+ for (i = 0; i < INPUT_CH; i++){ -+ feat_fp[i] = (output[i] - zero_x)*scale_x; -+ } -+ -+ // out = new_w @ feat + new_b -+ fully_connected_fp(feat_fp, 1, 1, INPUT_CH, OUTPUT_CH, b, w, out); -+ -+ // softmax = _stable_softmax(out) -+ statble_softmax_inplace(out, OUTPUT_CH); ++#define ORIGIN_H 128 ++#define ORIGIN_W 128 ++#define IMAGE_H 128 ++#define IMAGE_W 128 - // Calculate the addition and convert to MicroPython object. 
- return mp_obj_new_int(a + b); -+ out[cls] -= 1; -+ -+ //dw = dy.reshape(-1, 1) @ feat.reshape(1, -1) -+ mat_mul_fp(out, OUTPUT_CH, 1, feat_fp, INPUT_CH, dw); -+ -+ for (i = 0; i < OUTPUT_CH * INPUT_CH; i++){ -+ w[i] = w[i] - lr * dw[i]; -+ } -+ //new_w = new_w - lr * dw -+ //new_b = new_b - lr * -+ b[0] = b[0] - lr * out[0]; -+ b[1] = b[1] - lr * out[1]; - } -+ -+void train(int cls) -+{ -+ int i; -+ invoke(NULL); -+ signed char *output = getOutput(); -+ for (i = 0; i < INPUT_CH; i++){ -+ feat_fp[i] = (output[i] - zero_x)*scale_x; -+ } -+ -+ // out = new_w @ feat + new_b -+ fully_connected_fp(feat_fp, 1, 1, INPUT_CH, OUTPUT_CH, b, w, out); -+ -+ // softmax = _stable_softmax(out) -+ statble_softmax_inplace(out, OUTPUT_CH); -+ -+ out[cls] -= 1; -+ -+ //dw = dy.reshape(-1, 1) @ feat.reshape(1, -1) -+ mat_mul_fp(out, OUTPUT_CH, 1, feat_fp, INPUT_CH, dw); -+ -+ for (i = 0; i < OUTPUT_CH * INPUT_CH; i++){ -+ w[i] = w[i] - lr * dw[i]; -+ } -+ //new_w = new_w - lr * dw -+ //new_b = new_b - lr * -+ b[0] = b[0] - lr * out[0]; -+ b[1] = b[1] - lr * out[1]; -+} -+ -+void train_one_feat(const float* feat, int cls) -+{ -+ int i; -+ signed char *input = getInput(); -+ for (i = 0; i < IMAGE_H * IMAGE_W * 3; i++){ -+ input[i] = feat[i]; -+ } -+ -+ // out = new_w @ feat + new_b -+ fully_connected_fp(feat, 1, 1, INPUT_CH, OUTPUT_CH, b, w, out); -+ -+ // softmax = _stable_softmax(out) -+ statble_softmax_inplace(out, OUTPUT_CH); -+ -+ out[cls] -= 1; -+ -+ //dw = dy.reshape(-1, 1) @ feat.reshape(1, -1) -+ mat_mul_fp(out, OUTPUT_CH, 1, feat, INPUT_CH, dw); -+ -+ for (i = 0; i < OUTPUT_CH * INPUT_CH; i++){ -+ w[i] = w[i] - lr * dw[i]; -+ } -+ //new_w = new_w - lr * dw -+ //new_b = new_b - lr * -+ b[0] = b[0] - lr * out[0]; -+ b[1] = b[1] - lr * out[1]; -+} -+ -+ +uint16_t color; ++float labels[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +// This is the function which will be called from Python as cexample.add_ints(a, b). -+STATIC mp_obj_t example_VWW(mp_obj_t a, mp_obj_t b) { ++STATIC mp_obj_t example_train_demo_fn(mp_obj_t a, mp_obj_t b) { + image_t* img = py_image_cobj(a); + // >= 0, for training with the label, -1 is for inference + int command = mp_obj_get_int(b); @@ -663,12 +491,16 @@ index 37e2b4f4..1f6ce7d4 100644 + } + } + if (command >= 0){ ++ labels[0] = 0; ++ labels[1] = 0; ++ labels[command] = 1; ++ invoke(labels); + printf("train class %d\n", command); -+ train(command); + } + else{ -+ invoke_new_weights_givenimg(out); -+ if(out[0] > out[1]){ ++ invoke_inf(); ++ uint8_t* output = (uint8_t*)getOutput(); ++ if(output[0] > output[1]){ + printf("infer class 0\n"); + color = 63488; + } @@ -684,30 +516,31 @@ index 37e2b4f4..1f6ce7d4 100644 + } + } + return mp_obj_new_int(0); -+} + } + // Define a Python reference to the function above. - STATIC MP_DEFINE_CONST_FUN_OBJ_2(example_add_ints_obj, example_add_ints); -+STATIC MP_DEFINE_CONST_FUN_OBJ_2(example_VWW_obj, example_VWW); +-STATIC MP_DEFINE_CONST_FUN_OBJ_2(example_add_ints_obj, example_add_ints); ++STATIC MP_DEFINE_CONST_FUN_OBJ_2(example_train_demo, example_train_demo_fn); // Define all properties of the module. // Table entries are key/value pairs of the attribute name (a string) -@@ -21,6 +281,7 @@ STATIC MP_DEFINE_CONST_FUN_OBJ_2(example_add_ints_obj, example_add_ints); +@@ -20,7 +84,7 @@ STATIC MP_DEFINE_CONST_FUN_OBJ_2(example_add_ints_obj, example_add_ints); + // optimized to word-sized integers by the build system (interned strings). 
STATIC const mp_rom_map_elem_t example_module_globals_table[] = { { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_cexample) }, - { MP_ROM_QSTR(MP_QSTR_add_ints), MP_ROM_PTR(&example_add_ints_obj) }, -+ { MP_ROM_QSTR(MP_QSTR_VWW), MP_ROM_PTR(&example_VWW_obj) }, +- { MP_ROM_QSTR(MP_QSTR_add_ints), MP_ROM_PTR(&example_add_ints_obj) }, ++ { MP_ROM_QSTR(MP_QSTR_train_demo), MP_ROM_PTR(&example_train_demo) }, }; STATIC MP_DEFINE_CONST_DICT(example_module_globals, example_module_globals_table); -@@ -33,4 +294,4 @@ const mp_obj_module_t example_user_cmodule = { +@@ -33,4 +97,4 @@ const mp_obj_module_t example_user_cmodule = { // Register the module to make it available in Python. // Note: This module is disabled, set the thrid argument to 1 to enable it, or // use a macro like MODULE_CEXAMPLE_ENABLED to conditionally enable this module. -MP_REGISTER_MODULE(MP_QSTR_cexample, example_user_cmodule, 0); +MP_REGISTER_MODULE(MP_QSTR_cexample, example_user_cmodule, 1); diff --git a/src/omv/ports/stm32/omv_portconfig.mk b/src/omv/ports/stm32/omv_portconfig.mk -index 200ffb7d..e742c135 100644 +index 200ffb7d..b3049e25 100644 --- a/src/omv/ports/stm32/omv_portconfig.mk +++ b/src/omv/ports/stm32/omv_portconfig.mk @@ -4,7 +4,7 @@ STARTUP ?= st/startup_$(shell echo $(MCU) | tr '[:upper:]' '[:lower:]') @@ -715,7 +548,7 @@ index 200ffb7d..e742c135 100644 # Compiler Flags -CFLAGS += -std=gnu99 -Wall -Werror -Warray-bounds -mthumb -nostartfiles -fdata-sections -ffunction-sections -+CFLAGS += -std=gnu99 -Wall -Warray-bounds -mthumb -nostartfiles -fdata-sections -ffunction-sections ++CFLAGS += -std=gnu99 -Warray-bounds -mthumb -nostartfiles -fdata-sections -ffunction-sections -lm CFLAGS += -fno-inline-small-functions -D$(MCU) -D$(CFLAGS_MCU) -D$(ARM_MATH) -DARM_NN_TRUNCATE\ -fsingle-precision-constant -Wdouble-promotion -mcpu=$(CPU) -mtune=$(CPU) -mfpu=$(FPU) -mfloat-abi=hard CFLAGS += -D__FPU_PRESENT=1 -D__VFP_FP__ -DUSE_USB_FS -DUSE_DEVICE_MODE -DUSE_USB_OTG_ID=0 -DHSE_VALUE=$(OMV_HSE_VALUE)\ @@ -730,7 +563,7 @@ index 200ffb7d..e742c135 100644 OMV_CFLAGS += -I$(TOP_DIR)/$(OMV_DIR)/sensors/ OMV_CFLAGS += -I$(TOP_DIR)/$(OMV_DIR)/ports/$(PORT)/ OMV_CFLAGS += -I$(TOP_DIR)/$(OMV_DIR)/ports/$(PORT)/modules/ -@@ -213,6 +217,25 @@ FIRM_OBJ += $(addprefix $(BUILD)/$(OMV_DIR)/imlib/, \ +@@ -213,6 +217,50 @@ FIRM_OBJ += $(addprefix $(BUILD)/$(OMV_DIR)/imlib/, \ zbar.o \ ) @@ -739,20 +572,63 @@ index 200ffb7d..e742c135 100644 + codegen/Source/depthwise_kernel3x3_stride1_inplace_CHW_fpreq.o \ + codegen/Source/depthwise_kernel3x3_stride2_inplace_CHW_fpreq.o \ + codegen/Source/depthwise_kernel5x5_stride1_inplace_CHW_fpreq.o \ -+ codegen/Source/depthwise_kernel5x5_stride2_inplace_CHW_fpreq.o \ + codegen/Source/depthwise_kernel7x7_stride1_inplace_CHW_fpreq.o \ + codegen/Source/depthwise_kernel7x7_stride2_inplace_CHW_fpreq.o \ ++ codegen/Source/depthwise_kernel3x3_stride1_inplace_CHW_fpreq_bitmask.o \ ++ codegen/Source/depthwise_kernel3x3_stride2_inplace_CHW_fpreq_bitmask.o \ ++ codegen/Source/depthwise_kernel5x5_stride1_inplace_CHW_fpreq_bitmask.o \ ++ codegen/Source/depthwise_kernel7x7_stride1_inplace_CHW_fpreq_bitmask.o \ ++ codegen/Source/depthwise_kernel7x7_stride2_inplace_CHW_fpreq_bitmask.o \ + src/kernels/fp_requantize_op/add_fpreq.o \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch8_fpreq.o \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch16_fpreq.o \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch24_fpreq.o \ + src/kernels/fp_requantize_op/convolve_1x1_s8_ch48_fpreq.o \ + 
src/kernels/fp_requantize_op/convolve_1x1_s8_fpreq.o \ -+ src/kernels/int_only/avgpooling.o \ ++ src/kernels/int_forward_op/avgpooling.o \ + src/kernels/fp_requantize_op/convolve_s8_kernel3_inputch3_stride2_pad1_fpreq.o \ + src/kernels/fp_requantize_op/mat_mul_kernels_fpreq.o \ ++ src/kernels/fp_requantize_op/convolve_1x1_s8_fpreq_mask.o \ ++ src/kernels/fp_requantize_op/convolve_1x1_s8_fpreq_mask_partialCH.o \ ++ src/kernels/fp_backward_op/sum_4D_exclude_fp.o \ ++ src/kernels/fp_backward_op/where_fp.o \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel3_stride1_inpad1_outpad0.o \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel3_stride2_inpad1_outpad1.o \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel5_stride1_inpad2_outpad0.o \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel5_stride2_inpad2_outpad1.o \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel7_stride1_inpad3_outpad0.o \ ++ src/kernels/fp_backward_op/transpose_depthwise_conv_fp_kernel7_stride2_inpad3_outpad1.o \ ++ src/kernels/fp_backward_op/tte_exp_fp.o \ ++ src/kernels/fp_backward_op/sub_fp.o \ ++ src/kernels/fp_backward_op/mul_fp.o \ ++ src/kernels/fp_backward_op/pointwise_conv_fp.o \ ++ src/kernels/fp_backward_op/group_pointwise_conv_fp.o \ ++ src/kernels/fp_backward_op/group_conv_fp_kernel4_stride1_pad0.o \ ++ src/kernels/fp_backward_op/group_conv_fp_kernel8_stride1_pad0.o \ ++ src/kernels/fp_backward_op/strided_slice_4Dto4D_fp.o \ ++ src/kernels/fp_backward_op/sum_3D_fp.o \ ++ src/kernels/fp_backward_op/nll_loss_fp.o \ ++ src/kernels/fp_backward_op/log_softmax_fp.o \ + ) + FIRM_OBJ += $(wildcard $(BUILD)/$(OMV_DIR)/ports/$(PORT)/*.o) FIRM_OBJ += $(wildcard $(BUILD)/$(MICROPY_DIR)/modules/*.o) FIRM_OBJ += $(wildcard $(BUILD)/$(MICROPY_DIR)/ports/$(PORT)/modules/*.o) +@@ -625,7 +673,7 @@ endif + # This target generates the main/app firmware image located at 0x08010000 + $(FIRMWARE): FIRMWARE_OBJS + $(CPP) -P -E -I$(OMV_BOARD_CONFIG_DIR) $(OMV_DIR)/ports/$(PORT)/$(LDSCRIPT).ld.S > $(BUILD)/$(LDSCRIPT).lds +- $(CC) $(LDFLAGS) $(FIRM_OBJ) -o $(FW_DIR)/$(FIRMWARE).elf $(LIBS) -lgcc ++ $(CC) $(LDFLAGS) $(FIRM_OBJ) -o $(FW_DIR)/$(FIRMWARE).elf $(LIBS) -lgcc -lm + $(OBJCOPY) -Obinary -R .big_const* $(FW_DIR)/$(FIRMWARE).elf $(FW_DIR)/$(FIRMWARE).bin + $(PYTHON) $(MKDFU) -D $(DFU_DEVICE) -b $(MAIN_APP_ADDR):$(FW_DIR)/$(FIRMWARE).bin $(FW_DIR)/$(FIRMWARE).dfu + +@@ -633,7 +681,7 @@ ifeq ($(OMV_ENABLE_BL), 1) + # This target generates the bootloader. + $(BOOTLOADER): FIRMWARE_OBJS BOOTLOADER_OBJS + $(CPP) -P -E -I$(OMV_BOARD_CONFIG_DIR) $(BOOTLDR_DIR)/stm32fxxx.ld.S > $(BUILD)/$(BOOTLDR_DIR)/stm32fxxx.lds +- $(CC) $(BL_LDFLAGS) $(BOOT_OBJ) -o $(FW_DIR)/$(BOOTLOADER).elf -lgcc ++ $(CC) $(BL_LDFLAGS) $(BOOT_OBJ) -o $(FW_DIR)/$(BOOTLOADER).elf -lgcc -lm + $(OBJCOPY) -Obinary $(FW_DIR)/$(BOOTLOADER).elf $(FW_DIR)/$(BOOTLOADER).bin + $(PYTHON) $(MKDFU) -D $(DFU_DEVICE) -b 0x08000000:$(FW_DIR)/$(BOOTLOADER).bin $(FW_DIR)/$(BOOTLOADER).dfu + endif diff --git a/examples/openmv_training_sparse/training_demo.py b/examples/openmv_training_sparse/training_demo.py new file mode 100644 index 0000000..7dc91a9 --- /dev/null +++ b/examples/openmv_training_sparse/training_demo.py @@ -0,0 +1,31 @@ +# This example shows how to invoke to training or inference function calls of tinyengine. +import cexample +import lcd +import sensor +from pyb import Pin + +sensor.reset() # Reset and initialize the sensor. 
+sensor.set_pixformat(sensor.RGB565)  # Set pixel format to RGB565
+sensor.set_framesize(sensor.B128X128)  # Set frame size to 128x128
+lcd.init()  # Initialize the LCD screen.
+
+
+# class 1: green
+pin4 = Pin("P4", Pin.IN, Pin.PULL_UP)
+# class 0: red
+pin1 = Pin("P1", Pin.IN, Pin.PULL_UP)
+
+while True:
+    img = sensor.snapshot()  # Take a picture and return the image.
+
+    pin4_value = pin4.value()
+    pin1_value = pin1.value()
+    if pin4_value == 0:
+        ret = cexample.train_demo(img, 0)
+        print("train class 0")
+    elif pin1_value == 0:
+        ret = cexample.train_demo(img, 1)
+        print("train class 1")
+    else:
+        ret = cexample.train_demo(img, -1)
+    lcd.display(img)  # Display the image.
diff --git a/examples/openmv_vww/README.md b/examples/openmv_vww/README.md
index f0ad5a2..05e9433 100644
--- a/examples/openmv_vww/README.md
+++ b/examples/openmv_vww/README.md
@@ -12,6 +12,7 @@ sudo apt-get install git build-essential
 ```
 
 ## Install GNU ARM toolchain
+
 ```
 Install arm toolchain
 TOOLCHAIN_PATH=/usr/local/arm-none-eabi
@@ -27,41 +28,48 @@ export PATH=${TOOLCHAIN_PATH}/bin:${PATH}
 cd tinyengine/examples/openmv_vww/
 git clone https://github.com/openmv/openmv.git
 ```
+
 Currently, we don't have compatibility tests for the OpenMV source, so let's use the version that has been manually tested before.
+
 ```
 cd openmv
 git checkout 918ccb937730cc759ee5709df089d9de516dc7bf
 git submodule update --init --recursive
 ```
 
-## Build the source
+## Build the source
+
 Let's first build the firmware from the source to make sure all required dependencies are correctly installed. The `TARGET `is set to `OPENMV4` for OpenMV Cam H7.
+
 ```
 make -j4 -C src/micropython/mpy-cross
 make -j4 TARGET=OPENMV4 -C src
 ```
+
 You should see the compiled binary at `openmv/src/build/bin/firmware.bin`.
 
-## Apply customized patch
+## Apply customized patch
 
 The patch is to
 
 1. disable some features in the firmware for SRAM and Flash space
-2. setup for TinyEngine source
-3. add vww application code in `exampleemodule.c`
+1. set up the TinyEngine source
+1. add the VWW application code in `examplemodule.c`
+
 ```
 cd tinyengine/examples/openmv_vww/openmv
 git apply ../openmv.patch
 ```
 
 # Add the Tinyengine into openmv
+
 ```
 cd tinyengine
 cp -r TinyEngine examples/openmv_vww/openmv/src/omv/modules/
 ```
 
-## Generate model-specific code for VWW
+## Generate model-specific code for VWW
+
 ```
 cd tinyengine/examples/openmv_vww/
 python ../vww.py
@@ -71,15 +79,16 @@ cp -r codegen/ openmv/src/omv/modules/TinyEngine/
 Copy the generated code at `tinyengine/example/openmv_vww/codegen` into TinyEngie.
 
 ## Recompile the firmware with TinyEngine
+
 ```
 cd tinyengine/examples/openmv_vww/openmv/
 make -j4 TARGET=OPENMV4 -C src
 ```
 
-Flash the binary `openmv/src/build/bin/firmware.bin` into your OpenMV. Please refer to the official [Instructions](https://github.com/openmv/openmv/blob/master/src/README.md#flashing-the-firmware]).
+Flash the binary `openmv/src/build/bin/firmware.bin` into your OpenMV. Please refer to the official [instructions](https://github.com/openmv/openmv/blob/master/src/README.md#flashing-the-firmware).
 
 ## Start the demo
 
-1. download OpenMV IDE
-2. Connect your OpenMV cam to the PC
-3. Run the python script `tinyengine/examples/openmv_vww/vww_openmv_demo.py` in OpenMV IDE.
+1. Open OpenMV IDE
+1. Connect your OpenMV cam to the PC
+1. Run the Python script `tinyengine/examples/openmv_vww/vww_openmv_demo.py` in OpenMV IDE (see the sanity check below if the import fails).
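If the demo script stops with an `ImportError`, the board is most likely still running a firmware image that does not contain the generated module. A quick sanity check from the OpenMV IDE's serial terminal, assuming the module is registered as `cexample` as in the patches above:

```
# Run in the OpenMV IDE: the import only succeeds on the recompiled firmware,
# and dir() lists the bindings the patch registered (e.g. train_demo).
import cexample
print(dir(cexample))
```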