/*
 * Copyright (C) 2010-2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * This file is MODIFIED from the Arm CMSIS NN Library.
 *
 * Project:      TinyEngine
 * Title:        arm_nnfunctions_modified.h
 * Description:  Public header file for TinyEngine.
 *
 * Reference papers:
 * - MCUNet: Tiny Deep Learning on IoT Devices, NeurIPS 2020
 * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
 * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
 *
 * Contact authors:
 * - Wei-Ming Chen, wmchen@mit.edu
 * - Wei-Chen Wang, wweichen@mit.edu
 * - Ji Lin, jilin@mit.edu
 * - Ligeng Zhu, ligeng@mit.edu
 * - Song Han, songhan@mit.edu
 *
 * Original Project:  CMSIS NN Library
 * Original Title:    arm_nnfunctions.h
 *
 * Target Processor:  Cortex-M CPUs
 * -------------------------------------------------------------------- */

/**
 * \mainpage CMSIS NN Software Library
 *
 * Introduction
 * ------------
 *
 * This user manual describes the CMSIS NN software library,
 * a collection of efficient neural network kernels developed to maximize the
 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
 *
 * The library is divided into a number of functions, each covering a specific category:
 * - Convolution Functions
 * - Activation Functions
 * - Fully-connected Layer Functions
 * - SVDF Layer Functions
 * - Pooling Functions
 * - Softmax Functions
 * - Basic math Functions
 *
 * The library has separate functions for operating on different weight and activation data
 * types, including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of each
 * kernel is included in its function description. The implementation details are also
 * described in this paper [1].
 *
 * Function Classification
 * --------
 * The functions can be classified into two segments:
 * - Legacy functions supporting ARM's internal symmetric quantization (8 bits).
 * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
 *
 * The legacy functions can be identified by their _q7 or _q15 suffix, and no new development is done there.
 * The article in [2] describes in detail how to run a network using the legacy functions.
 *
 * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
 * micro. The functions are bit exact to TensorFlow Lite. Refer to TensorFlow's documentation in [3] on how to run
 * a TensorFlow Lite model using optimized CMSIS-NN kernels.
 *
 * Block Diagram
 * --------
 * \image html CMSIS-NN-OVERVIEW.PNG
 *
 * Examples
 * --------
 *
 * The library ships with a number of examples which demonstrate how to use the library functions.
 *
 * Pre-processor Macros
 * ------------
 *
 * Each library project has different pre-processor macros.
 *
 * - ARM_MATH_DSP:
 *
 * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
 *
 * - ARM_MATH_MVEI:
 *
 * Define the macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
 *
 * - ARM_MATH_AUTOVECTORIZE:
 *
 * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that use inline
 * assembly. It does not affect functions that use C or intrinsics.
 *
 * - ARM_MATH_BIG_ENDIAN:
 *
 * Define the macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the
 * legacy functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the
 * library builds for little endian targets.
 *
 * - ARM_NN_TRUNCATE:
 *
 * Define the macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
 *
 * A simplified illustration of how kernels typically branch on these macros follows this comment block.
 *
 * Copyright Notice
 * ------------
 *
 * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
 *
 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
 *
 * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
 *     https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
 *
 * [3] https://www.tensorflow.org/lite/microcontrollers/library
 *
 * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
 */

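/*
 * Illustration only, not a real kernel: the general pattern by which kernels in
 * this style select an implementation based on the pre-processor macros listed
 * above. The function body comments are placeholders.
 *
 * \code
 * #if defined(ARM_MATH_MVEI)
 *     // M-Profile Vector Extension (Helium) path
 * #elif defined(ARM_MATH_DSP)
 *     // DSP-extension (SIMD intrinsics) path
 * #else
 *     // Plain C reference path
 * #endif
 * \endcode
 */
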
/**
 * @defgroup groupNN Neural Network Functions
 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
 * TensorFlow Lite framework.
 */

#ifndef _ARM_NNFUNCTIONS_H
#define _ARM_NNFUNCTIONS_H

#include "arm_nn_math_types.h"
#include "arm_nn_types.h"
#include "arm_nnsupportfunctions.h"

#define USE_INTRINSIC

//#define ARM_NN_TRUNCATE /* Configures the rounding mode: floor (truncate) instead of the default round-to-nearest-int; see the illustration below */

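/*
 * Illustration only (not the library's exact requantization code path): the
 * difference between the default round-to-nearest behaviour and ARM_NN_TRUNCATE
 * for an arithmetic right shift used during requantization. The values are
 * hypothetical.
 *
 * \code
 * int32_t acc = 7;      // accumulator to be scaled down
 * int32_t shift = 1;    // requantization shift
 *
 * int32_t rounded   = (acc + (1 << (shift - 1))) >> shift;  // 4: round to nearest
 * int32_t truncated = acc >> shift;                          // 3: floor, as with ARM_NN_TRUNCATE
 * \endcode
 */
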
#ifdef __cplusplus
extern "C" {
#endif

/**
 * @defgroup NNConv Convolution Functions
 *
 * Collection of convolution and depthwise convolution functions and their variants.
 *
 * The convolution is implemented in 2 steps: im2col and GEMM.
 *
 * im2col is a process of converting each patch of image data into
 * a column. After im2col, the convolution is computed as matrix-matrix
 * multiplication.
 *
 * To reduce the memory footprint, the im2col is performed partially.
 * In each iteration, only a few columns (i.e., patches) are generated and
 * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
 * A minimal sketch of this scheme follows this comment block.
 */

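/*
 * A minimal sketch of the partial im2col + GEMM scheme described above. This is
 * not TinyEngine's actual implementation: the patch-expansion helper, buffer
 * sizes, and layer parameters are hypothetical placeholders, and it assumes the
 * reordered GEMM kernel consumes two q15 im2col columns per call, as the
 * corresponding CMSIS-NN matrix-multiplication kernel does. Boundary handling
 * and any leftover output pixel are omitted.
 *
 * \code
 * const uint16_t col_len = kernel_y * kernel_x * input_ch;  // length of one im2col column
 * q15_t col_buf[2 * MAX_COL_LEN];                           // scratch for two columns (hypothetical size)
 * q7_t *out = output;
 *
 * for (int pixel = 0; pixel < output_y * output_x; pixel += 2)
 * {
 *     // im2col: expand two receptive fields into q15 columns (hypothetical helper)
 *     expand_two_patches_q15(input, col_buf, pixel, ...);
 *
 *     // GEMM: multiply the columns with the reordered kernel matrix,
 *     // requantize per output channel, and advance the output pointer
 *     out = arm_nn_mat_mult_kernel_s8_s16_reordered_8mul(kernel, col_buf, output_ch,
 *                                                        output_shift, output_mult,
 *                                                        out_offset, act_min, act_max,
 *                                                        col_len, bias, out);
 * }
 * \endcode
 */
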
/*
 * s8 convolution with per-output-channel requantization (output_mult and
 * output_shift are per-channel arrays). buffer_a is q15_t scratch memory used
 * for the partially generated im2col columns. An illustrative usage sketch
 * follows the declaration.
 */
arm_status arm_convolve_s8_4col(const q7_t *input,
                                const uint16_t input_x,
                                const uint16_t input_y,
                                const uint16_t input_ch,
                                const uint16_t input_batches,
                                const q7_t *kernel,
                                const uint16_t output_ch,
                                const uint16_t kernel_x,
                                const uint16_t kernel_y,
                                const uint16_t pad_x,
                                const uint16_t pad_y,
                                const uint16_t stride_x,
                                const uint16_t stride_y,
                                const int32_t *bias,
                                q7_t *output,
                                const int32_t *output_shift,
                                const int32_t *output_mult,
                                const int32_t out_offset,
                                const int32_t input_offset,
                                const int32_t out_activation_min,
                                const int32_t out_activation_max,
                                const uint16_t output_x,
                                const uint16_t output_y,
                                q15_t *buffer_a);

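/*
 * Illustrative call only: every value below (shapes, offsets, quantization
 * parameters, buffer size) is a hypothetical placeholder that would normally
 * come from the TinyEngine code generator for a specific layer.
 *
 * \code
 * #define IM2COL_BUF_WORDS 1024               // hypothetical scratch size
 * static q15_t im2col_buf[IM2COL_BUF_WORDS];  // q15 scratch for im2col columns
 *
 * arm_status status = arm_convolve_s8_4col(
 *     input, 40, 40, 16, 1,                   // input: 40x40, 16 channels, batch 1
 *     kernel, 32, 3, 3,                       // 32 output channels, 3x3 kernel
 *     1, 1, 1, 1,                             // pad 1x1, stride 1x1
 *     bias, output,
 *     output_shift, output_mult,              // per-channel requantization arrays
 *     out_offset, input_offset,
 *     -128, 127,                              // s8 activation range
 *     40, 40,                                 // output: 40x40
 *     im2col_buf);
 * \endcode
 */
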
/*
 * GEMM kernels used by the partial-im2col convolutions above. input_a is the
 * q7 kernel matrix, input_b the q15 im2col column buffer, and num_col_a the
 * length of one column; output_bias, out_mult, and out_shift are per-output-channel
 * arrays. Each kernel returns the updated output pointer, following the CMSIS-NN
 * convention for the _s8_s16 matrix-multiplication kernels.
 */
q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered_oddch(const q7_t *input_a,
                                                    const q15_t *input_b,
                                                    const uint16_t output_ch,
                                                    const int32_t *out_shift,
                                                    const int32_t *out_mult,
                                                    const int32_t out_offset,
                                                    const int16_t activation_min,
                                                    const int16_t activation_max,
                                                    const uint16_t num_col_a,
                                                    const int32_t *const output_bias,
                                                    q7_t *out_0);

q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered_8mul(const q7_t *input_a,
                                                   const q15_t *input_b,
                                                   const uint16_t output_ch,
                                                   const int32_t *out_shift,
                                                   const int32_t *out_mult,
                                                   const int32_t out_offset,
                                                   const int16_t activation_min,
                                                   const int16_t activation_max,
                                                   const uint16_t num_col_a,
                                                   const int32_t *const output_bias,
                                                   q7_t *out_0);

q7_t *arm_nn_mat_mult_kernel3_input3_s8_s16(const q7_t *input_a,
                                            const q15_t *input_b,
                                            const uint16_t output_ch,
                                            const int32_t *out_shift,
                                            const int32_t *out_mult,
                                            const int32_t out_offset,
                                            const int16_t activation_min,
                                            const int16_t activation_max,
                                            const uint16_t num_col_a,
                                            const int32_t *const output_bias,
                                            q7_t *out_0,
                                            q15_t *kbuf);

#ifdef __cplusplus
}
#endif

#endif