/*
 * Copyright (C) 2010-2022 Arm Limited or its affiliates.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * This file is MODIFIED from the Arm CMSIS NN Library.
 *
 * Project:      TinyEngine
 * Title:        arm_nnfunctions_modified.h
 * Description:  Public header file for TinyEngine.
 *
 * Reference papers:
 *  - MCUNet: Tiny Deep Learning on IoT Devices, NeurIPS 2020
 *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
 *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
 * Contact authors:
 *  - Wei-Ming Chen, wmchen@mit.edu
 *  - Wei-Chen Wang, wweichen@mit.edu
 *  - Ji Lin, jilin@mit.edu
 *  - Ligeng Zhu, ligeng@mit.edu
 *  - Song Han, songhan@mit.edu
 *
 * Original Project:  CMSIS NN Library
 * Original Title:    arm_nnfunctions.h
 *
 * Target Processor:  Cortex-M CPUs
 * -------------------------------------------------------------------- */

/**
 * \mainpage CMSIS NN Software Library
 *
 * Introduction
 * ------------
 *
 * This user manual describes the CMSIS NN software library,
 * a collection of efficient neural network kernels developed to maximize the
 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
 *
 * The library is divided into a number of functions, each covering a specific category:
 * - Convolution Functions
 * - Activation Functions
 * - Fully-connected Layer Functions
 * - SVDF Layer Functions
 * - Pooling Functions
 * - Softmax Functions
 * - Basic math Functions
 *
 * The library has separate functions for operating on different weight and activation data
 * types, including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
 * kernels is included in the function documentation. The implementation details are also
 * described in this paper [1].
 *
 * Function Classification
 * --------
 * The functions can be classified into two segments:
 * - Legacy functions supporting ARM's internal symmetric quantization (8 bits).
 * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
 *
 * The legacy functions can be identified by their _q7 or _q15 suffix; no new development is done there.
 * The article in [2] describes in detail how to run a network using the legacy functions.
 *
 * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
 * micro. These functions are bit exact to TensorFlow Lite. Refer to TensorFlow's documentation in [3] on how to run
 * a TensorFlow Lite model using optimized CMSIS-NN kernels.
 *
 * Block Diagram
 * --------
 * \image html CMSIS-NN-OVERVIEW.PNG
 *
 * Examples
 * --------
 *
 * The library ships with a number of examples which demonstrate how to use the library functions.
 *
 * Pre-processor Macros
 * ------------
 *
 * Each library project has different pre-processor macros.
 *
 * - ARM_MATH_DSP:
 *
 * Define macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
 *
 * - ARM_MATH_MVEI:
 *
 * Define macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
 *
 * - ARM_MATH_AUTOVECTORIZE:
 *
 * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that use inline
 * assembly. It does not affect functions that use C or intrinsics.
 *
 * - ARM_MATH_BIG_ENDIAN:
 *
 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the legacy
 * functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the library builds
 * for little endian targets.
 *
 * - ARM_NN_TRUNCATE:
 *
 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
 *
 *
 * Copyright Notice
 * ------------
 *
 * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
 *
 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
 *
 * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
 *     https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
 *
 * [3] https://www.tensorflow.org/lite/microcontrollers/library
 *
 * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
 */

/**
 * @defgroup groupNN Neural Network Functions
 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
 * the TensorFlow Lite framework.
 */

#ifndef _ARM_NNFUNCTIONS_H
#define _ARM_NNFUNCTIONS_H

#include "arm_nn_math_types.h"
#include "arm_nn_types.h"
#include "arm_nnsupportfunctions.h"

#define USE_INTRINSIC

//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor or round to the nearest int */

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @defgroup NNConv Convolution Functions
 *
 * Collection of convolution and depthwise convolution functions and their variants.
 *
 * The convolution is implemented in 2 steps: im2col and GEMM.
 *
 * im2col is a process of converting each patch of image data into
 * a column. After im2col, the convolution is computed as matrix-matrix
 * multiplication.
 *
 * To reduce the memory footprint, the im2col is performed partially.
 * In each iteration, only a few columns (i.e., patches) are generated and
 * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
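 *
 * As an illustration, the sketch below shows the partial im2col + GEMM flow in
 * plain C. It is a simplified, unoptimized model of the idea only: the function
 * name, the HWC data layout, the lack of padding/stride handling and the
 * placeholder requantization are assumptions made for illustration and do not
 * mirror the actual kernels in this library.
 *
 * @code
 * #include <stdint.h>
 *
 * #define COLS_PER_PASS 2   // number of im2col columns gathered per GEMM pass
 *
 * // Toy convolution (stride 1, no padding, HWC layout) built from the two steps:
 * //   1) im2col: copy up to COLS_PER_PASS input patches into col_buf
 * //   2) GEMM:   multiply the weight matrix against the gathered columns
 * static void conv_im2col_gemm_sketch(const int8_t *input, int in_x, int in_ch,
 *                                     const int8_t *kernel, int out_ch, int k_x, int k_y,
 *                                     int32_t input_offset, const int32_t *bias,
 *                                     int8_t *output, int out_x, int out_y,
 *                                     int16_t *col_buf)
 * {
 *     const int col_len = k_x * k_y * in_ch;            // length of one im2col column
 *     const int num_out = out_x * out_y;
 *     for (int out_idx = 0; out_idx < num_out; out_idx += COLS_PER_PASS)
 *     {
 *         // Step 1: im2col for up to COLS_PER_PASS output pixels
 *         int cols = (num_out - out_idx) < COLS_PER_PASS ? (num_out - out_idx) : COLS_PER_PASS;
 *         for (int c = 0; c < cols; c++)
 *         {
 *             int ox = (out_idx + c) % out_x;
 *             int oy = (out_idx + c) / out_x;
 *             int16_t *col = &col_buf[c * col_len];
 *             for (int ky = 0; ky < k_y; ky++)
 *                 for (int kx = 0; kx < k_x; kx++)
 *                     for (int ch = 0; ch < in_ch; ch++)
 *                         *col++ = (int16_t)(input[((oy + ky) * in_x + (ox + kx)) * in_ch + ch] + input_offset);
 *         }
 *         // Step 2: GEMM of the weights (out_ch x col_len) with the gathered columns
 *         for (int c = 0; c < cols; c++)
 *         {
 *             for (int oc = 0; oc < out_ch; oc++)
 *             {
 *                 int32_t acc = bias ? bias[oc] : 0;
 *                 for (int i = 0; i < col_len; i++)
 *                     acc += (int32_t)kernel[oc * col_len + i] * col_buf[c * col_len + i];
 *                 // The real kernels requantize acc with a per-channel multiplier/shift,
 *                 // add the output offset and clamp to the activation range here.
 *                 if (acc > 127) acc = 127;
 *                 if (acc < -128) acc = -128;
 *                 output[(out_idx + c) * out_ch + oc] = (int8_t)acc;
 *             }
 *         }
 *     }
 * }
 * @endcode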
 *
 */

arm_status arm_convolve_s8_4col(const q7_t *input,
                                const uint16_t input_x,
                                const uint16_t input_y,
                                const uint16_t input_ch,
                                const uint16_t input_batches,
                                const q7_t *kernel,
                                const uint16_t output_ch,
                                const uint16_t kernel_x,
                                const uint16_t kernel_y,
                                const uint16_t pad_x,
                                const uint16_t pad_y,
                                const uint16_t stride_x,
                                const uint16_t stride_y,
                                const int32_t *bias,
                                q7_t *output,
                                const int32_t *output_shift,
                                const int32_t *output_mult,
                                const int32_t out_offset,
                                const int32_t input_offset,
                                const int32_t out_activation_min,
                                const int32_t out_activation_max,
                                const uint16_t output_x,
                                const uint16_t output_y,
                                q15_t *buffer_a);

q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered_oddch(const q7_t *input_a,
                                                    const q15_t *input_b,
                                                    const uint16_t output_ch,
                                                    const int32_t *out_shift,
                                                    const int32_t *out_mult,
                                                    const int32_t out_offset,
                                                    const int16_t activation_min,
                                                    const int16_t activation_max,
                                                    const uint16_t num_col_a,
                                                    const int32_t *const output_bias,
                                                    q7_t *out_0);

q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered_8mul(const q7_t *input_a,
                                                   const q15_t *input_b,
                                                   const uint16_t output_ch,
                                                   const int32_t *out_shift,
                                                   const int32_t *out_mult,
                                                   const int32_t out_offset,
                                                   const int16_t activation_min,
                                                   const int16_t activation_max,
                                                   const uint16_t num_col_a,
                                                   const int32_t *const output_bias,
                                                   q7_t *out_0);

q7_t *arm_nn_mat_mult_kernel3_input3_s8_s16(const q7_t *input_a,
                                            const q15_t *input_b,
                                            const uint16_t output_ch,
                                            const int32_t *out_shift,
                                            const int32_t *out_mult,
                                            const int32_t out_offset,
                                            const int16_t activation_min,
                                            const int16_t activation_max,
                                            const uint16_t num_col_a,
                                            const int32_t *const output_bias,
                                            q7_t *out_0,
                                            q15_t *kbuf);

#ifdef __cplusplus
}
#endif

#endif
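
/*
 * Example invocation of arm_convolve_s8_4col (a hypothetical sketch: the tensor
 * shapes, quantization parameters and the scratch-buffer size below are
 * placeholder values chosen for illustration, not values prescribed by
 * TinyEngine or CMSIS-NN):
 *
 * @code
 * // 16x16x8 input, 3x3 kernels, 16 output channels, pad 1, stride 1 -> 16x16x16 output
 * static q7_t input[16 * 16 * 8];
 * static q7_t kernel[16 * 3 * 3 * 8];
 * static int32_t bias[16];
 * static int32_t output_mult[16];        // per-output-channel requantization multipliers
 * static int32_t output_shift[16];       // per-output-channel requantization shifts
 * static q7_t output[16 * 16 * 16];
 * static q15_t buffer_a[4 * 3 * 3 * 8];  // im2col scratch; the required size is an assumption here
 *
 * void run_layer(void)
 * {
 *     arm_status status = arm_convolve_s8_4col(input, 16, 16, 8, 1,      // input, x, y, ch, batches
 *                                              kernel, 16, 3, 3,         // kernel, output_ch, kernel_x, kernel_y
 *                                              1, 1, 1, 1,               // pad_x, pad_y, stride_x, stride_y
 *                                              bias, output,
 *                                              output_shift, output_mult,
 *                                              0, 0,                     // out_offset, input_offset (placeholders)
 *                                              -128, 127,                // out_activation_min, out_activation_max
 *                                              16, 16,                   // output_x, output_y
 *                                              buffer_a);
 *     (void)status;                                                      // compare against ARM_MATH_SUCCESS in real code
 * }
 * @endcode
 */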