#ifndef _INFERENCE_H_
#define _INFERENCE_H_

// TensorRT
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <NvCaffeParser.h>

// CUDA runtime
#include <cuda_runtime_api.h>

// C++ standard library
#include <cstdint>
#include <string>
#include <vector>

#include "cuda_utils.h"
#include "logging.h"

using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace std;

#define ENABLE_CUDA_PREPROCESS

class Inference {
public:
    Inference();
    ~Inference();

    // Size in bytes of one element of the given TensorRT data type.
    inline unsigned int getElementSize(nvinfer1::DataType t);
    // Product of all dimensions, i.e. the element count of a tensor.
    inline int64_t volume(const nvinfer1::Dims& d);

    // Build an engine by parsing an ONNX model.
    ICudaEngine* build_engine_onnx(Logger gLogger, unsigned int maxBatchSize,
                                   unsigned int maxWorkSpaceSize, IBuilder* builder,
                                   IBuilderConfig* config, std::string& source_onnx);
    // Build an engine by parsing a Caffe deploy/model pair;
    // vecOutputs names the output blobs.
    ICudaEngine* build_engine_caffe(Logger gLogger, unsigned int maxBatchSize,
                                    unsigned int maxWorkSpaceSize, IBuilder* builder,
                                    IBuilderConfig* config,
                                    const std::string& strCaffeModelFile,
                                    const std::string& strCaffeDeployFile,
                                    const std::vector<std::string>& vecOutputs);

    // Build from ONNX and serialize the engine into modelStream.
    void ONNXToModel(Logger gLogger, unsigned int maxBatchSize,
                     unsigned int maxWorkSpaceSize, IHostMemory** modelStream,
                     std::string& onnx_model_name);
    // Build from Caffe and serialize the engine into modelStream.
    void CaffeToModel(Logger gLogger, unsigned int maxBatchSize,
                      unsigned int maxWorkSpaceSize, IHostMemory** modelStream,
                      std::string& caffe_model_name, std::string& caffe_deploy_name,
                      std::vector<std::string>& outputs);

    // Copy the input to the device, enqueue inference on the stream,
    // and copy the output back to the host.
    void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers,
                     unsigned int inputIndex, float* input, int inputSize,
                     unsigned int outputIndex, float* output, int outputSize,
                     int batchSize);
    // Variant without the host-to-device input copy (input already in buffers).
    void doInferenceV2(IExecutionContext& context, cudaStream_t& stream, void** buffers,
                       unsigned int outputIndex, float* output, int outputSize,
                       int batchSize);
    void doInferenceV3(IExecutionContext& context, cudaStream_t& stream, void** buffers,
                       unsigned int inputIndex, float* input, int inputSize,
                       unsigned int outputIndex, float* output, int outputSize,
                       int batchSize);
    void doInferenceV4(IExecutionContext& context, cudaStream_t& stream, void** buffers,
                       unsigned int outputIndex, float* output, int outputSize,
                       int batchSize);

    float* pfBuffers_[2];
    float* pfInputData_ = nullptr;
    float* pfOutputData_ = nullptr;
    uint8_t* pu8ImgHost_ = nullptr;                   // related memory allocations
    uint8_t* pu8ImgDevice_ = nullptr;
    unsigned int uiInputIndex_ = 0, uiOutputIndex_ = 0;
    cudaStream_t* pImagePreprocessStream_ = nullptr;  // CUDA stream for image preprocessing
    cudaStream_t* pInferenceModelStream_ = nullptr;   // CUDA stream for model inference
    Logger* pGLogger_ = nullptr;
    IRuntime* pRuntime_ = nullptr;
    ICudaEngine* pEngine_ = nullptr;
    IExecutionContext* pContext_ = nullptr;

private:
};

#endif // END OF _INFERENCE_H_
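
// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the header). It assumes ONNXToModel
// builds the engine internally and serializes it into modelStream, that the
// Logger from logging.h derives from nvinfer1::ILogger as in the TensorRT
// samples, and that the pre-TensorRT-8 destroy() API is available (the Caffe
// parser suggests such a version). File names and sizes are placeholders.
// ---------------------------------------------------------------------------
#include <fstream>
#include "inference.h"

int main() {
    Logger gLogger;
    Inference infer;

    std::string onnxPath = "model.onnx";  // placeholder model path
    IHostMemory* modelStream = nullptr;

    // Build from ONNX and serialize the resulting engine into modelStream.
    infer.ONNXToModel(gLogger, /*maxBatchSize=*/1, /*maxWorkSpaceSize=*/1U << 30,
                      &modelStream, onnxPath);

    // Persist the plan so later runs can deserialize instead of rebuilding.
    std::ofstream plan("model.engine", std::ios::binary);
    plan.write(static_cast<const char*>(modelStream->data()), modelStream->size());

    modelStream->destroy();
    return 0;
}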