#include "retinanet_classify_inference.h" RetinanetClassifyInference::RetinanetClassifyInference() {} RetinanetClassifyInference::~RetinanetClassifyInference() {} int RetinanetClassifyInference::RetinanetClassifyInferenceInit(ModelInfo* pRetinanetClassifyModelInfo, const std::string& strModelName, const std::string& strDeployName, const std::string& strEngineName) { pRetinanetClassifyModelInfo_ = pRetinanetClassifyModelInfo; //资源分配(创建流,host及device侧内存) cudaSetDevice(DEVICE); //设置GPU //创建图像预处理CUDA流 pImagePreprocessStream_ = new cudaStream_t; CUDA_CHECK(cudaStreamCreate(pImagePreprocessStream_)); //创建模型推理CUDA流 pInferenceModelStream_ = new cudaStream_t; CUDA_CHECK(cudaStreamCreate(pInferenceModelStream_)); pGLogger_ = new Logger; //相关资源分配 pfBuffers_[0] = nullptr; pfBuffers_[1] = nullptr; CUDA_CHECK(cudaMalloc((void**)&pfBuffers_[0], pRetinanetClassifyModelInfo_->uiInputSize * sizeof(float))); //输入资源分配 CUDA_CHECK(cudaMalloc((void**)&pfBuffers_[1], pRetinanetClassifyModelInfo_->uiOutputSize * sizeof(float))); //输出资源分配 pu8ImgHost_ = new uint8_t; pu8ImgDevice_ = new uint8_t; CUDA_CHECK(cudaMallocHost((void**)&pu8ImgHost_, MAX_IMAGE_INPUT_SIZE_THRESH * pRetinanetClassifyModelInfo_->uiChannel)); //在HOST侧申请预处理数据缓存 CUDA_CHECK(cudaMalloc((void**)&pu8ImgDevice_, MAX_IMAGE_INPUT_SIZE_THRESH * pRetinanetClassifyModelInfo_->uiChannel)); //在DEVICE侧申请预处理数据缓存 pfInputData_ = new float[pRetinanetClassifyModelInfo_->uiBatchSize * pRetinanetClassifyModelInfo_->uiInputSize]; pfOutputData_ = new float[pRetinanetClassifyModelInfo_->uiBatchSize * pRetinanetClassifyModelInfo_->uiOutputSize]; //序列化引擎 //直接使用API创建一个模型,并将其序列化为流 编译成TensorRT引擎engine文件后无需再次调用,调用依次生成engine即可 //基于caffe解析器编译tensorrt引擎 #if 0 std::vector vecOutputs = {pRetinanetClassifyModelInfo_->strOutputBlobName}; if (!strModelName.empty() && !strDeployName.empty()) { IHostMemory* modelStream{ nullptr }; CaffeToModel(*pGLogger_, pRetinanetClassifyModelInfo_->uiBatchSize, MAX_WORKSPAXE_SIZE, &modelStream, strModelName, strDeployName, vecOutputs); assert(modelStream != nullptr); std::ofstream p(strEngineName, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); } #endif //反序列化模型并运行推理 std::ifstream file(strEngineName, std::ios::binary); if (!file.good()) { std::cerr << "read " << strEngineName << " error!" << std::endl; return -1; } //创建tensorRT流对象trtModelStream,这个就跟文件流中的ifstream类似的 //trtModelStream是一块内存区域,用于保存序列化的plan文件 char *trtModelStream = nullptr; size_t size = 0; file.seekg(0, file.end); //将指针移动至距离文件末尾0处的位置 size = file.tellg(); //获得当前字符的位置 file.seekg(0, file.beg); //将指针移动至距离文件开头0处的位置 trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); //将序列化engine模型(数据及数据大小)读入trtModelStream file.close(); //1.设置运行时环境 pRuntime_ = createInferRuntime(*pGLogger_); //创建运行时环境IRuntime对象,传入gLogger用于打印信息 assert(pRuntime_ != nullptr); //2.生成反序列化引擎 pEngine_ = pRuntime_->deserializeCudaEngine(trtModelStream, size); //反序列化引擎engine(根据trtModelStream反序列化) assert(pEngine_ != nullptr); //3.创建上下文环境 pContext_ = pEngine_->createExecutionContext(); //创建上下文环境,主要用于inference函数中启动cuda核 assert(pContext_ != nullptr); delete[] trtModelStream; //析构trtModelStream std::cout<<"Engine get NB Bindings is: "<getNbBindings()<getNbBindings() == 2); //获取绑定的输入输入 uiInputIndex_ = pEngine_->getBindingIndex((pRetinanetClassifyModelInfo_->strInputBlobName).c_str()); uiOutputIndex_ = pEngine_->getBindingIndex((pRetinanetClassifyModelInfo_->strOutputBlobName).c_str()); std::cout<<"inputIndex: "<destroy(); //析构绘话 pEngine_->destroy(); //析构TensorRT引擎 pRuntime_->destroy(); //析构运行时环境 if(pGLogger_){ //释放Logger delete pGLogger_; pGLogger_ = nullptr; } if(pfInputData_){ delete[] pfInputData_; pfInputData_ = nullptr; } if(pfOutputData_){ delete[] pfOutputData_; pfOutputData_ = nullptr; } return 0; } bool RetinanetClassifyInference::RetinanetClassifyInferenceModel(cv::Mat& frame) { size_t size_image_src = frame.cols * frame.rows * pRetinanetClassifyModelInfo_->uiChannel; unsigned int img_width = frame.cols, img_height = frame.rows; size_t size_image_dst = pRetinanetClassifyModelInfo_->uiModelWidth * pRetinanetClassifyModelInfo_->uiModelHeight * pRetinanetClassifyModelInfo_->uiChannel; auto preprocess_start = std::chrono::system_clock::now(); //计时开始 // printf("frame cols: %d\t frame rows: %d\n", frame.cols, frame.rows); // printf("model witdh: %d\t model height: %d\t model channle: %d\n", pRetinanetClassifyModelInfo_->uiModelWidth, // pRetinanetClassifyModelInfo_->uiModelHeight, pRetinanetClassifyModelInfo_->uiChannel); #ifdef ENABLE_CUDA_PREPROCESS memcpy(pu8ImgHost_, frame.data, size_image_src); //拷贝预处理数据到HOST侧 CUDA_CHECK(cudaMemcpyAsync(pu8ImgDevice_, pu8ImgHost_, size_image_src, cudaMemcpyHostToDevice, *pImagePreprocessStream_)); //拷贝预处理数据到Device侧 retinanet_classify_preprocess_kernel_img(pu8ImgDevice_, frame.cols, frame.rows, (float*)pfBuffers_[0], pRetinanetClassifyModelInfo_->uiModelWidth, pRetinanetClassifyModelInfo_->uiModelHeight, *pImagePreprocessStream_); cudaStreamSynchronize(*pImagePreprocessStream_); #else cv::Mat pr_img = preprocess_img(frame, pRetinanetClassifyModelInfo_->uiModelWidth, pRetinanetClassifyModelInfo_->uiModelHeight); // letterbox BGR to RGB int n = 0; for (int row = 0; row < pRetinanetClassifyModelInfo_->uiModelHeight; ++row) { uchar* uc_pixel = pr_img.data + row * pr_img.step; for (int col = 0; col < pRetinanetClassifyModelInfo_->uiModelWidth; ++col) { pfInputData_[n] = (float)uc_pixel[2] - 104; pfInputData_[n + pRetinanetClassifyModelInfo_->uiModelHeight * pRetinanetClassifyModelInfo_->uiModelWidth] = (float)uc_pixel[1] - 117; pfInputData_[n + 2 * pRetinanetClassifyModelInfo_->uiModelHeight * pRetinanetClassifyModelInfo_->uiModelWidth] = (float)uc_pixel[0] - 123; uc_pixel += pRetinanetClassifyModelInfo_->uiChannel; ++n; } } #endif auto preprocess_end = std::chrono::system_clock::now(); // std::cout << "retinanet classify preprocess time: " << std::chrono::duration_cast(preprocess_end - preprocess_start).count() << "ms" << std::endl; //2.推理 auto start = std::chrono::system_clock::now(); //计时开始 #ifdef ENABLE_CUDA_PREPROCESS doInferenceV2(*pContext_, *pInferenceModelStream_, (void**)pfBuffers_, uiOutputIndex_, pfOutputData_, pRetinanetClassifyModelInfo_->uiOutputSize, pRetinanetClassifyModelInfo_->uiBatchSize); #else float a[2]={0}; doInference(*pContext_, *pInferenceModelStream_, (void**)pfBuffers_, uiInputIndex_, pfInputData_, pRetinanetClassifyModelInfo_->uiInputSize, uiOutputIndex_, pfOutputData_, pRetinanetClassifyModelInfo_->uiOutputSize, pRetinanetClassifyModelInfo_->uiBatchSize); #endif auto end = std::chrono::system_clock::now(); // std::cout << "retinanet classify inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; //3.后处理 std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); // std::cout<<"after inference retinanet classify output[0] is: "<