#include "yolov5_classify_inference.h" YoloV5ClassifyInference::YoloV5ClassifyInference() {} YoloV5ClassifyInference::~YoloV5ClassifyInference() {} int YoloV5ClassifyInference::YoloV5ClassifyInferenceInit(ModelInfo* pYoloV5ClassifyModelInfo, const std::string& strModelName, const std::string& strEngineName) { pYoloV5ClassifyModelInfo_ = pYoloV5ClassifyModelInfo; //资源分配(创建流,host及device侧内存) cudaSetDevice(DEVICE); //设置GPU //创建图像预处理CUDA流 pImagePreprocessStream_ = new cudaStream_t; CUDA_CHECK(cudaStreamCreate(pImagePreprocessStream_)); //创建模型推理CUDA流 pInferenceModelStream_ = new cudaStream_t; CUDA_CHECK(cudaStreamCreate(pInferenceModelStream_)); pGLogger_ = new Logger; //相关资源分配 pfBuffers_[0] = nullptr; pfBuffers_[1] = nullptr; CUDA_CHECK(cudaMalloc((void**)&pfBuffers_[0], pYoloV5ClassifyModelInfo_->uiInputSize * sizeof(float))); //输入资源分配 CUDA_CHECK(cudaMalloc((void**)&pfBuffers_[1], pYoloV5ClassifyModelInfo_->uiOutputSize * sizeof(float))); //输出资源分配 pu8ImgHost_ = new uint8_t; pu8ImgDevice_ = new uint8_t; CUDA_CHECK(cudaMallocHost((void**)&pu8ImgHost_, MAX_IMAGE_INPUT_SIZE_THRESH * pYoloV5ClassifyModelInfo_->uiChannel)); //在HOST侧申请预处理数据缓存 CUDA_CHECK(cudaMalloc((void**)&pu8ImgDevice_, MAX_IMAGE_INPUT_SIZE_THRESH * pYoloV5ClassifyModelInfo_->uiChannel)); //在DEVICE侧申请预处理数据缓存 pfInputData_ = new float[pYoloV5ClassifyModelInfo_->uiBatchSize * pYoloV5ClassifyModelInfo_->uiInputSize]; pfOutputData_ = new float[pYoloV5ClassifyModelInfo_->uiBatchSize * pYoloV5ClassifyModelInfo_->uiOutputSize]; //序列化引擎 //直接使用API创建一个模型,并将其序列化为流 编译成TensorRT引擎engine文件后无需再次调用,调用依次生成engine即可 //基于onnx解析器编译tensorrt引擎 #if 0 if (!strModelName.empty()) { IHostMemory* modelStream{ nullptr }; ONNXToModel(*pGLogger_, pYoloV5ClassifyModelInfo_->uiBatchSize, MAX_WORKSPAXE_SIZE, &modelStream, strModelName); assert(modelStream != nullptr); std::ofstream p(strEngineName, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); } #endif //反序列化模型并运行推理 std::ifstream file(strEngineName, std::ios::binary); if (!file.good()) { std::cerr << "read " << strEngineName << " error!" 
    // Deserialize the model and run inference
    std::ifstream file(strEngineName, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << strEngineName << " error!" << std::endl;
        return -1;
    }

    // Create trtModelStream, which works much like an ifstream in file I/O:
    // trtModelStream is a memory region that holds the serialized plan file
    char* trtModelStream = nullptr;
    size_t size = 0;
    file.seekg(0, file.end);  // move the file pointer to offset 0 from the end
    size = file.tellg();      // the current position equals the file size in bytes
    file.seekg(0, file.beg);  // move the file pointer back to offset 0 from the beginning
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);  // read the serialized engine (data and size) into trtModelStream
    file.close();

    // 1. Set up the runtime environment: create the IRuntime object, passing in
    //    gLogger for message output
    pRuntime_ = createInferRuntime(*pGLogger_);
    assert(pRuntime_ != nullptr);

    // 2. Generate the deserialized engine from trtModelStream
    // engine = new ICudaEngine;
    bool didInitPlugins = initLibNvInferPlugins(nullptr, "");
    pEngine_ = pRuntime_->deserializeCudaEngine(trtModelStream, size);
    assert(pEngine_ != nullptr);

    // 3. Create the execution context, used mainly to launch the CUDA kernels in
    //    the inference call
    // context = new IExecutionContext;
    pContext_ = pEngine_->createExecutionContext();
    assert(pContext_ != nullptr);

    delete[] trtModelStream;  // release trtModelStream

    std::cout << "Engine get NB Bindings is: " << pEngine_->getNbBindings() << std::endl;
    assert(pEngine_->getNbBindings() == 2);

    // Look up the bound input and output indices
    uiInputIndex_ = pEngine_->getBindingIndex((pYoloV5ClassifyModelInfo_->strInputBlobName).c_str());
    uiOutputIndex_ = pEngine_->getBindingIndex((pYoloV5ClassifyModelInfo_->strOutputBlobName).c_str());
    std::cout << "inputIndex: " << uiInputIndex_ << std::endl;
    std::cout << "outputIndex: " << uiOutputIndex_ << std::endl;

    return 0;
}

int YoloV5ClassifyInference::YoloV5ClassifyInferenceRelease()
{
    // Release the CUDA streams and the host/device buffers allocated in Init
    cudaStreamDestroy(*pImagePreprocessStream_);
    delete pImagePreprocessStream_;
    pImagePreprocessStream_ = nullptr;
    cudaStreamDestroy(*pInferenceModelStream_);
    delete pInferenceModelStream_;
    pInferenceModelStream_ = nullptr;
    CUDA_CHECK(cudaFree(pfBuffers_[0]));
    CUDA_CHECK(cudaFree(pfBuffers_[1]));
    CUDA_CHECK(cudaFreeHost(pu8ImgHost_));
    CUDA_CHECK(cudaFree(pu8ImgDevice_));

    pContext_->destroy();  // destroy the execution context
    pEngine_->destroy();   // destroy the TensorRT engine
    pRuntime_->destroy();  // destroy the runtime

    if (pGLogger_) {  // release the logger
        delete pGLogger_;
        pGLogger_ = nullptr;
    }
    if (pfInputData_) {
        delete[] pfInputData_;
        pfInputData_ = nullptr;
    }
    if (pfOutputData_) {
        delete[] pfOutputData_;
        pfOutputData_ = nullptr;
    }
    return 0;
}
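#if 0
// Minimal sketch of a letterbox resize like the one preprocess_img() (defined
// elsewhere in this project) is expected to perform for the CPU path below:
// scale the image to fit the model input while preserving aspect ratio, then
// pad the borders. This is an illustrative assumption, not the project's
// actual implementation.
static cv::Mat letterbox_sketch(const cv::Mat& img, int input_w, int input_h)
{
    float r = std::min(input_w / (float)img.cols, input_h / (float)img.rows);  // scale ratio
    int unpad_w = (int)(r * img.cols);
    int unpad_h = (int)(r * img.rows);
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(unpad_w, unpad_h));
    cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));  // gray padding
    resized.copyTo(out(cv::Rect((input_w - unpad_w) / 2, (input_h - unpad_h) / 2,
                                unpad_w, unpad_h)));
    return out;
}
#endif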
int YoloV5ClassifyInference::YoloV5ClassifyInferenceModel(cv::Mat& frame, unsigned int* uiClassLabel)
{
    size_t size_image_src = frame.cols * frame.rows * pYoloV5ClassifyModelInfo_->uiChannel;
    unsigned int img_width = frame.cols, img_height = frame.rows;
    size_t size_image_dst = pYoloV5ClassifyModelInfo_->uiModelWidth * pYoloV5ClassifyModelInfo_->uiModelHeight *
                            pYoloV5ClassifyModelInfo_->uiChannel;

    // 1. Preprocessing
    auto preprocess_start = std::chrono::system_clock::now();  // start timing
    // printf("frame cols: %d\t frame rows: %d\n", frame.cols, frame.rows);
    // printf("model width: %d\t model height: %d\t model channel: %d\n", pYoloV5ClassifyModelInfo_->uiModelWidth,
    //        pYoloV5ClassifyModelInfo_->uiModelHeight, pYoloV5ClassifyModelInfo_->uiChannel);
#ifdef ENABLE_CUDA_PREPROCESS
    memcpy(pu8ImgHost_, frame.data, size_image_src);  // copy the raw image into the pinned host buffer
    CUDA_CHECK(cudaMemcpyAsync(pu8ImgDevice_, pu8ImgHost_, size_image_src, cudaMemcpyHostToDevice,
                               *pImagePreprocessStream_));  // copy the raw image to the device
    yolov5_classify_preprocess_kernel_img(pu8ImgDevice_, frame.cols, frame.rows, (float*)pfBuffers_[0],
                                          pYoloV5ClassifyModelInfo_->uiModelWidth,
                                          pYoloV5ClassifyModelInfo_->uiModelHeight, *pImagePreprocessStream_);
    cudaStreamSynchronize(*pImagePreprocessStream_);
#else
    cv::Mat pr_img = preprocess_img(frame, pYoloV5ClassifyModelInfo_->uiModelWidth,
                                    pYoloV5ClassifyModelInfo_->uiModelHeight);  // letterbox BGR to RGB
    // Repack HWC uint8 pixels into planar CHW floats: first normalize to [0, 1],
    // then subtract the per-channel mean and divide by the per-channel std
    int n = 0;
    for (int row = 0; row < pYoloV5ClassifyModelInfo_->uiModelHeight; ++row) {
        uchar* uc_pixel = pr_img.data + row * pr_img.step;
        for (int col = 0; col < pYoloV5ClassifyModelInfo_->uiModelWidth; ++col) {
            pfInputData_[n] = ((float)uc_pixel[2] / 255.0 - 0.406) / 0.225;
            pfInputData_[n + pYoloV5ClassifyModelInfo_->uiModelHeight * pYoloV5ClassifyModelInfo_->uiModelWidth] =
                ((float)uc_pixel[1] / 255.0 - 0.456) / 0.224;
            pfInputData_[n + 2 * pYoloV5ClassifyModelInfo_->uiModelHeight * pYoloV5ClassifyModelInfo_->uiModelWidth] =
                ((float)uc_pixel[0] / 255.0 - 0.485) / 0.229;
            uc_pixel += pYoloV5ClassifyModelInfo_->uiChannel;
            ++n;
        }
    }
#endif
    auto preprocess_end = std::chrono::system_clock::now();
    std::cout << "yolov5 classify preprocess time: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(preprocess_end - preprocess_start).count()
              << "ms" << std::endl;

    // 2. Inference
    float fResizeRatio = GetResizeRatio(img_width, img_height, pYoloV5ClassifyModelInfo_->uiModelWidth,
                                        pYoloV5ClassifyModelInfo_->uiModelHeight);
    auto start = std::chrono::system_clock::now();  // start timing
#ifdef ENABLE_CUDA_PREPROCESS
    doInferenceV4(*pContext_, *pInferenceModelStream_, (void**)pfBuffers_, uiOutputIndex_, pfOutputData_,
                  pYoloV5ClassifyModelInfo_->uiOutputSize, pYoloV5ClassifyModelInfo_->uiBatchSize);
#else
    doInferenceV3(*pContext_, *pInferenceModelStream_, (void**)pfBuffers_, uiInputIndex_, pfInputData_,
                  pYoloV5ClassifyModelInfo_->uiInputSize, uiOutputIndex_, pfOutputData_,
                  pYoloV5ClassifyModelInfo_->uiOutputSize, pYoloV5ClassifyModelInfo_->uiBatchSize);
#endif
    auto end = std::chrono::system_clock::now();
    std::cout << "yolov5 classify inference time: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

    // 3. Postprocessing: softmax over the raw logits, then argmax for the label
    std::vector<float> vecInput, vecOutput;
    for (unsigned int i = 0; i < pYoloV5ClassifyModelInfo_->uiOutputSize; i++) {
        vecInput.push_back(pfOutputData_[i]);
    }
    vecOutput = softMax(vecInput);
    float fValue = 0.0;
    for (unsigned int i = 0; i < pYoloV5ClassifyModelInfo_->uiOutputSize; i++) {
        std::cout << vecOutput[i] << " ";
        if (vecOutput[i] > fValue) {
            fValue = vecOutput[i];
            *uiClassLabel = i;
        }
    }
    std::cout << std::endl;
    return 0;
}

// Numerically stable softmax: subtracting the maximum before exponentiating
// leaves the result unchanged but keeps exp() from overflowing
std::vector<float> YoloV5ClassifyInference::softMax(std::vector<float> vecInput)
{
    double total = 0;
    double dmax = vecInput[0];
    for (auto x : vecInput) {
        dmax = std::max((double)x, dmax);
    }
    for (auto x : vecInput) {
        total += exp(x - dmax);
    }
    std::vector<float> vecResult;
    for (auto x : vecInput) {
        vecResult.push_back(exp(x - dmax) / total);
    }
    return vecResult;
}
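#if 0
// Usage sketch: init -> per-frame inference -> release. The ModelInfo contents
// and the engine/image file names are illustrative assumptions; the real values
// come from the surrounding project.
int main()
{
    ModelInfo modelInfo;  // fill in batch size, input/output sizes, blob names, ...
    YoloV5ClassifyInference classifier;
    if (classifier.YoloV5ClassifyInferenceInit(&modelInfo, "", "yolov5_cls.engine") != 0) {
        return -1;
    }
    cv::Mat frame = cv::imread("test.jpg");
    unsigned int uiClassLabel = 0;
    classifier.YoloV5ClassifyInferenceModel(frame, &uiClassLabel);
    std::cout << "predicted class: " << uiClassLabel << std::endl;
    classifier.YoloV5ClassifyInferenceRelease();
    return 0;
}
#endif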