// Train_Identify_arm/nvidia_ascend_engine/nvidia_engine/InferenceModelEngine/InferenceModelEngine.cpp

#include "InferenceModelEngine.h"
using namespace std;
using namespace ai_matrix;
InferenceModelEngine::InferenceModelEngine() {}
InferenceModelEngine::~InferenceModelEngine() {}
APP_ERROR InferenceModelEngine::Init()
{
strPort0_ = engineName_ + "_" + std::to_string(engineId_) + "_0";
// Create the CUDA stream used for model inference
inference_model_stream_ = new cudaStream_t;
CUDA_CHECK(cudaStreamCreate(inference_model_stream_));
gLogger_ = new Logger;
// Allocate the device buffers for the engine bindings
buffers_[0] = nullptr; buffers_[1] = nullptr;
CUDA_CHECK(cudaMalloc((void**)&buffers_[0], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float))); // input binding (index 0)
CUDA_CHECK(cudaMalloc((void**)&buffers_[1], BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); // output binding (index 1)
LogInfo << "engineId_:" << engineId_ << " InferenceModelEngine Init ok";
return APP_ERR_OK;
}
APP_ERROR InferenceModelEngine::DeInit()
{
// Release the model-inference CUDA stream
CUDA_CHECK(cudaStreamDestroy(*inference_model_stream_));
delete inference_model_stream_;
inference_model_stream_ = nullptr;
CUDA_CHECK(cudaFree(buffers_[0])); // free the device memory of the input binding
CUDA_CHECK(cudaFree(buffers_[1])); // free the device memory of the output binding
// Release the TensorRT objects
context_->destroy(); // destroy the execution context
engine_->destroy(); // destroy the TensorRT engine
runtime_->destroy(); // destroy the runtime
delete gLogger_; gLogger_ = nullptr;
LogInfo << "engineId_:" << engineId_ << " InferenceModelEngine DeInit ok";
return APP_ERR_OK;
}
APP_ERROR InferenceModelEngine::Process()
{
cudaSetDevice(DEVICE); // select the GPU used for inference
// Names of the .wts weights file and the serialized engine file (read from the YAML config)
std::string wts_name = MyYaml::GetIns()->GetStringValue("yolov5_wts_name");
std::string engine_name = MyYaml::GetIns()->GetStringValue("yolov5_model_name");
bool is_p6 = false; // not a P6 model by default
/**********************************************************************************
gw (width_multiple): controls the width of the network.
gd (depth_multiple): controls the depth of the network.
N model: gd = 0.33, gw = 0.25
S model: gd = 0.33, gw = 0.50
M model: gd = 0.67, gw = 0.75
L model: gd = 1.00, gw = 1.00
X model: gd = 1.33, gw = 1.25
**********************************************************************************/
float gd = 0.67, gw = 0.75; // M model by default
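#if 0
// A minimal sketch (not part of the original flow) of selecting gd/gw from a model-size
// string read from the same YAML config; the key "yolov5_net_type" is a hypothetical name.
std::string net_type = MyYaml::GetIns()->GetStringValue("yolov5_net_type");
if (net_type == "n") { gd = 0.33f; gw = 0.25f; }
else if (net_type == "s") { gd = 0.33f; gw = 0.50f; }
else if (net_type == "m") { gd = 0.67f; gw = 0.75f; }
else if (net_type == "l") { gd = 1.00f; gw = 1.00f; }
else if (net_type == "x") { gd = 1.33f; gw = 1.25f; }
#endif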
// Serialize the engine:
// build the model directly through the TensorRT API and serialize it to a stream.
// This only needs to run once to produce the engine file; afterwards the block can stay
// disabled and the engine is simply deserialized below.
#if 0
if (!wts_name.empty()) {
IHostMemory* modelStream{ nullptr };
APIToModel(*gLogger_, BATCH_SIZE, &modelStream, is_p6, gd, gw, wts_name);
assert(modelStream != nullptr);
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cerr << "could not open plan output file" << std::endl;
return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
}
#endif
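// To regenerate the engine file, switch the "#if 0" above to "#if 1" and make sure
// "yolov5_wts_name" in the YAML config points at a valid .wts weights file.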
// Deserialize the model and run inference
std::ifstream file(engine_name, std::ios::binary);
if (!file.good()) {
LogInfo << "read " << engine_name << " error!" << std::endl;
exit(0);
}
// trtModelStream is a block of host memory (used much like an ifstream buffer)
// that holds the serialized plan file read from disk.
char *trtModelStream = nullptr;
size_t size = 0;
file.seekg(0, file.end); // seek to the end of the file
size = file.tellg(); // the current position gives the file size
file.seekg(0, file.beg); // seek back to the beginning
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size); // read the serialized engine (data and size) into trtModelStream
file.close();
// 1. Create the runtime environment
//runtime_ = new IRuntime;
runtime_ = createInferRuntime(*gLogger_); // create the IRuntime object; gLogger is passed in for TensorRT log output
assert(runtime_ != nullptr);
// 2. Deserialize the engine
//engine = new ICudaEngine;
engine_ = runtime_->deserializeCudaEngine(trtModelStream, size); // deserialize the engine from trtModelStream
assert(engine_ != nullptr);
// 3. Create the execution context
//context = new IExecutionContext;
context_ = engine_->createExecutionContext(); // create the execution context used to launch the CUDA kernels during inference
assert(context_ != nullptr);
delete[] trtModelStream; // the serialized stream is no longer needed
assert(engine_->getNbBindings() == 2);
// Get the binding indices of the input and output tensors
const int inputIndex = engine_->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine_->getBindingIndex(OUTPUT_BLOB_NAME);
std::cout<<"inputIndex: "<<inputIndex<<"\toutputIndex: "<<outputIndex<<std::endl;
assert(inputIndex == 0);
assert(outputIndex == 1);
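#if 0
// Optional sanity check (a sketch, not in the original flow): confirm that the engine's
// input binding matches the 3 x INPUT_H x INPUT_W buffer allocated in Init(). In
// implicit-batch mode getBindingDimensions() returns the per-sample dimensions.
Dims inputDims = engine_->getBindingDimensions(inputIndex);
assert(inputDims.nbDims == 3 && inputDims.d[0] == 3 && inputDims.d[1] == INPUT_H && inputDims.d[2] == INPUT_W);
#endif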
uint64_t u64count_num = 0;
int iRet = APP_ERR_OK;
while (!isStop_)
{
std::shared_ptr<void> pVoidData0 = nullptr;
inputQueMap_[strPort0_]->pop(pVoidData0);
if (nullptr == pVoidData0)
{
usleep(1 * 1000); // sleep 1 ms before polling the input queue again
continue;
}
// LogInfo << "receive from ImagePreprocessEngine's data success!";
// std::cout<<"receive from ImagePreprocessEngine's data success!"<<std::endl;
// std::cout<<"Enter InferenceModelEngine Thread "<<++u64count_num<<" Times!"<<std::endl;
std::shared_ptr<InferenceData> pImagePreprocessData = std::static_pointer_cast<InferenceData>(pVoidData0);
// Copy the preprocessed image data (already on the device) into the input binding buffers_[0]
#ifdef CUDA_MEMCPY_TIME_CONSUMING_TEST
auto cuda_memcpy_start = std::chrono::system_clock::now(); // start timing
CUDA_CHECK(cudaMemcpyAsync(buffers_[0], static_cast<void *>(pImagePreprocessData->pData.get()), pImagePreprocessData->iSize, cudaMemcpyDeviceToDevice,*inference_model_stream_));
auto cuda_memcpy_end = std::chrono::system_clock::now(); // stop timing
std::cout<< "InferenceModelEngine cuda memcpy data size is: "<<pImagePreprocessData->iSize<<std::endl;
std::cout << "InferenceModelEngine cuda memcpy device to device time: " << std::chrono::duration_cast<std::chrono::milliseconds>(cuda_memcpy_end - cuda_memcpy_start).count() << "ms" << std::endl;
#else
CUDA_CHECK(cudaMemcpyAsync(buffers_[0], static_cast<void *>(pImagePreprocessData->pData.get()), pImagePreprocessData->iSize, cudaMemcpyDeviceToDevice,*inference_model_stream_));
#endif
// Build the inference-result data: a host buffer for the model output and a copy of the source RGB image
void* pInferenceModelBuffer = nullptr;
unsigned int pInferenceModelBuffer_Size = BATCH_SIZE * OUTPUT_SIZE;
pInferenceModelBuffer = new float[pInferenceModelBuffer_Size];
void* pSrcRGBBuffer = nullptr;
unsigned int pSrcRGBBuffer_Size = pImagePreprocessData->iSrcSize;
pSrcRGBBuffer = new uint8_t[pSrcRGBBuffer_Size];
memcpy(pSrcRGBBuffer, pImagePreprocessData->pSrcData.get(), pSrcRGBBuffer_Size);
std::shared_ptr<InferenceData> pInferenceModelData = std::make_shared<InferenceData>();
#ifdef INFERENCE_MODEL_TIME_CONSUMING_TEST
auto start = std::chrono::system_clock::now(); // start timing
doInference(*context_, *inference_model_stream_, (void**)buffers_, (float*)pInferenceModelBuffer, BATCH_SIZE); // context: inference context; stream: CUDA stream used to synchronize the asynchronous inference; buffers_: device bindings holding the input image; pInferenceModelBuffer: host buffer receiving the results
auto end = std::chrono::system_clock::now();
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
#else
doInference(*context_, *inference_model_stream_, (void**)buffers_, (float*)pInferenceModelBuffer, BATCH_SIZE); // context: inference context; stream: CUDA stream used to synchronize the asynchronous inference; buffers_: device bindings holding the input image; pInferenceModelBuffer: host buffer receiving the results
#endif
// Package the result data
pInferenceModelData->iDataSource = engineId_;
pInferenceModelData->iSize = pInferenceModelBuffer_Size;
pInferenceModelData->pData.reset(pInferenceModelBuffer, [](void* data){ delete[] static_cast<float*>(data); }); // owned by the shared_ptr; the cast makes delete[] match the new float[] above
pInferenceModelData->iSrcSize = pSrcRGBBuffer_Size;
pInferenceModelData->pSrcData.reset(pSrcRGBBuffer, [](void* data){ delete[] static_cast<uint8_t*>(data); }); // owned by the shared_ptr; the cast makes delete[] match the new uint8_t[] above
pInferenceModelData->i64TimeStamp = pImagePreprocessData->i64TimeStamp;
#if 1
// Push the inference result to the next engine
iRet = outputQueMap_[strPort0_]->push(std::static_pointer_cast<void>(pInferenceModelData));
if (iRet != APP_ERR_OK){
LogError << "push info error";
// std::cerr<<"push the inference model data failed..."<<std::endl;
}else{
// std::cout<<"push the inference model data success!"<<std::endl;
}
#endif
}
return APP_ERR_OK;
}
// Compute a scaled layer width (channel count): x * gw, rounded up to a multiple of divisor
int InferenceModelEngine::get_width(int x, float gw, int divisor = 8) {
return int(ceil((x * gw) / divisor)) * divisor;
}
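// Example: with gw = 0.75 (M model), get_width(64, 0.75) = ceil(48 / 8) * 8 = 48 and
// get_width(1024, 0.75) = 768; the result is always rounded up to a multiple of divisor.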
// Compute a scaled module depth (number of repeats): round(x * gd), with exact .5 ties rounded to even, never less than 1
int InferenceModelEngine::get_depth(int x, float gd) {
if (x == 1) return 1;
int r = round(x * gd);
if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) {
--r;
}
return std::max<int>(r, 1);
}
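// Example: with gd = 0.67 (M model), get_depth(9, 0.67) = round(6.03) = 6 and
// get_depth(3, 0.67) = round(2.01) = 2; an exact tie such as x * gd = 2.5 yields 2.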
// Build a standard (non-P6) engine (e.g. yolov5s, yolov5m, ...)
ICudaEngine* InferenceModelEngine::build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name) {
INetworkDefinition* network = builder->createNetworkV2(0U);
// Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
assert(data);
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
/* ------ yolov5 backbone------ */
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
assert(conv0);
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");
/* ------ yolov5 head ------ */
auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
auto upsample11 = network->addResize(*conv10->getOutput(0));
assert(upsample11);
upsample11->setResizeMode(ResizeMode::kNEAREST);
upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
auto cat12 = network->addConcatenation(inputTensors12, 2);
auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
auto upsample15 = network->addResize(*conv14->getOutput(0));
assert(upsample15);
upsample15->setResizeMode(ResizeMode::kNEAREST);
upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
auto cat16 = network->addConcatenation(inputTensors16, 2);
auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
/* ------ detect ------ */
IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
auto cat19 = network->addConcatenation(inputTensors19, 2);
auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
auto cat22 = network->addConcatenation(inputTensors22, 2);
auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2});
yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*yolo->getOutput(0));
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
#if defined(USE_FP16)
config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
assert(builder->platformHasFastInt8());
config->setFlag(BuilderFlag::kINT8);
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
config->setInt8Calibrator(calibrator);
#endif
std::cout << "Building engine, please wait for a while..." << std::endl;
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;
// Don't need the network any more
network->destroy();
// Release host memory
for (auto& mem : weightMap)
{
free((void*)(mem.second.values));
}
return engine;
}
// Build a P6 engine (e.g. yolov5s6, yolov5m6, ...)
ICudaEngine* InferenceModelEngine::build_engine_p6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name) {
INetworkDefinition* network = builder->createNetworkV2(0U);
// Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
assert(data);
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
/* ------ yolov5 backbone------ */
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7");
auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9");
auto c3_10 = C3(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.10");
auto sppf11 = SPPF(network, weightMap, *c3_10->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.11");
/* ------ yolov5 head ------ */
auto conv12 = convBlock(network, weightMap, *sppf11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12");
auto upsample13 = network->addResize(*conv12->getOutput(0));
assert(upsample13);
upsample13->setResizeMode(ResizeMode::kNEAREST);
upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions());
ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) };
auto cat14 = network->addConcatenation(inputTensors14, 2);
auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15");
auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16");
auto upsample17 = network->addResize(*conv16->getOutput(0));
assert(upsample17);
upsample17->setResizeMode(ResizeMode::kNEAREST);
upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions());
ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) };
auto cat18 = network->addConcatenation(inputTensors18, 2);
auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19");
auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20");
auto upsample21 = network->addResize(*conv20->getOutput(0));
assert(upsample21);
upsample21->setResizeMode(ResizeMode::kNEAREST);
upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions());
ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) };
auto cat22 = network->addConcatenation(inputTensors21, 2);
auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24");
ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) };
auto cat25 = network->addConcatenation(inputTensors25, 2);
auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26");
auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27");
ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) };
auto cat28 = network->addConcatenation(inputTensors28, 2);
auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29");
auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30");
ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) };
auto cat31 = network->addConcatenation(inputTensors31, 2);
auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32");
/* ------ detect ------ */
IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]);
IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]);
IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]);
IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]);
auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector<IConvolutionLayer*>{det0, det1, det2, det3});
yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*yolo->getOutput(0));
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
#if defined(USE_FP16)
config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
assert(builder->platformHasFastInt8());
config->setFlag(BuilderFlag::kINT8);
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
config->setInt8Calibrator(calibrator);
#endif
std::cout << "Building engine, please wait for a while..." << std::endl;
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;
// Don't need the network any more
network->destroy();
// Release host memory
for (auto& mem : weightMap)
{
free((void*)(mem.second.values));
}
return engine;
}
// Build the network through the TensorRT API and serialize it into a model stream
void InferenceModelEngine::APIToModel(Logger gLogger, unsigned int maxBatchSize, IHostMemory** modelStream, bool& is_p6, float& gd, float& gw, std::string& wts_name) {
// Create builder
IBuilder* builder = createInferBuilder(gLogger); // create the builder (the logger must be passed in)
IBuilderConfig* config = builder->createBuilderConfig(); // create the builder config
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine *engine = nullptr;
if (is_p6) {
engine = build_engine_p6(maxBatchSize, builder, config, nvinfer1::DataType::kFLOAT, gd, gw, wts_name);
} else {
engine = build_engine(maxBatchSize, builder, config, nvinfer1::DataType::kFLOAT, gd, gw, wts_name);
}
assert(engine != nullptr);
// Serialize the engine into a model stream
(*modelStream) = engine->serialize();
// Close everything down and release resources
engine->destroy();
builder->destroy();
config->destroy();
}
// Run inference
void InferenceModelEngine::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output, int batchSize) {
// infer on the batch asynchronously, and DMA output back to host
context.enqueue(batchSize, buffers, stream, nullptr); // launch asynchronous inference on the implicit-batch engine (for synchronous inference context.execute() could be used; enqueueV2/executeV2 are the explicit-batch counterparts)
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); // copy the inference results from the GPU back to the host
cudaStreamSynchronize(stream); // enqueue() and the copy above are asynchronous, so synchronize the stream before the host buffer is used
}
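#if 0
// A minimal synchronous variant kept for reference (a sketch, not part of the pipeline;
// the name doInferenceSync is hypothetical and is not declared in the header).
// execute() blocks until inference finishes, so no stream synchronization is needed.
void InferenceModelEngine::doInferenceSync(IExecutionContext& context, void** buffers, float* output, int batchSize) {
context.execute(batchSize, buffers);
CUDA_CHECK(cudaMemcpy(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
}
#endif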