VTrain/tools/yolov5/src/inference.cu

#include "inference.h"
Inference::Inference() {}
Inference::~Inference() {}
// Build an engine from an ONNX model using the ONNX parser
ICudaEngine* Inference::build_engine_onnx(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config, std::string& source_onnx)
{
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    // Create the ONNX parser
    nvonnxparser::IParser* onnxParser = nvonnxparser::createParser(*network, gLogger);
    // Parse the ONNX file
    onnxParser->parseFromFile(source_onnx.c_str(), 1);
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(maxWorkSpaceSize); // workspace limit in bytes
    float max_workspace_size = (float)maxWorkSpaceSize/1024.0f/1024.0f;
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout<<"Set max batch size = "<<maxBatchSize<<std::endl;                  // maximum batch size
    std::cout<<"Set max workspace size = "<<max_workspace_size<<" MB"<<std::endl; // maximum workspace size
    int net_num_input = network->getNbInputs(); // number of network inputs
    printf("Network has %d inputs:\n", net_num_input);
    std::vector<std::string> input_names(net_num_input);
    for(int i = 0; i < net_num_input; ++i){ // print name and dimensions of each input tensor
        auto tensor = network->getInput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(vector<int>(dims.d, dims.d+dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
        input_names[i] = tensor->getName();
    }
    int net_num_output = network->getNbOutputs(); // number of network outputs
    printf("Network has %d outputs:\n", net_num_output);
    for(int i = 0; i < net_num_output; ++i){ // print name and dimensions of each output tensor
        auto tensor = network->getOutput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(vector<int>(dims.d, dims.d+dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
    }
    int net_num_layers = network->getNbLayers(); // number of network layers
    printf("Network has %d layers\n", net_num_layers);
    // Configure the optimization profile (batch dimension ranges from 1 to maxBatchSize)
    auto profile = builder->createOptimizationProfile();
    for(int i = 0; i < net_num_input; ++i){
        auto input = network->getInput(i);
        auto input_dims = input->getDimensions();
        input_dims.d[0] = 1;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
        input_dims.d[0] = maxBatchSize;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    }
    config->addOptimizationProfile(profile); // register the optimization profile with the builder config
    std::cout << "Building engine with onnx parser, please wait for a while..." << std::endl;
    // Time the engine build
    auto time_start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    auto time_end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    std::cout << "Build engine with onnx parser successfully!" << std::endl;
    printf("Build done %lld ms !\n", time_end - time_start);
    // Don't need the network any more
    network->destroy();
    return engine;
}
// Convert the ONNX model into a serialized TensorRT engine
void Inference::APIToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& onnx_model_name)
{
    IBuilder* builder = createInferBuilder(gLogger);          // create the builder (requires a logger)
    IBuilderConfig* config = builder->createBuilderConfig();  // create the builder config
    // Parse the ONNX model to populate the network, then build the engine
    ICudaEngine *engine = nullptr;
    engine = build_engine_onnx(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, onnx_model_name);
    assert(engine != nullptr);
    // Serialize the engine into a model stream
    (*modelStream) = engine->serialize();
    // Release resources
    engine->destroy();
    builder->destroy();
    config->destroy();
}
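// Usage sketch (illustrative, excluded from the build): write the IHostMemory produced by
// APIToModel to disk, then deserialize it with an IRuntime before inference. The file name
// "yolov5.engine" and the local variable names are assumptions for this example only.
#if 0
#include <fstream>
void buildAndSaveEngineExample(Logger& gLogger, std::string onnx_path)
{
    Inference infer;
    IHostMemory* modelStream = nullptr;
    infer.APIToModel(gLogger, 8 /*maxBatchSize*/, 1 << 24 /*16 MiB workspace*/, &modelStream, onnx_path);

    // Persist the serialized engine
    std::ofstream plan("yolov5.engine", std::ios::binary);
    plan.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    plan.close();

    // Deserialize and create an execution context for inference
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size());
    IExecutionContext* context = engine->createExecutionContext();

    context->destroy();
    engine->destroy();
    runtime->destroy();
    modelStream->destroy();
}
#endif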
// Run inference: copy input host->device, enqueue, copy output device->host.
// Note: enqueue() is the implicit-batch API; for engines built with kEXPLICIT_BATCH
// (as in build_engine_onnx above) use the enqueueV2 variants (doInferenceV3/V4).
void Inference::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int inputIndex, float* input, int inputSize,
                            unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    //context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
// Run inference when the input is already resident in the device buffer (implicit-batch enqueue)
void Inference::doInferenceV2(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueue(batchSize, buffers, stream, nullptr);
    //context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
// Run inference (explicit-batch enqueueV2): copy input host->device, enqueue, copy output device->host
void Inference::doInferenceV3(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int inputIndex, float* input, int inputSize,
                              unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
// Run inference (explicit-batch enqueueV2) when the input is already resident in the device buffer
void Inference::doInferenceV4(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
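// Usage sketch (illustrative, excluded from the build): allocate the device bindings, fix the
// dynamic input shape for the optimization profile, and run doInferenceV3. The binding names
// "images"/"output", the tensor sizes, and kBatchSize are example assumptions, not values
// taken from this repository.
#if 0
void runInferenceExample(ICudaEngine* engine, float* hostInput, float* hostOutput)
{
    const int kBatchSize  = 1;
    const int kInputSize  = 3 * 640 * 640;  // elements per image (example)
    const int kOutputSize = 25200 * 85;     // elements per image (example)

    IExecutionContext* context = engine->createExecutionContext();
    const int inputIndex  = engine->getBindingIndex("images");  // assumed binding names
    const int outputIndex = engine->getBindingIndex("output");

    // Dynamic batch dimension: the input binding shape must be set before enqueueV2
    context->setBindingDimensions(inputIndex, Dims4{kBatchSize, 3, 640, 640});

    void* buffers[2];
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex],  kBatchSize * kInputSize  * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], kBatchSize * kOutputSize * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    Inference infer;
    infer.doInferenceV3(*context, stream, buffers, inputIndex, hostInput, kInputSize,
                        outputIndex, hostOutput, kOutputSize, kBatchSize);

    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    context->destroy();
}
#endif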