#include "inference.h" Inference::Inference() {} Inference::~Inference() {} //onnx解析器 ICudaEngine* Inference::build_engine_onnx(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config, std::string& source_onnx) { const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); INetworkDefinition* network = builder->createNetworkV2(explicitBatch); //创建onnx解析器 nvonnxparser::IParser* onnxParser = nvonnxparser::createParser(*network, gLogger); //解析onnx文件 onnxParser->parseFromFile(source_onnx.c_str(), 1); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(maxWorkSpaceSize); // 16MB float max_workspace_size = (float)maxWorkSpaceSize/1024.0f/1024.0f; #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #endif std::cout<<"Set max batch size = "<getNbInputs(); //获取网络输入个数 printf("Network has %d inputs:\n", net_num_input); std::vector input_names(net_num_input); for(int i = 0; i < net_num_input; ++i){ //获取每个输入的张量及张量维度 auto tensor = network->getInput(i); auto dims = tensor->getDimensions(); auto dims_str = join_dims(vector(dims.d, dims.d+dims.nbDims)); printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str()); input_names[i] = tensor->getName(); } int net_num_output = network->getNbOutputs(); //获取网络输出个数 printf("Network has %d outputs:\n", net_num_output); for(int i = 0; i < net_num_output; ++i){ //获取每个输出的张量及张量维度 auto tensor = network->getOutput(i); auto dims = tensor->getDimensions(); auto dims_str = join_dims(vector(dims.d, dims.d+dims.nbDims)); printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str()); } int net_num_layers = network->getNbLayers(); //获取网络层数 printf("Network has %d layers\n", net_num_layers); //配置OptimizationProfile文件(最佳优化) auto profile = builder->createOptimizationProfile(); for(int i = 0; i < net_num_input; ++i){ auto input = network->getInput(i); auto input_dims = input->getDimensions(); input_dims.d[0] = 1; profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims); profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims); input_dims.d[0] = maxBatchSize; profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims); } config->addOptimizationProfile(profile); //builderconfig里面添加OptimizationProfile文件 std::cout << "Building engine with onnx parser, please wait for a while..." << std::endl; //计时 计算编译时间 auto time_start = chrono::duration_cast(chrono::system_clock::now().time_since_epoch()).count(); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); auto time_end = chrono::duration_cast(chrono::system_clock::now().time_since_epoch()).count(); std::cout << "Build engine with onnx parser successfully!" 
<< std::endl; printf("Build done %lld ms !\n", time_end - time_start); // Don't need the network any more network->destroy(); return engine; } //转换模型 void Inference::APIToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& onnx_model_name) { IBuilder* builder = createInferBuilder(gLogger); //创建builder(要传入gLogger) IBuilderConfig* config = builder->createBuilderConfig(); //创建builderconfig // 创建模型来填充网络,然后设置输出并创建一个引擎 ICudaEngine *engine = nullptr; engine = build_engine_onnx(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, onnx_model_name); assert(engine != nullptr); //序列化引擎生成模型流 (*modelStream) = engine->serialize(); //释放相关资源 engine->destroy(); builder->destroy(); config->destroy(); } //执行推理 void Inference::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int inputIndex, float* input, int inputSize, unsigned int ouputIndex, float* output, int outputSize, int batchSize) { CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); //context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[ouputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void Inference::doInferenceV2(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize) { context.enqueue(batchSize, buffers, stream, nullptr); //context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } //执行推理3 void Inference::doInferenceV3(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int inputIndex, float* input, int inputSize, unsigned int ouputIndex, float* output, int outputSize, int batchSize) { CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[ouputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } //执行推理4 void Inference::doInferenceV4(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize) { context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); }