#include "inference.h"

Inference::Inference() {}

Inference::~Inference() {}

// Build a TensorRT engine from an ONNX model via the ONNX parser
ICudaEngine* Inference::build_engine_onnx(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config, std::string& source_onnx)
{
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Create the ONNX parser
    nvonnxparser::IParser* onnxParser = nvonnxparser::createParser(*network, gLogger);

    // Parse the ONNX file (the second argument is the logging verbosity)
    if (!onnxParser->parseFromFile(source_onnx.c_str(), 1)) {
        std::cout << "Failed to parse ONNX file: " << source_onnx << std::endl;
        onnxParser->destroy();
        network->destroy();
        return nullptr;
    }

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(maxWorkSpaceSize);                              // workspace limit, in bytes
    float max_workspace_size = (float)maxWorkSpaceSize / 1024.0f / 1024.0f;     // for logging, in MB

#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif

    std::cout << "Set max batch size = " << maxBatchSize << std::endl;                      // maximum batch size
    std::cout << "Set max workspace size = " << max_workspace_size << " MB" << std::endl;   // maximum workspace size

    int net_num_input = network->getNbInputs();                 // number of network inputs
    printf("Network has %d inputs:\n", net_num_input);
    std::vector<std::string> input_names(net_num_input);
    for (int i = 0; i < net_num_input; ++i) {                   // print each input tensor and its dimensions
        auto tensor = network->getInput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf("      %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());

        input_names[i] = tensor->getName();
    }

    int net_num_output = network->getNbOutputs();               // number of network outputs
    printf("Network has %d outputs:\n", net_num_output);
    for (int i = 0; i < net_num_output; ++i) {                  // print each output tensor and its dimensions
        auto tensor = network->getOutput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf("      %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
    }

    int net_num_layers = network->getNbLayers();                // number of network layers
    printf("Network has %d layers\n", net_num_layers);

    // Configure the optimization profile (dynamic batch dimension: min/opt = 1, max = maxBatchSize)
    auto profile = builder->createOptimizationProfile();
    for (int i = 0; i < net_num_input; ++i) {
        auto input = network->getInput(i);
        auto input_dims = input->getDimensions();
        input_dims.d[0] = 1;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
        input_dims.d[0] = maxBatchSize;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    }
    config->addOptimizationProfile(profile);                    // register the profile with the builder config

    std::cout << "Building engine with onnx parser, please wait for a while..." << std::endl;

    // Time the engine build
    auto time_start = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    auto time_end = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
    std::cout << "Build engine with onnx parser successfully!" << std::endl;
    printf("Build done %lld ms!\n", static_cast<long long>(time_end - time_start));

    // Don't need the parser or the network any more
    onnxParser->destroy();
    network->destroy();

    return engine;
}

// Convert an ONNX model into a serialized engine (model stream)
void Inference::APIToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& onnx_model_name)
{
    IBuilder* builder = createInferBuilder(gLogger);            // create the builder (requires gLogger)
    IBuilderConfig* config = builder->createBuilderConfig();    // create the builder config

    // Parse the model to populate the network, then build the engine
    ICudaEngine* engine = nullptr;

    engine = build_engine_onnx(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, onnx_model_name);
    assert(engine != nullptr);

    // Serialize the engine into a model stream
    (*modelStream) = engine->serialize();

    // Release resources (config before the builder that created it)
    engine->destroy();
    config->destroy();
    builder->destroy();
}
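
// Illustrative usage sketch (not part of the original source): build an engine with
// APIToModel and write the serialized plan to disk so it can be reloaded later without
// rebuilding. The file names "model.onnx"/"model.engine" and the batch/workspace values
// are placeholders, and <cstdio> is assumed to be available via inference.h (printf is
// already used above).
static void example_build_and_save_engine(Logger gLogger)
{
    Inference infer;
    IHostMemory* modelStream = nullptr;
    std::string onnxPath = "model.onnx";                  // placeholder ONNX path

    // 4 = max batch size, 1U << 28 bytes = 256 MB workspace (both arbitrary here)
    infer.APIToModel(gLogger, 4, 1U << 28, &modelStream, onnxPath);

    // Write the serialized engine (plan) to disk
    FILE* plan = fopen("model.engine", "wb");             // placeholder output path
    if (plan) {
        fwrite(modelStream->data(), 1, modelStream->size(), plan);
        fclose(plan);
    }
    modelStream->destroy();
}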

// Run inference: copy input to device, execute, copy output back to host
void Inference::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int inputIndex, float* input, int inputSize,
                            unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    //context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

// Run inference without the host-to-device input copy (input is assumed to already be in the device buffer)
void Inference::doInferenceV2(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueue(batchSize, buffers, stream, nullptr);
    //context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

// Run inference (variant 3): same as doInference, but executes with enqueueV2 (explicit batch)
void Inference::doInferenceV3(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int inputIndex, float* input, int inputSize,
                              unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

// Run inference (variant 4): enqueueV2 without the host-to-device input copy
void Inference::doInferenceV4(IExecutionContext& context, cudaStream_t& stream, void **buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
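
// Illustrative usage sketch (not part of the original source): deserialize a previously
// saved plan, allocate device buffers and run doInferenceV3. The binding names
// "input"/"output" and all sizes are placeholders that depend on the actual network;
// CUDA_CHECK is assumed to come from inference.h, as in the functions above.
static void example_run_inference(Logger gLogger, const void* planData, size_t planSize,
                                  float* hostInput, float* hostOutput,
                                  int inputSize, int outputSize, int batchSize)
{
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(planData, planSize);
    IExecutionContext* context = engine->createExecutionContext();

    // Look up binding slots by tensor name (placeholder names)
    const int inputIndex  = engine->getBindingIndex("input");
    const int outputIndex = engine->getBindingIndex("output");

    // Allocate one device buffer per binding
    void* buffers[2];
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex],  batchSize * inputSize  * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * outputSize * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    Inference infer;
    infer.doInferenceV3(*context, stream, buffers, inputIndex, hostInput, inputSize,
                        outputIndex, hostOutput, outputSize, batchSize);

    // Clean up
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    context->destroy();
    engine->destroy();
    runtime->destroy();
}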