#include "inference.h"

template<typename _T>
static std::string join_dims(const std::vector<_T>& dims)
{
    std::stringstream output;
    char buf[64];
    const char* fmts[] = {"%d", " x %d"};
    for (int i = 0; i < dims.size(); ++i) {
        snprintf(buf, sizeof(buf), fmts[i != 0], dims[i]);
        output << buf;
    }
    return output.str();
}
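
// Usage note (illustrative): join_dims formats a dimension vector for logging,
// e.g. join_dims(std::vector<int>{1, 3, 224, 224}) yields "1 x 3 x 224 x 224".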

Inference::Inference() {}

Inference::~Inference() {}

inline unsigned int Inference::getElementSize(nvinfer1::DataType t)
{
    switch (t)
    {
    case nvinfer1::DataType::kINT32: return 4;
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kINT8: return 1;
    }
    throw std::runtime_error("Invalid DataType.");
    return 0;
}

inline int64_t Inference::volume(const nvinfer1::Dims& d)
{
    // Use an int64_t initial value so the product is accumulated in 64-bit arithmetic
    return std::accumulate(d.d, d.d + d.nbDims, int64_t(1), std::multiplies<int64_t>());
}
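
// Sizing note: volume() and getElementSize() together give the byte size of a binding.
// For example, a float binding with dims (batch, 3, 640, 640) needs
//   volume(dims) * getElementSize(nvinfer1::DataType::kFLOAT)
// bytes of device memory (the dims here are illustrative, not tied to any particular model).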

// Build an engine from an ONNX model using the ONNX parser
ICudaEngine* Inference::build_engine_onnx(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config, std::string& source_onnx)
{
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Create the ONNX parser
    nvonnxparser::IParser* onnxParser = nvonnxparser::createParser(*network, gLogger);
    // Parse the ONNX file; bail out if parsing fails
    if (!onnxParser->parseFromFile(source_onnx.c_str(), 1))
    {
        std::cerr << "Failed to parse ONNX file: " << source_onnx << std::endl;
        onnxParser->destroy();
        network->destroy();
        return nullptr;
    }

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(maxWorkSpaceSize); // workspace size in bytes (caller-provided)
    float max_workspace_size = (float)maxWorkSpaceSize / 1024.0f / 1024.0f;

#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif

    std::cout << "Set max batch size = " << maxBatchSize << std::endl;                     // maximum batch size
    std::cout << "Set max workspace size = " << max_workspace_size << " MB" << std::endl;  // maximum workspace size

    int net_num_input = network->getNbInputs();   // number of network inputs
    printf("Network has %d inputs:\n", net_num_input);
    std::vector<std::string> input_names(net_num_input);
    for (int i = 0; i < net_num_input; ++i) {      // name and dimensions of each input tensor
        auto tensor = network->getInput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());

        input_names[i] = tensor->getName();
    }

    int net_num_output = network->getNbOutputs();  // number of network outputs
    printf("Network has %d outputs:\n", net_num_output);
    for (int i = 0; i < net_num_output; ++i) {     // name and dimensions of each output tensor
        auto tensor = network->getOutput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
    }

    int net_num_layers = network->getNbLayers();   // number of network layers
    printf("Network has %d layers\n", net_num_layers);

    // Configure an optimization profile: batch size ranges from 1 (kMIN/kOPT) to maxBatchSize (kMAX)
    auto profile = builder->createOptimizationProfile();
    for (int i = 0; i < net_num_input; ++i) {
        auto input = network->getInput(i);
        auto input_dims = input->getDimensions();
        input_dims.d[0] = 1;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
        input_dims.d[0] = maxBatchSize;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    }
    config->addOptimizationProfile(profile); // register the optimization profile with the builder config

    std::cout << "Building engine with onnx parser, please wait for a while..." << std::endl;
    // Time the engine build
    auto time_start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    auto time_end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    std::cout << "Build engine with onnx parser successfully!" << std::endl;
    printf("Build done %lld ms !\n", time_end - time_start);

    // Don't need the parser and network any more
    onnxParser->destroy();
    network->destroy();

    return engine;
}

ICudaEngine* Inference::build_engine_caffe(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config,
    const std::string& strCaffeModelFile, const std::string& strCaffeDeployFile, const std::vector<std::string>& vecOutputs)
{
    // Create the network (implicit batch: flags = 0)
    INetworkDefinition* network = builder->createNetworkV2(0);

    // Create the Caffe parser
    ICaffeParser* caffeParser = createCaffeParser();

    const IBlobNameToTensor* blobNameToTensor = caffeParser->parse(strCaffeDeployFile.c_str(),
                                                                   strCaffeModelFile.c_str(),
                                                                   *network,
                                                                   nvinfer1::DataType::kFLOAT);
    // Mark the requested blobs as network outputs
    for (auto& s : vecOutputs) {
        network->markOutput(*blobNameToTensor->find(s.c_str()));
    }

    // Set batch size and workspace size
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(maxWorkSpaceSize);
    config->setFlag(BuilderFlag::kGPU_FALLBACK);
    config->setFlag(BuilderFlag::kSTRICT_TYPES);

    // FP16 precision
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif

    float max_workspace_size = (float)maxWorkSpaceSize / 1024.0f / 1024.0f;
    std::cout << "Set max batch size = " << maxBatchSize << std::endl;                     // maximum batch size
    std::cout << "Set max workspace size = " << max_workspace_size << " MB" << std::endl;  // maximum workspace size

    int net_num_input = network->getNbInputs();   // number of network inputs
    printf("Network has %d inputs:\n", net_num_input);
    std::vector<std::string> input_names(net_num_input);
    for (int i = 0; i < net_num_input; ++i) {      // name and dimensions of each input tensor
        auto tensor = network->getInput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());

        input_names[i] = tensor->getName();
    }

    int net_num_output = network->getNbOutputs();  // number of network outputs
    printf("Network has %d outputs:\n", net_num_output);
    for (int i = 0; i < net_num_output; ++i) {     // name and dimensions of each output tensor
        auto tensor = network->getOutput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
    }

    int net_num_layers = network->getNbLayers();   // number of network layers
    printf("Network has %d layers\n", net_num_layers);

    // Build the engine and time the build
    std::cout << "Building engine with caffe parser, please wait for a while..." << std::endl;
    auto time_start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine);
    auto time_end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    std::cout << "Build engine with caffe parser successfully!" << std::endl;
    printf("Build done %lld ms !\n", time_end - time_start);

    // Release parser and network resources
    caffeParser->destroy();
    network->destroy();

    return engine;
}

// Convert an ONNX model into a serialized TensorRT engine (model stream)
void Inference::ONNXToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& onnx_model_name)
{
    IBuilder* builder = createInferBuilder(gLogger);          // create the builder (needs gLogger)
    IBuilderConfig* config = builder->createBuilderConfig();  // create the builder config

    // Parse the model to populate the network, mark the outputs, and build an engine
    ICudaEngine* engine = nullptr;

    engine = build_engine_onnx(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, onnx_model_name);
    assert(engine != nullptr);

    // Serialize the engine into a host-memory model stream
    (*modelStream) = engine->serialize();

    // Release resources (config before the builder that created it)
    engine->destroy();
    config->destroy();
    builder->destroy();
}
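
// A minimal caller-side sketch (illustrative only, not part of this class): after ONNXToModel
// fills modelStream, the serialized engine can be written to disk with IHostMemory::data()/size().
// The gLogger instance, file paths, and build parameters below are assumptions; needs <fstream>.
/*
    Logger gLogger;
    nvinfer1::IHostMemory* modelStream = nullptr;
    std::string onnx_path = "model.onnx";                    // hypothetical input path

    Inference infer;
    infer.ONNXToModel(gLogger, 8, 1 << 28, &modelStream, onnx_path);

    std::ofstream engine_file("model.engine", std::ios::binary);  // hypothetical output path
    engine_file.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    engine_file.close();
    modelStream->destroy();
*/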

// Convert a Caffe model into a serialized TensorRT engine (model stream)
void Inference::CaffeToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& caffe_model_name, std::string& caffe_deploy_name, std::vector<std::string>& outputs)
{
    IBuilder* builder = createInferBuilder(gLogger);          // create the builder (needs gLogger)
    IBuilderConfig* config = builder->createBuilderConfig();  // create the builder config

    // Parse the model to populate the network, mark the outputs, and build an engine
    ICudaEngine* engine = nullptr;

    engine = build_engine_caffe(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, caffe_model_name, caffe_deploy_name, outputs);
    assert(engine != nullptr);

    // Serialize the engine into a host-memory model stream
    (*modelStream) = engine->serialize();

    // Release resources (config before the builder that created it)
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Run inference (variant 1): copy input host->device, enqueue with implicit batch, copy output device->host
void Inference::doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int inputIndex, float* input, int inputSize,
    unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    // context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

// Run inference (variant 2): input is assumed to already be in the device buffer; only the output is copied back
void Inference::doInferenceV2(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueue(batchSize, buffers, stream, nullptr);
    // context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

// Run inference (variant 3): same as variant 1 but uses enqueueV2 (explicit-batch engines)
void Inference::doInferenceV3(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int inputIndex, float* input, int inputSize,
    unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

// Run inference (variant 4): same as variant 2 but uses enqueueV2 (explicit-batch engines)
void Inference::doInferenceV4(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
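
// A minimal caller-side sketch (illustrative only): deserialize a saved engine and run doInferenceV3.
// Assumptions: engine_data/engine_size hold the serialized engine bytes; input/output are host buffers
// and inputSize/outputSize/batchSize are set by the caller; the binding names "images"/"output" are
// placeholders and must match the actual model.
/*
    Logger gLogger;
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data, engine_size, nullptr);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();

    const int inputIndex  = engine->getBindingIndex("images");   // hypothetical binding name
    const int outputIndex = engine->getBindingIndex("output");   // hypothetical binding name

    void* buffers[2];
    cudaMalloc(&buffers[inputIndex],  batchSize * inputSize  * sizeof(float));
    cudaMalloc(&buffers[outputIndex], batchSize * outputSize * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    Inference infer;
    infer.doInferenceV3(*context, stream, buffers, inputIndex, input, inputSize,
                        outputIndex, output, outputSize, batchSize);

    cudaStreamDestroy(stream);
    cudaFree(buffers[inputIndex]);
    cudaFree(buffers[outputIndex]);
    context->destroy();
    engine->destroy();
    runtime->destroy();
*/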