Train_RFID_Linux/code/inference/inference.cu

#include "inference.h"
template<typename _T>
static std::string join_dims(const std::vector<_T>& dims)
{
    std::stringstream output;
    char buf[64];
    const char* fmts[] = {"%d", " x %d"};
    for (size_t i = 0; i < dims.size(); ++i) {
        snprintf(buf, sizeof(buf), fmts[i != 0], dims[i]);
        output << buf;
    }
    return output.str();
}
Inference::Inference() {}
Inference::~Inference() {}
inline unsigned int Inference::getElementSize(nvinfer1::DataType t)
{
    switch (t)
    {
    case nvinfer1::DataType::kINT32: return 4;
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF:  return 2;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kINT8:  return 1;
    }
    throw std::runtime_error("Invalid DataType.");
    return 0;
}
inline int64_t Inference::volume(const nvinfer1::Dims& d)
{
    // Accumulate in int64_t (the literal 1LL) so large volumes do not overflow int
    return std::accumulate(d.d, d.d + d.nbDims, 1LL, std::multiplies<int64_t>());
}
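// The two helpers above are typically combined to size a device buffer for an
// engine binding. A minimal sketch (the binding name "input" and the engine
// handle are assumptions, static shapes are assumed, and the helpers would be
// called from within this class's own methods; not used anywhere in this file):
//
//   int idx = engine->getBindingIndex("input");
//   nvinfer1::Dims dims = engine->getBindingDimensions(idx);
//   size_t bytes = volume(dims) * getElementSize(engine->getBindingDataType(idx));
//   void* deviceBuf = nullptr;
//   cudaMalloc(&deviceBuf, bytes);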
// ONNX parser
ICudaEngine* Inference::build_engine_onnx(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config, std::string& source_onnx)
{
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    // Create the ONNX parser
    nvonnxparser::IParser* onnxParser = nvonnxparser::createParser(*network, gLogger);
    // Parse the ONNX file (verbosity level 1); warn if parsing fails
    if (!onnxParser->parseFromFile(source_onnx.c_str(), 1)) {
        std::cout << "Failed to parse ONNX file: " << source_onnx << std::endl;
    }
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(maxWorkSpaceSize);
    float max_workspace_size = (float)maxWorkSpaceSize / 1024.0f / 1024.0f;
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Set max batch size = " << maxBatchSize << std::endl;                    // maximum batch size
    std::cout << "Set max workspace size = " << max_workspace_size << " MB" << std::endl; // maximum workspace size
    int net_num_input = network->getNbInputs();  // number of network inputs
    printf("Network has %d inputs:\n", net_num_input);
    std::vector<std::string> input_names(net_num_input);
    for (int i = 0; i < net_num_input; ++i) {    // print each input tensor name and shape
        auto tensor = network->getInput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
        input_names[i] = tensor->getName();
    }
    int net_num_output = network->getNbOutputs();  // number of network outputs
    printf("Network has %d outputs:\n", net_num_output);
    for (int i = 0; i < net_num_output; ++i) {     // print each output tensor name and shape
        auto tensor = network->getOutput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
    }
    int net_num_layers = network->getNbLayers();   // number of network layers
    printf("Network has %d layers\n", net_num_layers);
    // Configure the optimization profile (batch dimension ranges from 1 to maxBatchSize)
    auto profile = builder->createOptimizationProfile();
    for (int i = 0; i < net_num_input; ++i) {
        auto input = network->getInput(i);
        auto input_dims = input->getDimensions();
        input_dims.d[0] = 1;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
        input_dims.d[0] = maxBatchSize;
        profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    }
    config->addOptimizationProfile(profile);  // register the optimization profile with the builder config
    std::cout << "Building engine with onnx parser, please wait for a while..." << std::endl;
    // Time the engine build
    auto time_start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    auto time_end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    std::cout << "Build engine with onnx parser successfully!" << std::endl;
    printf("Build done %lld ms !\n", time_end - time_start);
    // The parser and network are no longer needed
    onnxParser->destroy();
    network->destroy();
    return engine;
}
ICudaEngine* Inference::build_engine_caffe(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IBuilder* builder, IBuilderConfig* config,
                                           const std::string& strCaffeModelFile, const std::string& strCaffeDeployFile, const std::vector<std::string>& vecOutputs)
{
    // Create the network (implicit batch dimension)
    INetworkDefinition* network = builder->createNetworkV2(0);
    // Create the Caffe parser
    ICaffeParser* caffeParser = createCaffeParser();
    const IBlobNameToTensor* blobNameToTensor = caffeParser->parse(strCaffeDeployFile.c_str(),
                                                                   strCaffeModelFile.c_str(),
                                                                   *network,
                                                                   nvinfer1::DataType::kFLOAT);
    // Mark the requested output tensors
    for (auto& s : vecOutputs) {
        network->markOutput(*blobNameToTensor->find(s.c_str()));
    }
    // Set max batch size and workspace size
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(maxWorkSpaceSize);
    config->setFlag(BuilderFlag::kGPU_FALLBACK);
    config->setFlag(BuilderFlag::kSTRICT_TYPES);
    // FP16 precision
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#endif
    float max_workspace_size = (float)maxWorkSpaceSize / 1024.0f / 1024.0f;
    std::cout << "Set max batch size = " << maxBatchSize << std::endl;                    // maximum batch size
    std::cout << "Set max workspace size = " << max_workspace_size << " MB" << std::endl; // maximum workspace size
    int net_num_input = network->getNbInputs();  // number of network inputs
    printf("Network has %d inputs:\n", net_num_input);
    std::vector<std::string> input_names(net_num_input);
    for (int i = 0; i < net_num_input; ++i) {    // print each input tensor name and shape
        auto tensor = network->getInput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
        input_names[i] = tensor->getName();
    }
    int net_num_output = network->getNbOutputs();  // number of network outputs
    printf("Network has %d outputs:\n", net_num_output);
    for (int i = 0; i < net_num_output; ++i) {     // print each output tensor name and shape
        auto tensor = network->getOutput(i);
        auto dims = tensor->getDimensions();
        auto dims_str = join_dims(std::vector<int>(dims.d, dims.d + dims.nbDims));
        printf(" %d.[%s] shape is %s\n", i, tensor->getName(), dims_str.c_str());
    }
    int net_num_layers = network->getNbLayers();   // number of network layers
    printf("Network has %d layers\n", net_num_layers);
    // Build the engine and time the build
    std::cout << "Building engine with caffe parser, please wait for a while..." << std::endl;
    auto time_start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine);
    auto time_end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    std::cout << "Build engine with caffe parser successfully!" << std::endl;
    printf("Build done %lld ms !\n", time_end - time_start);
    // Release the parser and network
    caffeParser->destroy();
    network->destroy();
    return engine;
}
// Model conversion: build an engine and serialize it into a model stream
void Inference::ONNXToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& onnx_model_name)
{
    IBuilder* builder = createInferBuilder(gLogger);          // create the builder (requires gLogger)
    IBuilderConfig* config = builder->createBuilderConfig();  // create the builder config
    // Parse the model to populate the network, then build the engine
    ICudaEngine* engine = build_engine_onnx(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, onnx_model_name);
    assert(engine != nullptr);
    // Serialize the engine into a model stream
    (*modelStream) = engine->serialize();
    // Release resources
    engine->destroy();
    config->destroy();
    builder->destroy();
}
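// A hedged usage sketch for ONNXToModel: build from an ONNX file and write the
// serialized plan to disk. The path, batch size, and workspace size below are
// illustrative assumptions, not part of this project:
//
//   Logger gLogger;
//   nvinfer1::IHostMemory* modelStream = nullptr;
//   Inference infer;
//   std::string onnxPath = "model.onnx";
//   infer.ONNXToModel(gLogger, 8, 1 << 24, &modelStream, onnxPath);  // max batch 8, 16 MB workspace
//   std::ofstream plan("model.engine", std::ios::binary);
//   plan.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
//   modelStream->destroy();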
void Inference::CaffeToModel(Logger gLogger, unsigned int maxBatchSize, unsigned int maxWorkSpaceSize, IHostMemory** modelStream, std::string& caffe_model_name, std::string& caffe_deploy_name, std::vector<std::string>& outputs)
{
    IBuilder* builder = createInferBuilder(gLogger);          // create the builder (requires gLogger)
    IBuilderConfig* config = builder->createBuilderConfig();  // create the builder config
    // Parse the model to populate the network, then build the engine
    ICudaEngine* engine = build_engine_caffe(gLogger, maxBatchSize, maxWorkSpaceSize, builder, config, caffe_model_name, caffe_deploy_name, outputs);
    assert(engine != nullptr);
    // Serialize the engine into a model stream
    (*modelStream) = engine->serialize();
    // Release resources
    engine->destroy();
    config->destroy();
    builder->destroy();
}
// Inference variant 1: implicit-batch enqueue, copy input to device and output back to host
void Inference::doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int inputIndex, float* input, int inputSize,
                            unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    // context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
// Inference variant 2: implicit-batch enqueue, input already on device; copy only the output back
void Inference::doInferenceV2(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueue(batchSize, buffers, stream, nullptr);
    // context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
// Inference variant 3: explicit-batch enqueueV2, copy input to device and output back to host
void Inference::doInferenceV3(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int inputIndex, float* input, int inputSize,
                              unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
// Inference variant 4: explicit-batch enqueueV2, input already on device; copy only the output back
void Inference::doInferenceV4(IExecutionContext& context, cudaStream_t& stream, void** buffers, unsigned int outputIndex, float* output, int outputSize, int batchSize)
{
    context.enqueueV2(buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}
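// A rough end-to-end runtime sketch for the doInference* helpers above. The
// binding indices (0 for input, 1 for output), buffer sizes, and host pointers
// are illustrative assumptions; in real code they should be queried from the
// engine via getBindingIndex / getBindingDimensions:
//
//   nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
//   nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(planData, planSize);
//   nvinfer1::IExecutionContext* context = engine->createExecutionContext();
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   void* buffers[2];
//   cudaMalloc(&buffers[0], batchSize * inputSize * sizeof(float));   // input  (binding 0)
//   cudaMalloc(&buffers[1], batchSize * outputSize * sizeof(float));  // output (binding 1)
//   Inference infer;
//   infer.doInference(*context, stream, buffers, 0, hostInput, inputSize, 1, hostOutput, outputSize, batchSize);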