Train_Identify_arm/nvidia_ascend_engine/nvidia_engine/ImagePreprocessEngine/ImagePreprocessEngine.cpp

130 lines
6.2 KiB
C++

#include "ImagePreprocessEngine.h"
using namespace std;
using namespace cv;
using namespace ai_matrix;
/// Constructs the engine; no work is done here, resources are acquired later in Init().
ImagePreprocessEngine::ImagePreprocessEngine() = default;
/// Destroys the engine; it releases nothing itself, CUDA resources are released in DeInit().
ImagePreprocessEngine::~ImagePreprocessEngine() = default;
/**
 * @brief Acquires the CUDA resources used by the preprocessing loop.
 *
 * Builds the port name "<engineName_>_<engineId_>_0", records the source image
 * dimensions, selects the GPU, creates the preprocessing stream and allocates
 * one pinned host buffer and one device buffer of MAX_IMAGE_INPUT_SIZE_THRESH * 3
 * bytes each.
 *
 * @return APP_ERR_OK. CUDA failures are handled by CUDA_CHECK.
 */
APP_ERROR ImagePreprocessEngine::Init()
{
    strPort0_ = engineName_ + "_" + std::to_string(engineId_) + "_0";
    width_ = IMAGE_WIDTH;
    height_ = IMAGE_HEIGHT;

    // Select the GPU; the result is now checked instead of being silently dropped.
    CUDA_CHECK(cudaSetDevice(DEVICE));

    // Stream on which the upload and the preprocessing kernel are queued.
    image_preprocess_stream_ = new cudaStream_t;
    CUDA_CHECK(cudaStreamCreate(image_preprocess_stream_));

    // The CUDA allocators write the buffer addresses themselves, so the pointers
    // must not be pre-assigned with `new` (that memory would simply be leaked).
    img_host_ = nullptr;
    img_device_ = nullptr;
    CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&img_host_), MAX_IMAGE_INPUT_SIZE_THRESH * 3)); // pinned host staging buffer
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&img_device_), MAX_IMAGE_INPUT_SIZE_THRESH * 3));   // device staging buffer

    LogInfo << "engineId_:" << engineId_ << " ImagePreprocessEngine Init ok";
    return APP_ERR_OK;
}
/**
 * @brief Releases the CUDA stream and the host/device staging buffers.
 *
 * Every resource is released only if its pointer is non-null and the pointer is
 * cleared afterwards, so calling DeInit() again after a successful call is a
 * harmless no-op instead of a null dereference / double free.
 *
 * @return APP_ERR_OK. CUDA failures are handled by CUDA_CHECK.
 */
APP_ERROR ImagePreprocessEngine::DeInit()
{
    // Destroy the preprocessing stream and free its heap-allocated handle.
    if (image_preprocess_stream_ != nullptr)
    {
        CUDA_CHECK(cudaStreamDestroy(*image_preprocess_stream_));
        delete image_preprocess_stream_;
        image_preprocess_stream_ = nullptr;
    }

    // Free the device-side staging buffer.
    if (img_device_ != nullptr)
    {
        CUDA_CHECK(cudaFree(img_device_));
        img_device_ = nullptr;
    }

    // Free the pinned host-side staging buffer.
    if (img_host_ != nullptr)
    {
        CUDA_CHECK(cudaFreeHost(img_host_));
        img_host_ = nullptr;
    }

    LogInfo << "engineId_:" << engineId_ << " ImagePreprocessEngine DeInit ok";
    return APP_ERR_OK;
}
/**
 * @brief Worker loop: uploads each incoming frame, runs the preprocessing kernel
 *        and forwards the result to the next engine.
 *
 * Until isStop_ becomes true, for every FrameData popped from input port strPort0_:
 *   1. the frame bytes are copied into the pinned host buffer and uploaded to the
 *      device staging buffer on image_preprocess_stream_;
 *   2. a device buffer of BATCH_SIZE * 3 * INPUT_H * INPUT_W floats is allocated and
 *      filled by preprocess_kernel_img();
 *   3. an InferenceData carrying that device buffer (released with cudaFree), a heap
 *      copy of the source bytes and the frame timestamp is pushed to output port
 *      strPort0_.
 *
 * Frames with a null payload or larger than the staging buffers are logged and skipped.
 *
 * @return APP_ERR_OK once the loop has been stopped.
 */
APP_ERROR ImagePreprocessEngine::Process()
{
    // Byte size of the preprocessed tensor produced for every frame.
    const unsigned int input_data_buffers_size = BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float);
    // Bytes uploaded to the device per frame (3 bytes per pixel of the configured source size).
    // NOTE(review): size_image itself is not validated against the staging capacity — confirm
    // IMAGE_WIDTH * IMAGE_HEIGHT never exceeds MAX_IMAGE_INPUT_SIZE_THRESH.
    const size_t size_image = width_ * height_ * 3;
    // Capacity of img_host_ / img_device_ (matches the allocation size used in Init()).
    const size_t staging_capacity = static_cast<size_t>(MAX_IMAGE_INPUT_SIZE_THRESH) * 3;

    while (!isStop_)
    {
        // Fetch the next frame from the previous engine; back off for 1 ms when the queue is empty.
        std::shared_ptr<void> pVoidData0 = nullptr;
        inputQueMap_[strPort0_]->pop(pVoidData0);
        if (nullptr == pVoidData0)
        {
            usleep(1 * 1000);
            continue;
        }

        std::shared_ptr<FrameData> pRGBFrameData = std::static_pointer_cast<FrameData>(pVoidData0);

        // Reject frames that would overflow the fixed-size staging buffers (or carry no data).
        const size_t frame_bytes = static_cast<size_t>(pRGBFrameData->iSize);
        if (pRGBFrameData->pData.get() == nullptr || frame_bytes > staging_capacity)
        {
            LogError << "skip invalid frame, size:" << frame_bytes << " staging capacity:" << staging_capacity;
            continue;
        }

        // Stage the frame in pinned host memory.
        memcpy(img_host_, pRGBFrameData->pData.get(), frame_bytes);

        // Upload the staged frame to the device.
#ifdef CUDA_MEMCPY_TIME_CONSUMING_TEST
        auto cuda_memcpy_start = std::chrono::system_clock::now();
        CUDA_CHECK(cudaMemcpyAsync(img_device_, img_host_, size_image, cudaMemcpyHostToDevice, *image_preprocess_stream_));
        auto cuda_memcpy_end = std::chrono::system_clock::now();
        std::cout<< "ImagePreprocessEngine cuda memcpy data size is: "<<size_image<<std::endl;
        std::cout << "ImagePreprocessEngine cuda memcpy host to device time: " << std::chrono::duration_cast<std::chrono::milliseconds>(cuda_memcpy_end - cuda_memcpy_start).count() << "ms" << std::endl;
#else
        CUDA_CHECK(cudaMemcpyAsync(img_device_, img_host_, size_image, cudaMemcpyHostToDevice, *image_preprocess_stream_));
#endif

        // Device buffer receiving the preprocessed tensor. cudaMalloc writes the address,
        // so the pointer starts as nullptr (pre-assigning `new float` leaked one float per frame).
        float* input_data_buffers = nullptr;
#ifdef CUDA_MALLOC_TIME_CONSUMING_TEST
        auto cuda_malloc_start = std::chrono::system_clock::now();
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&input_data_buffers), input_data_buffers_size));
        auto cuda_malloc_end = std::chrono::system_clock::now();
        std::cout<< "ImagePreprocessEngine cuda device malloc data size is: "<<input_data_buffers_size<<std::endl;
        std::cout << "ImagePreprocessEngine cuda device malloc time: " << std::chrono::duration_cast<std::chrono::milliseconds>(cuda_malloc_end - cuda_malloc_start).count() << "ms" << std::endl;
#else
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&input_data_buffers), input_data_buffers_size));
#endif

        // Launch the preprocessing kernel on the engine's stream.
        std::cout << "src width = " << width_ << " src height= " << height_ << " dest width = " << INPUT_W << " desc height= " << INPUT_H << std::endl;
        preprocess_kernel_img(img_device_, width_, height_, input_data_buffers, INPUT_W, INPUT_H, *image_preprocess_stream_);

        // Keep a host copy of the source frame for the downstream engine; done while the GPU works.
        const unsigned int pSrcRGBBuffer_Size = pRGBFrameData->iSize;
        uint8_t* pSrcRGBBuffer = new uint8_t[pSrcRGBBuffer_Size];
        memcpy(pSrcRGBBuffer, pRGBFrameData->pData.get(), pSrcRGBBuffer_Size);

        // Wait for the upload and the kernel to finish: img_host_/img_device_ are reused by the
        // next iteration, and the output buffer must be complete before another engine reads it.
        CUDA_CHECK(cudaStreamSynchronize(*image_preprocess_stream_));

        // Package the preprocessed tensor together with the source frame.
        std::shared_ptr<InferenceData> pImagePreprocessData = std::make_shared<InferenceData>();
        pImagePreprocessData->iDataSource = engineId_;
        pImagePreprocessData->iSize = input_data_buffers_size;
        pImagePreprocessData->pData.reset(input_data_buffers, cudaFree);
        pImagePreprocessData->iSrcSize = pSrcRGBBuffer_Size;
        // delete[] on a void* is undefined behaviour, so cast back to the allocated element type.
        pImagePreprocessData->pSrcData.reset(pSrcRGBBuffer, [](void* data) { delete[] static_cast<uint8_t*>(data); });
        pImagePreprocessData->i64TimeStamp = pRGBFrameData->i64TimeStamp;

        // Hand the result to the next engine; on failure the shared_ptr deleters release both buffers.
        const int iRet = outputQueMap_[strPort0_]->push(std::static_pointer_cast<void>(pImagePreprocessData));
        if (iRet != APP_ERR_OK)
        {
            LogError << "push the image preprocess data failed...";
        }
    }

    // A non-void function must return a value; the original fell off the end here.
    return APP_ERR_OK;
}