// Train_Identify_arm/nvidia_ascend_engine/nvidia_engine/ImagePreprocessEngine/ImagePreprocessEngine.cpp

#include "ImagePreprocessEngine.h"

using namespace std;
using namespace cv;
using namespace ai_matrix;

// Construction does no work; CUDA resources are acquired later in Init().
ImagePreprocessEngine::ImagePreprocessEngine() = default;

// Destruction does no work; CUDA resources are released in DeInit().
ImagePreprocessEngine::~ImagePreprocessEngine() = default;

/**
 * @brief Prepare the engine: build the port name, record the source image
 *        dimensions and allocate the CUDA stream and staging buffers.
 *
 * Allocates a pinned host buffer and a device buffer, each of
 * MAX_IMAGE_INPUT_SIZE_THRESH * 3 bytes, which Process() reuses for every
 * frame. The matching release happens in DeInit().
 *
 * @return APP_ERR_OK. CUDA failures are handled by the CUDA_CHECK macro.
 */
APP_ERROR ImagePreprocessEngine::Init()
{
    strPort0_ = engineName_ + "_" + std::to_string(engineId_) + "_0";

    width_ = IMAGE_WIDTH;
    height_ = IMAGE_HEIGHT;

    // Select the GPU before creating any resource on it.
    CUDA_CHECK(cudaSetDevice(DEVICE));

    // The stream handle is heap-allocated because the member is a pointer;
    // DeInit() destroys the stream and deletes the handle.
    image_preprocess_stream_ = new cudaStream_t;
    CUDA_CHECK(cudaStreamCreate(image_preprocess_stream_));

    // The CUDA allocators write the buffer addresses into these members, so
    // they must not hold a separately new'ed object (it would be leaked).
    img_host_ = nullptr;
    img_device_ = nullptr;
    CUDA_CHECK(cudaMallocHost((void**)&img_host_, MAX_IMAGE_INPUT_SIZE_THRESH * 3)); // pinned host staging buffer
    CUDA_CHECK(cudaMalloc((void**)&img_device_, MAX_IMAGE_INPUT_SIZE_THRESH * 3));   // device-side source image buffer

    LogInfo << "engineId_:" << engineId_ << " ImagePreprocessEngine Init ok";
    return APP_ERR_OK;
}

/**
 * @brief Release the CUDA stream and the staging buffers acquired in Init().
 *
 * Every resource is released only if its pointer is non-null and the pointer
 * is cleared afterwards, so calling DeInit() a second time does not
 * dereference a null stream handle or free a buffer twice.
 *
 * @return APP_ERR_OK. CUDA failures are handled by the CUDA_CHECK macro.
 */
APP_ERROR ImagePreprocessEngine::DeInit()
{
    // Destroy the preprocessing stream and free its heap-allocated handle.
    if (image_preprocess_stream_ != nullptr)
    {
        CUDA_CHECK(cudaStreamDestroy(*image_preprocess_stream_));
        delete image_preprocess_stream_;
        image_preprocess_stream_ = nullptr;
    }

    // Free the device-side source image buffer.
    if (img_device_ != nullptr)
    {
        CUDA_CHECK(cudaFree(img_device_));
        img_device_ = nullptr;
    }

    // Free the pinned host staging buffer.
    if (img_host_ != nullptr)
    {
        CUDA_CHECK(cudaFreeHost(img_host_));
        img_host_ = nullptr;
    }

    LogInfo << "engineId_:" << engineId_ << " ImagePreprocessEngine DeInit ok";
    return APP_ERR_OK;
}


/**
 * @brief Worker loop: take frames from the input queue, preprocess them on
 *        the GPU and push the result to the output queue.
 *
 * For each frame popped from inputQueMap_[strPort0_]:
 *   1. the payload is staged in the pinned host buffer and copied to the
 *      device on image_preprocess_stream_;
 *   2. a device buffer of BATCH_SIZE * 3 * INPUT_H * INPUT_W floats is
 *      allocated and filled by preprocess_kernel_img();
 *   3. an InferenceData carrying that device buffer (released with cudaFree)
 *      and a host copy of the source frame is pushed to
 *      outputQueMap_[strPort0_].
 *
 * Frames with a null payload, or a payload larger than the staging buffer
 * allocated in Init(), are logged and dropped.
 *
 * @return APP_ERR_OK once isStop_ becomes true.
 */
APP_ERROR ImagePreprocessEngine::Process()
{
    const unsigned int input_data_buffers_size = BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t size_image = width_ * height_ * 3;
    // Capacity of img_host_ / img_device_; same expression as used in Init().
    const size_t host_capacity = static_cast<size_t>(MAX_IMAGE_INPUT_SIZE_THRESH * 3);

    int iRet = APP_ERR_OK;

    while (!isStop_)
    {
        // Fetch the next frame from the upstream engine.
        std::shared_ptr<void> pVoidData0 = nullptr;
        inputQueMap_[strPort0_]->pop(pVoidData0);
        if (nullptr == pVoidData0)
        {
            usleep(1*1000); // 1 ms back-off while the queue is empty
            continue;
        }

        std::shared_ptr<FrameData> pRGBFrameData = std::static_pointer_cast<FrameData>(pVoidData0);

        // Reject frames that cannot be staged safely: the memcpy below would
        // otherwise read a null pointer or write past the pinned buffer.
        if (pRGBFrameData->pData.get() == nullptr ||
            static_cast<size_t>(pRGBFrameData->iSize) > host_capacity)
        {
            LogError << "engineId_:" << engineId_ << " invalid frame (size " << pRGBFrameData->iSize
                     << ", staging capacity " << host_capacity << "), frame dropped";
            continue;
        }

        // Stage the frame in pinned host memory.
        memcpy(img_host_, pRGBFrameData->pData.get(), pRGBFrameData->iSize);

        // Enqueue the host-to-device copy on the preprocessing stream.
        // NOTE(review): size_image (width_ * height_ * 3) bytes are
        // transferred regardless of iSize — confirm frames always have that size.
        #ifdef CUDA_MEMCPY_TIME_CONSUMING_TEST
        auto cuda_memcpy_start = std::chrono::system_clock::now();
        #endif
        CUDA_CHECK(cudaMemcpyAsync(img_device_,img_host_,size_image,cudaMemcpyHostToDevice,*image_preprocess_stream_));
        #ifdef CUDA_MEMCPY_TIME_CONSUMING_TEST
        auto cuda_memcpy_end = std::chrono::system_clock::now();
        std::cout<< "ImagePreprocessEngine cuda memcpy data size is: "<<size_image<<std::endl;
        std::cout << "ImagePreprocessEngine cuda memcpy host to device time: " << std::chrono::duration_cast<std::chrono::milliseconds>(cuda_memcpy_end - cuda_memcpy_start).count() << "ms" << std::endl;
        #endif

        // Allocate the device buffer that receives the preprocessed tensor.
        // cudaMalloc writes the address into the pointer, so it starts null.
        float* input_data_buffers = nullptr;
        #ifdef CUDA_MALLOC_TIME_CONSUMING_TEST
        auto cuda_malloc_start = std::chrono::system_clock::now();
        #endif
        CUDA_CHECK(cudaMalloc((void**)&input_data_buffers, input_data_buffers_size));
        #ifdef CUDA_MALLOC_TIME_CONSUMING_TEST
        auto cuda_malloc_end = std::chrono::system_clock::now();
        std::cout<< "ImagePreprocessEngine cuda device malloc data size is: "<<input_data_buffers_size<<std::endl;
        std::cout << "ImagePreprocessEngine cuda device malloc time: " << std::chrono::duration_cast<std::chrono::milliseconds>(cuda_malloc_end - cuda_malloc_start).count() << "ms" << std::endl;
        #endif

        // Launch the preprocessing kernel on the same stream.
        std::cout << "src width = " << width_ << " src height= " << height_ << " dest width = " << INPUT_W << " desc height= " << INPUT_H << std::endl;
        preprocess_kernel_img(img_device_, width_, height_, input_data_buffers, INPUT_W, INPUT_H, *image_preprocess_stream_);

        // Keep a host copy of the source frame for downstream consumers.
        // This is done before the stream sync so it can overlap with GPU work.
        const unsigned int pSrcRGBBuffer_Size = pRGBFrameData->iSize;
        uint8_t* pSrcRGBBuffer = new uint8_t[pSrcRGBBuffer_Size];
        memcpy(pSrcRGBBuffer, pRGBFrameData->pData.get(), pSrcRGBBuffer_Size);

        // Wait for the copy and kernel to finish: the output buffer is about
        // to be handed to another engine, and img_host_ / img_device_ are
        // overwritten by the next iteration.
        CUDA_CHECK(cudaStreamSynchronize(*image_preprocess_stream_));

        // Assemble the message for the next engine.
        std::shared_ptr<InferenceData> pImagePreprocessData = std::make_shared<InferenceData>();
        pImagePreprocessData->iDataSource = engineId_;
        pImagePreprocessData->iSize = input_data_buffers_size;
        pImagePreprocessData->pData.reset(input_data_buffers, cudaFree);
        pImagePreprocessData->iSrcSize = pSrcRGBBuffer_Size;
        // Delete through the typed pointer; delete[] on a void* is undefined.
        pImagePreprocessData->pSrcData.reset(pSrcRGBBuffer, [](uint8_t* data){ delete[] data; });
        pImagePreprocessData->i64TimeStamp = pRGBFrameData->i64TimeStamp;

        // Hand the preprocessed data to the next engine.
        iRet = outputQueMap_[strPort0_]->push(std::static_pointer_cast<void>(pImagePreprocessData));
        if (iRet != APP_ERR_OK)
        {
            LogError << "push the image preprocess data failed...";
        }
    }

    return APP_ERR_OK;
}